1 /* 2 * kernel/sched/core.c 3 * 4 * Core kernel scheduler code and related syscalls 5 * 6 * Copyright (C) 1991-2002 Linus Torvalds 7 */ 8 #include "sched.h" 9 10 #include <linux/nospec.h> 11 12 #include <linux/kcov.h> 13 14 #include <asm/switch_to.h> 15 #include <asm/tlb.h> 16 17 #include "../workqueue_internal.h" 18 #include "../smpboot.h" 19 20 #include "pelt.h" 21 22 #define CREATE_TRACE_POINTS 23 #include <trace/events/sched.h> 24 25 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 26 27 #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) 28 /* 29 * Debugging: various feature bits 30 * 31 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of 32 * sysctl_sched_features, defined in sched.h, to allow constants propagation 33 * at compile time and compiler optimization based on features default. 34 */ 35 #define SCHED_FEAT(name, enabled) \ 36 (1UL << __SCHED_FEAT_##name) * enabled | 37 const_debug unsigned int sysctl_sched_features = 38 #include "features.h" 39 0; 40 #undef SCHED_FEAT 41 #endif 42 43 /* 44 * Number of tasks to iterate in a single balance run. 45 * Limited because this is done with IRQs disabled. 46 */ 47 const_debug unsigned int sysctl_sched_nr_migrate = 32; 48 49 /* 50 * period over which we measure -rt task CPU usage in us. 51 * default: 1s 52 */ 53 unsigned int sysctl_sched_rt_period = 1000000; 54 55 __read_mostly int scheduler_running; 56 57 /* 58 * part of the period that we allow rt tasks to run in us. 59 * default: 0.95s 60 */ 61 int sysctl_sched_rt_runtime = 950000; 62 63 /* 64 * __task_rq_lock - lock the rq @p resides on. 65 */ 66 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) 67 __acquires(rq->lock) 68 { 69 struct rq *rq; 70 71 lockdep_assert_held(&p->pi_lock); 72 73 for (;;) { 74 rq = task_rq(p); 75 raw_spin_lock(&rq->lock); 76 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { 77 rq_pin_lock(rq, rf); 78 return rq; 79 } 80 raw_spin_unlock(&rq->lock); 81 82 while (unlikely(task_on_rq_migrating(p))) 83 cpu_relax(); 84 } 85 } 86 87 /* 88 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 89 */ 90 struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) 91 __acquires(p->pi_lock) 92 __acquires(rq->lock) 93 { 94 struct rq *rq; 95 96 for (;;) { 97 raw_spin_lock_irqsave(&p->pi_lock, rf->flags); 98 rq = task_rq(p); 99 raw_spin_lock(&rq->lock); 100 /* 101 * move_queued_task() task_rq_lock() 102 * 103 * ACQUIRE (rq->lock) 104 * [S] ->on_rq = MIGRATING [L] rq = task_rq() 105 * WMB (__set_task_cpu()) ACQUIRE (rq->lock); 106 * [S] ->cpu = new_cpu [L] task_rq() 107 * [L] ->on_rq 108 * RELEASE (rq->lock) 109 * 110 * If we observe the old CPU in task_rq_lock, the acquire of 111 * the old rq->lock will fully serialize against the stores. 112 * 113 * If we observe the new CPU in task_rq_lock, the acquire will 114 * pair with the WMB to ensure we must then also see migrating. 115 */ 116 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { 117 rq_pin_lock(rq, rf); 118 return rq; 119 } 120 raw_spin_unlock(&rq->lock); 121 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); 122 123 while (unlikely(task_on_rq_migrating(p))) 124 cpu_relax(); 125 } 126 } 127 128 /* 129 * RQ-clock updating methods: 130 */ 131 132 static void update_rq_clock_task(struct rq *rq, s64 delta) 133 { 134 /* 135 * In theory, the compile should just see 0 here, and optimize out the call 136 * to sched_rt_avg_update. But I don't trust it... 
137 */ 138 s64 __maybe_unused steal = 0, irq_delta = 0; 139 140 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 141 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 142 143 /* 144 * Since irq_time is only updated on {soft,}irq_exit, we might run into 145 * this case when a previous update_rq_clock() happened inside a 146 * {soft,}irq region. 147 * 148 * When this happens, we stop ->clock_task and only update the 149 * prev_irq_time stamp to account for the part that fit, so that a next 150 * update will consume the rest. This ensures ->clock_task is 151 * monotonic. 152 * 153 * It does however cause some slight misattribution of {soft,}irq 154 * time; a more accurate solution would be to update the irq_time using 155 * the current rq->clock timestamp, except that would require using 156 * atomic ops. 157 */ 158 if (irq_delta > delta) 159 irq_delta = delta; 160 161 rq->prev_irq_time += irq_delta; 162 delta -= irq_delta; 163 #endif 164 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 165 if (static_key_false((&paravirt_steal_rq_enabled))) { 166 steal = paravirt_steal_clock(cpu_of(rq)); 167 steal -= rq->prev_steal_time_rq; 168 169 if (unlikely(steal > delta)) 170 steal = delta; 171 172 rq->prev_steal_time_rq += steal; 173 delta -= steal; 174 } 175 #endif 176 177 rq->clock_task += delta; 178 179 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ 180 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) 181 update_irq_load_avg(rq, irq_delta + steal); 182 #endif 183 } 184 185 void update_rq_clock(struct rq *rq) 186 { 187 s64 delta; 188 189 lockdep_assert_held(&rq->lock); 190 191 if (rq->clock_update_flags & RQCF_ACT_SKIP) 192 return; 193 194 #ifdef CONFIG_SCHED_DEBUG 195 if (sched_feat(WARN_DOUBLE_CLOCK)) 196 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); 197 rq->clock_update_flags |= RQCF_UPDATED; 198 #endif 199 200 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 201 if (delta < 0) 202 return; 203 rq->clock += delta; 204 update_rq_clock_task(rq, delta); 205 } 206 207 208 #ifdef CONFIG_SCHED_HRTICK 209 /* 210 * Use HR-timers to deliver accurate preemption points. 211 */ 212 213 static void hrtick_clear(struct rq *rq) 214 { 215 if (hrtimer_active(&rq->hrtick_timer)) 216 hrtimer_cancel(&rq->hrtick_timer); 217 } 218 219 /* 220 * High-resolution timer tick. 221 * Runs from hardirq context with interrupts disabled. 222 */ 223 static enum hrtimer_restart hrtick(struct hrtimer *timer) 224 { 225 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 226 struct rq_flags rf; 227 228 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 229 230 rq_lock(rq, &rf); 231 update_rq_clock(rq); 232 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 233 rq_unlock(rq, &rf); 234 235 return HRTIMER_NORESTART; 236 } 237 238 #ifdef CONFIG_SMP 239 240 static void __hrtick_restart(struct rq *rq) 241 { 242 struct hrtimer *timer = &rq->hrtick_timer; 243 244 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); 245 } 246 247 /* 248 * called from hardirq (IPI) context 249 */ 250 static void __hrtick_start(void *arg) 251 { 252 struct rq *rq = arg; 253 struct rq_flags rf; 254 255 rq_lock(rq, &rf); 256 __hrtick_restart(rq); 257 rq->hrtick_csd_pending = 0; 258 rq_unlock(rq, &rf); 259 } 260 261 /* 262 * Called to set the hrtick timer state.
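 *
 * (A remote rq is not armed directly: the request is handed off through
 *  rq->hrtick_csd to an IPI, see __hrtick_start() above, so the pinned
 *  hrtimer is always started on the CPU that owns it.)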
263 * 264 * called with rq->lock held and irqs disabled 265 */ 266 void hrtick_start(struct rq *rq, u64 delay) 267 { 268 struct hrtimer *timer = &rq->hrtick_timer; 269 ktime_t time; 270 s64 delta; 271 272 /* 273 * Don't schedule slices shorter than 10000ns, that just 274 * doesn't make sense and can cause timer DoS. 275 */ 276 delta = max_t(s64, delay, 10000LL); 277 time = ktime_add_ns(timer->base->get_time(), delta); 278 279 hrtimer_set_expires(timer, time); 280 281 if (rq == this_rq()) { 282 __hrtick_restart(rq); 283 } else if (!rq->hrtick_csd_pending) { 284 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); 285 rq->hrtick_csd_pending = 1; 286 } 287 } 288 289 #else 290 /* 291 * Called to set the hrtick timer state. 292 * 293 * called with rq->lock held and irqs disabled 294 */ 295 void hrtick_start(struct rq *rq, u64 delay) 296 { 297 /* 298 * Don't schedule slices shorter than 10000ns, that just 299 * doesn't make sense. Rely on vruntime for fairness. 300 */ 301 delay = max_t(u64, delay, 10000LL); 302 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), 303 HRTIMER_MODE_REL_PINNED); 304 } 305 #endif /* CONFIG_SMP */ 306 307 static void hrtick_rq_init(struct rq *rq) 308 { 309 #ifdef CONFIG_SMP 310 rq->hrtick_csd_pending = 0; 311 312 rq->hrtick_csd.flags = 0; 313 rq->hrtick_csd.func = __hrtick_start; 314 rq->hrtick_csd.info = rq; 315 #endif 316 317 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 318 rq->hrtick_timer.function = hrtick; 319 } 320 #else /* CONFIG_SCHED_HRTICK */ 321 static inline void hrtick_clear(struct rq *rq) 322 { 323 } 324 325 static inline void hrtick_rq_init(struct rq *rq) 326 { 327 } 328 #endif /* CONFIG_SCHED_HRTICK */ 329 330 /* 331 * cmpxchg based fetch_or, macro so it works for different integer types 332 */ 333 #define fetch_or(ptr, mask) \ 334 ({ \ 335 typeof(ptr) _ptr = (ptr); \ 336 typeof(mask) _mask = (mask); \ 337 typeof(*_ptr) _old, _val = *_ptr; \ 338 \ 339 for (;;) { \ 340 _old = cmpxchg(_ptr, _val, _val | _mask); \ 341 if (_old == _val) \ 342 break; \ 343 _val = _old; \ 344 } \ 345 _old; \ 346 }) 347 348 #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) 349 /* 350 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, 351 * this avoids any races wrt polling state changes and thereby avoids 352 * spurious IPIs. 353 */ 354 static bool set_nr_and_not_polling(struct task_struct *p) 355 { 356 struct thread_info *ti = task_thread_info(p); 357 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); 358 } 359 360 /* 361 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. 362 * 363 * If this returns true, then the idle task promises to call 364 * sched_ttwu_pending() and reschedule soon. 
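 *
 * A typical caller (a sketch mirroring ttwu_queue_remote() below) only
 * sends the reschedule IPI when this polling fast path is unavailable:
 *
 *	if (!set_nr_if_polling(rq->idle))
 *		smp_send_reschedule(cpu);
 *	else
 *		trace_sched_wake_idle_without_ipi(cpu);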
365 */ 366 static bool set_nr_if_polling(struct task_struct *p) 367 { 368 struct thread_info *ti = task_thread_info(p); 369 typeof(ti->flags) old, val = READ_ONCE(ti->flags); 370 371 for (;;) { 372 if (!(val & _TIF_POLLING_NRFLAG)) 373 return false; 374 if (val & _TIF_NEED_RESCHED) 375 return true; 376 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); 377 if (old == val) 378 break; 379 val = old; 380 } 381 return true; 382 } 383 384 #else 385 static bool set_nr_and_not_polling(struct task_struct *p) 386 { 387 set_tsk_need_resched(p); 388 return true; 389 } 390 391 #ifdef CONFIG_SMP 392 static bool set_nr_if_polling(struct task_struct *p) 393 { 394 return false; 395 } 396 #endif 397 #endif 398 399 void wake_q_add(struct wake_q_head *head, struct task_struct *task) 400 { 401 struct wake_q_node *node = &task->wake_q; 402 403 /* 404 * Atomically grab the task, if ->wake_q is !nil already it means 405 * its already queued (either by us or someone else) and will get the 406 * wakeup due to that. 407 * 408 * This cmpxchg() executes a full barrier, which pairs with the full 409 * barrier executed by the wakeup in wake_up_q(). 410 */ 411 if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) 412 return; 413 414 get_task_struct(task); 415 416 /* 417 * The head is context local, there can be no concurrency. 418 */ 419 *head->lastp = node; 420 head->lastp = &node->next; 421 } 422 423 void wake_up_q(struct wake_q_head *head) 424 { 425 struct wake_q_node *node = head->first; 426 427 while (node != WAKE_Q_TAIL) { 428 struct task_struct *task; 429 430 task = container_of(node, struct task_struct, wake_q); 431 BUG_ON(!task); 432 /* Task can safely be re-inserted now: */ 433 node = node->next; 434 task->wake_q.next = NULL; 435 436 /* 437 * wake_up_process() executes a full barrier, which pairs with 438 * the queueing in wake_q_add() so as not to miss wakeups. 439 */ 440 wake_up_process(task); 441 put_task_struct(task); 442 } 443 } 444 445 /* 446 * resched_curr - mark rq's current task 'to be rescheduled now'. 447 * 448 * On UP this means the setting of the need_resched flag, on SMP it 449 * might also involve a cross-CPU call to trigger the scheduler on 450 * the target CPU. 451 */ 452 void resched_curr(struct rq *rq) 453 { 454 struct task_struct *curr = rq->curr; 455 int cpu; 456 457 lockdep_assert_held(&rq->lock); 458 459 if (test_tsk_need_resched(curr)) 460 return; 461 462 cpu = cpu_of(rq); 463 464 if (cpu == smp_processor_id()) { 465 set_tsk_need_resched(curr); 466 set_preempt_need_resched(); 467 return; 468 } 469 470 if (set_nr_and_not_polling(curr)) 471 smp_send_reschedule(cpu); 472 else 473 trace_sched_wake_idle_without_ipi(cpu); 474 } 475 476 void resched_cpu(int cpu) 477 { 478 struct rq *rq = cpu_rq(cpu); 479 unsigned long flags; 480 481 raw_spin_lock_irqsave(&rq->lock, flags); 482 if (cpu_online(cpu) || cpu == smp_processor_id()) 483 resched_curr(rq); 484 raw_spin_unlock_irqrestore(&rq->lock, flags); 485 } 486 487 #ifdef CONFIG_SMP 488 #ifdef CONFIG_NO_HZ_COMMON 489 /* 490 * In the semi idle case, use the nearest busy CPU for migrating timers 491 * from an idle CPU. This is good for power-savings. 492 * 493 * We don't do similar optimization for completely idle system, as 494 * selecting an idle CPU will add more delays to the timers than intended 495 * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
496 */ 497 int get_nohz_timer_target(void) 498 { 499 int i, cpu = smp_processor_id(); 500 struct sched_domain *sd; 501 502 if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) 503 return cpu; 504 505 rcu_read_lock(); 506 for_each_domain(cpu, sd) { 507 for_each_cpu(i, sched_domain_span(sd)) { 508 if (cpu == i) 509 continue; 510 511 if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { 512 cpu = i; 513 goto unlock; 514 } 515 } 516 } 517 518 if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) 519 cpu = housekeeping_any_cpu(HK_FLAG_TIMER); 520 unlock: 521 rcu_read_unlock(); 522 return cpu; 523 } 524 525 /* 526 * When add_timer_on() enqueues a timer into the timer wheel of an 527 * idle CPU then this timer might expire before the next timer event 528 * which is scheduled to wake up that CPU. In case of a completely 529 * idle system the next event might even be infinite time into the 530 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 531 * leaves the inner idle loop so the newly added timer is taken into 532 * account when the CPU goes back to idle and evaluates the timer 533 * wheel for the next timer event. 534 */ 535 static void wake_up_idle_cpu(int cpu) 536 { 537 struct rq *rq = cpu_rq(cpu); 538 539 if (cpu == smp_processor_id()) 540 return; 541 542 if (set_nr_and_not_polling(rq->idle)) 543 smp_send_reschedule(cpu); 544 else 545 trace_sched_wake_idle_without_ipi(cpu); 546 } 547 548 static bool wake_up_full_nohz_cpu(int cpu) 549 { 550 /* 551 * We just need the target to call irq_exit() and re-evaluate 552 * the next tick. The nohz full kick at least implies that. 553 * If needed we can still optimize that later with an 554 * empty IRQ. 555 */ 556 if (cpu_is_offline(cpu)) 557 return true; /* Don't try to wake offline CPUs. */ 558 if (tick_nohz_full_cpu(cpu)) { 559 if (cpu != smp_processor_id() || 560 tick_nohz_tick_stopped()) 561 tick_nohz_full_kick_cpu(cpu); 562 return true; 563 } 564 565 return false; 566 } 567 568 /* 569 * Wake up the specified CPU. If the CPU is going offline, it is the 570 * caller's responsibility to deal with the lost wakeup, for example, 571 * by hooking into the CPU_DEAD notifier like timers and hrtimers do. 572 */ 573 void wake_up_nohz_cpu(int cpu) 574 { 575 if (!wake_up_full_nohz_cpu(cpu)) 576 wake_up_idle_cpu(cpu); 577 } 578 579 static inline bool got_nohz_idle_kick(void) 580 { 581 int cpu = smp_processor_id(); 582 583 if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) 584 return false; 585 586 if (idle_cpu(cpu) && !need_resched()) 587 return true; 588 589 /* 590 * We can't run Idle Load Balance on this CPU for this time so we 591 * cancel it and clear NOHZ_BALANCE_KICK 592 */ 593 atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); 594 return false; 595 } 596 597 #else /* CONFIG_NO_HZ_COMMON */ 598 599 static inline bool got_nohz_idle_kick(void) 600 { 601 return false; 602 } 603 604 #endif /* CONFIG_NO_HZ_COMMON */ 605 606 #ifdef CONFIG_NO_HZ_FULL 607 bool sched_can_stop_tick(struct rq *rq) 608 { 609 int fifo_nr_running; 610 611 /* Deadline tasks, even if single, need the tick */ 612 if (rq->dl.dl_nr_running) 613 return false; 614 615 /* 616 * If there are more than one RR tasks, we need the tick to effect the 617 * actual RR behaviour. 618 */ 619 if (rq->rt.rr_nr_running) { 620 if (rq->rt.rr_nr_running == 1) 621 return true; 622 else 623 return false; 624 } 625 626 /* 627 * If there's no RR tasks, but FIFO tasks, we can skip the tick, no 628 * forced preemption between FIFO tasks. 
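 *
 * (Example: with rq->rt.rt_nr_running == 3 and rq->rt.rr_nr_running == 0,
 *  all three RT tasks are FIFO and the tick can be stopped.)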
629 */ 630 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running; 631 if (fifo_nr_running) 632 return true; 633 634 /* 635 * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; 636 * if there's more than one we need the tick for involuntary 637 * preemption. 638 */ 639 if (rq->nr_running > 1) 640 return false; 641 642 return true; 643 } 644 #endif /* CONFIG_NO_HZ_FULL */ 645 #endif /* CONFIG_SMP */ 646 647 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 648 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 649 /* 650 * Iterate task_group tree rooted at *from, calling @down when first entering a 651 * node and @up when leaving it for the final time. 652 * 653 * Caller must hold rcu_lock or sufficient equivalent. 654 */ 655 int walk_tg_tree_from(struct task_group *from, 656 tg_visitor down, tg_visitor up, void *data) 657 { 658 struct task_group *parent, *child; 659 int ret; 660 661 parent = from; 662 663 down: 664 ret = (*down)(parent, data); 665 if (ret) 666 goto out; 667 list_for_each_entry_rcu(child, &parent->children, siblings) { 668 parent = child; 669 goto down; 670 671 up: 672 continue; 673 } 674 ret = (*up)(parent, data); 675 if (ret || parent == from) 676 goto out; 677 678 child = parent; 679 parent = parent->parent; 680 if (parent) 681 goto up; 682 out: 683 return ret; 684 } 685 686 int tg_nop(struct task_group *tg, void *data) 687 { 688 return 0; 689 } 690 #endif 691 692 static void set_load_weight(struct task_struct *p, bool update_load) 693 { 694 int prio = p->static_prio - MAX_RT_PRIO; 695 struct load_weight *load = &p->se.load; 696 697 /* 698 * SCHED_IDLE tasks get minimal weight: 699 */ 700 if (idle_policy(p->policy)) { 701 load->weight = scale_load(WEIGHT_IDLEPRIO); 702 load->inv_weight = WMULT_IDLEPRIO; 703 p->se.runnable_weight = load->weight; 704 return; 705 } 706 707 /* 708 * SCHED_OTHER tasks have to update their load when changing their 709 * weight 710 */ 711 if (update_load && p->sched_class == &fair_sched_class) { 712 reweight_task(p, prio); 713 } else { 714 load->weight = scale_load(sched_prio_to_weight[prio]); 715 load->inv_weight = sched_prio_to_wmult[prio]; 716 p->se.runnable_weight = load->weight; 717 } 718 } 719 720 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 721 { 722 if (!(flags & ENQUEUE_NOCLOCK)) 723 update_rq_clock(rq); 724 725 if (!(flags & ENQUEUE_RESTORE)) 726 sched_info_queued(rq, p); 727 728 p->sched_class->enqueue_task(rq, p, flags); 729 } 730 731 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 732 { 733 if (!(flags & DEQUEUE_NOCLOCK)) 734 update_rq_clock(rq); 735 736 if (!(flags & DEQUEUE_SAVE)) 737 sched_info_dequeued(rq, p); 738 739 p->sched_class->dequeue_task(rq, p, flags); 740 } 741 742 void activate_task(struct rq *rq, struct task_struct *p, int flags) 743 { 744 if (task_contributes_to_load(p)) 745 rq->nr_uninterruptible--; 746 747 enqueue_task(rq, p, flags); 748 } 749 750 void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 751 { 752 if (task_contributes_to_load(p)) 753 rq->nr_uninterruptible++; 754 755 dequeue_task(rq, p, flags); 756 } 757 758 /* 759 * __normal_prio - return the priority that is based on the static prio 760 */ 761 static inline int __normal_prio(struct task_struct *p) 762 { 763 return p->static_prio; 764 } 765 766 /* 767 * Calculate the expected normal priority: i.e. priority 768 * without taking RT-inheritance into account. Might be 769 * boosted by interactivity modifiers. 
Changes upon fork, 770 * setprio syscalls, and whenever the interactivity 771 * estimator recalculates. 772 */ 773 static inline int normal_prio(struct task_struct *p) 774 { 775 int prio; 776 777 if (task_has_dl_policy(p)) 778 prio = MAX_DL_PRIO-1; 779 else if (task_has_rt_policy(p)) 780 prio = MAX_RT_PRIO-1 - p->rt_priority; 781 else 782 prio = __normal_prio(p); 783 return prio; 784 } 785 786 /* 787 * Calculate the current priority, i.e. the priority 788 * taken into account by the scheduler. This value might 789 * be boosted by RT tasks, or might be boosted by 790 * interactivity modifiers. Will be RT if the task got 791 * RT-boosted. If not then it returns p->normal_prio. 792 */ 793 static int effective_prio(struct task_struct *p) 794 { 795 p->normal_prio = normal_prio(p); 796 /* 797 * If we are RT tasks or we were boosted to RT priority, 798 * keep the priority unchanged. Otherwise, update priority 799 * to the normal priority: 800 */ 801 if (!rt_prio(p->prio)) 802 return p->normal_prio; 803 return p->prio; 804 } 805 806 /** 807 * task_curr - is this task currently executing on a CPU? 808 * @p: the task in question. 809 * 810 * Return: 1 if the task is currently executing. 0 otherwise. 811 */ 812 inline int task_curr(const struct task_struct *p) 813 { 814 return cpu_curr(task_cpu(p)) == p; 815 } 816 817 /* 818 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, 819 * use the balance_callback list if you want balancing. 820 * 821 * this means any call to check_class_changed() must be followed by a call to 822 * balance_callback(). 823 */ 824 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 825 const struct sched_class *prev_class, 826 int oldprio) 827 { 828 if (prev_class != p->sched_class) { 829 if (prev_class->switched_from) 830 prev_class->switched_from(rq, p); 831 832 p->sched_class->switched_to(rq, p); 833 } else if (oldprio != p->prio || dl_task(p)) 834 p->sched_class->prio_changed(rq, p, oldprio); 835 } 836 837 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 838 { 839 const struct sched_class *class; 840 841 if (p->sched_class == rq->curr->sched_class) { 842 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 843 } else { 844 for_each_class(class) { 845 if (class == rq->curr->sched_class) 846 break; 847 if (class == p->sched_class) { 848 resched_curr(rq); 849 break; 850 } 851 } 852 } 853 854 /* 855 * A queue event has occurred, and we're going to schedule. In 856 * this case, we can save a useless back to back clock update. 857 */ 858 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) 859 rq_clock_skip_update(rq); 860 } 861 862 #ifdef CONFIG_SMP 863 864 static inline bool is_per_cpu_kthread(struct task_struct *p) 865 { 866 if (!(p->flags & PF_KTHREAD)) 867 return false; 868 869 if (p->nr_cpus_allowed != 1) 870 return false; 871 872 return true; 873 } 874 875 /* 876 * Per-CPU kthreads are allowed to run on !active && online CPUs, see 877 * __set_cpus_allowed_ptr() and select_fallback_rq(). 878 */ 879 static inline bool is_cpu_allowed(struct task_struct *p, int cpu) 880 { 881 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 882 return false; 883 884 if (is_per_cpu_kthread(p)) 885 return cpu_online(cpu); 886 887 return cpu_active(cpu); 888 } 889 890 /* 891 * This is how migration works: 892 * 893 * 1) we invoke migration_cpu_stop() on the target CPU using 894 * stop_one_cpu().
895 * 2) stopper starts to run (implicitly forcing the migrated thread 896 * off the CPU) 897 * 3) it checks whether the migrated task is still in the wrong runqueue. 898 * 4) if it's in the wrong runqueue then the migration thread removes 899 * it and puts it into the right queue. 900 * 5) stopper completes and stop_one_cpu() returns and the migration 901 * is done. 902 */ 903 904 /* 905 * move_queued_task - move a queued task to new rq. 906 * 907 * Returns (locked) new rq. Old rq's lock is released. 908 */ 909 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, 910 struct task_struct *p, int new_cpu) 911 { 912 lockdep_assert_held(&rq->lock); 913 914 p->on_rq = TASK_ON_RQ_MIGRATING; 915 dequeue_task(rq, p, DEQUEUE_NOCLOCK); 916 set_task_cpu(p, new_cpu); 917 rq_unlock(rq, rf); 918 919 rq = cpu_rq(new_cpu); 920 921 rq_lock(rq, rf); 922 BUG_ON(task_cpu(p) != new_cpu); 923 enqueue_task(rq, p, 0); 924 p->on_rq = TASK_ON_RQ_QUEUED; 925 check_preempt_curr(rq, p, 0); 926 927 return rq; 928 } 929 930 struct migration_arg { 931 struct task_struct *task; 932 int dest_cpu; 933 }; 934 935 /* 936 * Move (not current) task off this CPU, onto the destination CPU. We're doing 937 * this because either it can't run here any more (set_cpus_allowed() 938 * away from this CPU, or CPU going down), or because we're 939 * attempting to rebalance this task on exec (sched_exec). 940 * 941 * So we race with normal scheduler movements, but that's OK, as long 942 * as the task is no longer on this CPU. 943 */ 944 static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, 945 struct task_struct *p, int dest_cpu) 946 { 947 /* Affinity changed (again). */ 948 if (!is_cpu_allowed(p, dest_cpu)) 949 return rq; 950 951 update_rq_clock(rq); 952 rq = move_queued_task(rq, rf, p, dest_cpu); 953 954 return rq; 955 } 956 957 /* 958 * migration_cpu_stop - this will be executed by a highprio stopper thread 959 * and performs thread migration by bumping thread off CPU then 960 * 'pushing' onto another runqueue. 961 */ 962 static int migration_cpu_stop(void *data) 963 { 964 struct migration_arg *arg = data; 965 struct task_struct *p = arg->task; 966 struct rq *rq = this_rq(); 967 struct rq_flags rf; 968 969 /* 970 * The original target CPU might have gone down and we might 971 * be on another CPU but it doesn't matter. 972 */ 973 local_irq_disable(); 974 /* 975 * We need to explicitly wake pending tasks before running 976 * __migrate_task() such that we will not miss enforcing cpus_allowed 977 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. 978 */ 979 sched_ttwu_pending(); 980 981 raw_spin_lock(&p->pi_lock); 982 rq_lock(rq, &rf); 983 /* 984 * If task_rq(p) != rq, it cannot be migrated here, because we're 985 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because 986 * we're holding p->pi_lock. 987 */ 988 if (task_rq(p) == rq) { 989 if (task_on_rq_queued(p)) 990 rq = __migrate_task(rq, &rf, p, arg->dest_cpu); 991 else 992 p->wake_cpu = arg->dest_cpu; 993 } 994 rq_unlock(rq, &rf); 995 raw_spin_unlock(&p->pi_lock); 996 997 local_irq_enable(); 998 return 0; 999 } 1000 1001 /* 1002 * sched_class::set_cpus_allowed must do the below, but is not required to 1003 * actually call this function. 
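 *
 * A class with no extra bookkeeping can use it directly as its method, or
 * wrap it; an illustrative sketch of a hypothetical class (not code from
 * this file):
 *
 *	static void set_cpus_allowed_foo(struct task_struct *p,
 *					 const struct cpumask *new_mask)
 *	{
 *		... class-specific fixups, e.g. push/pull bookkeeping ...
 *		set_cpus_allowed_common(p, new_mask);
 *	}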
1004 */ 1005 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) 1006 { 1007 cpumask_copy(&p->cpus_allowed, new_mask); 1008 p->nr_cpus_allowed = cpumask_weight(new_mask); 1009 } 1010 1011 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 1012 { 1013 struct rq *rq = task_rq(p); 1014 bool queued, running; 1015 1016 lockdep_assert_held(&p->pi_lock); 1017 1018 queued = task_on_rq_queued(p); 1019 running = task_current(rq, p); 1020 1021 if (queued) { 1022 /* 1023 * Because __kthread_bind() calls this on blocked tasks without 1024 * holding rq->lock. 1025 */ 1026 lockdep_assert_held(&rq->lock); 1027 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); 1028 } 1029 if (running) 1030 put_prev_task(rq, p); 1031 1032 p->sched_class->set_cpus_allowed(p, new_mask); 1033 1034 if (queued) 1035 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); 1036 if (running) 1037 set_curr_task(rq, p); 1038 } 1039 1040 /* 1041 * Change a given task's CPU affinity. Migrate the thread to a 1042 * proper CPU and schedule it away if the CPU it's executing on 1043 * is removed from the allowed bitmask. 1044 * 1045 * NOTE: the caller must have a valid reference to the task, the 1046 * task must not exit() & deallocate itself prematurely. The 1047 * call is not atomic; no spinlocks may be held. 1048 */ 1049 static int __set_cpus_allowed_ptr(struct task_struct *p, 1050 const struct cpumask *new_mask, bool check) 1051 { 1052 const struct cpumask *cpu_valid_mask = cpu_active_mask; 1053 unsigned int dest_cpu; 1054 struct rq_flags rf; 1055 struct rq *rq; 1056 int ret = 0; 1057 1058 rq = task_rq_lock(p, &rf); 1059 update_rq_clock(rq); 1060 1061 if (p->flags & PF_KTHREAD) { 1062 /* 1063 * Kernel threads are allowed on online && !active CPUs 1064 */ 1065 cpu_valid_mask = cpu_online_mask; 1066 } 1067 1068 /* 1069 * Must re-check here, to close a race against __kthread_bind(), 1070 * sched_setaffinity() is not guaranteed to observe the flag. 1071 */ 1072 if (check && (p->flags & PF_NO_SETAFFINITY)) { 1073 ret = -EINVAL; 1074 goto out; 1075 } 1076 1077 if (cpumask_equal(&p->cpus_allowed, new_mask)) 1078 goto out; 1079 1080 if (!cpumask_intersects(new_mask, cpu_valid_mask)) { 1081 ret = -EINVAL; 1082 goto out; 1083 } 1084 1085 do_set_cpus_allowed(p, new_mask); 1086 1087 if (p->flags & PF_KTHREAD) { 1088 /* 1089 * For kernel threads that do indeed end up on online && 1090 * !active we want to ensure they are strict per-CPU threads. 1091 */ 1092 WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && 1093 !cpumask_intersects(new_mask, cpu_active_mask) && 1094 p->nr_cpus_allowed != 1); 1095 } 1096 1097 /* Can the task run on the task's current CPU? If so, we're done */ 1098 if (cpumask_test_cpu(task_cpu(p), new_mask)) 1099 goto out; 1100 1101 dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); 1102 if (task_running(rq, p) || p->state == TASK_WAKING) { 1103 struct migration_arg arg = { p, dest_cpu }; 1104 /* Need help from migration thread: drop lock and wait. */ 1105 task_rq_unlock(rq, p, &rf); 1106 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 1107 tlb_migrate_finish(p->mm); 1108 return 0; 1109 } else if (task_on_rq_queued(p)) { 1110 /* 1111 * OK, since we're going to drop the lock immediately 1112 * afterwards anyway. 
1113 */ 1114 rq = move_queued_task(rq, &rf, p, dest_cpu); 1115 } 1116 out: 1117 task_rq_unlock(rq, p, &rf); 1118 1119 return ret; 1120 } 1121 1122 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 1123 { 1124 return __set_cpus_allowed_ptr(p, new_mask, false); 1125 } 1126 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 1127 1128 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1129 { 1130 #ifdef CONFIG_SCHED_DEBUG 1131 /* 1132 * We should never call set_task_cpu() on a blocked task, 1133 * ttwu() will sort out the placement. 1134 */ 1135 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 1136 !p->on_rq); 1137 1138 /* 1139 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, 1140 * because schedstat_wait_{start,end} rebase migrating task's wait_start 1141 * time relying on p->on_rq. 1142 */ 1143 WARN_ON_ONCE(p->state == TASK_RUNNING && 1144 p->sched_class == &fair_sched_class && 1145 (p->on_rq && !task_on_rq_migrating(p))); 1146 1147 #ifdef CONFIG_LOCKDEP 1148 /* 1149 * The caller should hold either p->pi_lock or rq->lock, when changing 1150 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 1151 * 1152 * sched_move_task() holds both and thus holding either pins the cgroup, 1153 * see task_group(). 1154 * 1155 * Furthermore, all task_rq users should acquire both locks, see 1156 * task_rq_lock(). 1157 */ 1158 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 1159 lockdep_is_held(&task_rq(p)->lock))); 1160 #endif 1161 /* 1162 * Clearly, migrating tasks to offline CPUs is a fairly daft thing. 1163 */ 1164 WARN_ON_ONCE(!cpu_online(new_cpu)); 1165 #endif 1166 1167 trace_sched_migrate_task(p, new_cpu); 1168 1169 if (task_cpu(p) != new_cpu) { 1170 if (p->sched_class->migrate_task_rq) 1171 p->sched_class->migrate_task_rq(p, new_cpu); 1172 p->se.nr_migrations++; 1173 rseq_migrate(p); 1174 perf_event_task_migrate(p); 1175 } 1176 1177 __set_task_cpu(p, new_cpu); 1178 } 1179 1180 #ifdef CONFIG_NUMA_BALANCING 1181 static void __migrate_swap_task(struct task_struct *p, int cpu) 1182 { 1183 if (task_on_rq_queued(p)) { 1184 struct rq *src_rq, *dst_rq; 1185 struct rq_flags srf, drf; 1186 1187 src_rq = task_rq(p); 1188 dst_rq = cpu_rq(cpu); 1189 1190 rq_pin_lock(src_rq, &srf); 1191 rq_pin_lock(dst_rq, &drf); 1192 1193 p->on_rq = TASK_ON_RQ_MIGRATING; 1194 deactivate_task(src_rq, p, 0); 1195 set_task_cpu(p, cpu); 1196 activate_task(dst_rq, p, 0); 1197 p->on_rq = TASK_ON_RQ_QUEUED; 1198 check_preempt_curr(dst_rq, p, 0); 1199 1200 rq_unpin_lock(dst_rq, &drf); 1201 rq_unpin_lock(src_rq, &srf); 1202 1203 } else { 1204 /* 1205 * Task isn't running anymore; make it appear like we migrated 1206 * it before it went to sleep. This means on wakeup we make the 1207 * previous CPU our target instead of where it really is. 
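 *
 * (The stashed ->wake_cpu is later used as the starting point for
 *  select_task_rq() in try_to_wake_up().)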
1208 */ 1209 p->wake_cpu = cpu; 1210 } 1211 } 1212 1213 struct migration_swap_arg { 1214 struct task_struct *src_task, *dst_task; 1215 int src_cpu, dst_cpu; 1216 }; 1217 1218 static int migrate_swap_stop(void *data) 1219 { 1220 struct migration_swap_arg *arg = data; 1221 struct rq *src_rq, *dst_rq; 1222 int ret = -EAGAIN; 1223 1224 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) 1225 return -EAGAIN; 1226 1227 src_rq = cpu_rq(arg->src_cpu); 1228 dst_rq = cpu_rq(arg->dst_cpu); 1229 1230 double_raw_lock(&arg->src_task->pi_lock, 1231 &arg->dst_task->pi_lock); 1232 double_rq_lock(src_rq, dst_rq); 1233 1234 if (task_cpu(arg->dst_task) != arg->dst_cpu) 1235 goto unlock; 1236 1237 if (task_cpu(arg->src_task) != arg->src_cpu) 1238 goto unlock; 1239 1240 if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) 1241 goto unlock; 1242 1243 if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) 1244 goto unlock; 1245 1246 __migrate_swap_task(arg->src_task, arg->dst_cpu); 1247 __migrate_swap_task(arg->dst_task, arg->src_cpu); 1248 1249 ret = 0; 1250 1251 unlock: 1252 double_rq_unlock(src_rq, dst_rq); 1253 raw_spin_unlock(&arg->dst_task->pi_lock); 1254 raw_spin_unlock(&arg->src_task->pi_lock); 1255 1256 return ret; 1257 } 1258 1259 /* 1260 * Cross migrate two tasks 1261 */ 1262 int migrate_swap(struct task_struct *cur, struct task_struct *p, 1263 int target_cpu, int curr_cpu) 1264 { 1265 struct migration_swap_arg arg; 1266 int ret = -EINVAL; 1267 1268 arg = (struct migration_swap_arg){ 1269 .src_task = cur, 1270 .src_cpu = curr_cpu, 1271 .dst_task = p, 1272 .dst_cpu = target_cpu, 1273 }; 1274 1275 if (arg.src_cpu == arg.dst_cpu) 1276 goto out; 1277 1278 /* 1279 * These three tests are all lockless; this is OK since all of them 1280 * will be re-checked with proper locks held further down the line. 1281 */ 1282 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) 1283 goto out; 1284 1285 if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) 1286 goto out; 1287 1288 if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) 1289 goto out; 1290 1291 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); 1292 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); 1293 1294 out: 1295 return ret; 1296 } 1297 #endif /* CONFIG_NUMA_BALANCING */ 1298 1299 /* 1300 * wait_task_inactive - wait for a thread to unschedule. 1301 * 1302 * If @match_state is nonzero, it's the @p->state value just checked and 1303 * not expected to change. If it changes, i.e. @p might have woken up, 1304 * then return zero. When we succeed in waiting for @p to be off its CPU, 1305 * we return a positive number (its total switch count). If a second call 1306 * a short while later returns the same number, the caller can be sure that 1307 * @p has remained unscheduled the whole time. 1308 * 1309 * The caller must ensure that the task *will* unschedule sometime soon, 1310 * else this function might spin for a *long* time. This function can't 1311 * be called with interrupts off, or it may introduce deadlock with 1312 * smp_call_function() if an IPI is sent by the same process we are 1313 * waiting to become inactive. 1314 */ 1315 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1316 { 1317 int running, queued; 1318 struct rq_flags rf; 1319 unsigned long ncsw; 1320 struct rq *rq; 1321 1322 for (;;) { 1323 /* 1324 * We do the initial early heuristics without holding 1325 * any task-queue locks at all. 
We'll only try to get 1326 * the runqueue lock when things look like they will 1327 * work out! 1328 */ 1329 rq = task_rq(p); 1330 1331 /* 1332 * If the task is actively running on another CPU 1333 * still, just relax and busy-wait without holding 1334 * any locks. 1335 * 1336 * NOTE! Since we don't hold any locks, it's not 1337 * even sure that "rq" stays as the right runqueue! 1338 * But we don't care, since "task_running()" will 1339 * return false if the runqueue has changed and p 1340 * is actually now running somewhere else! 1341 */ 1342 while (task_running(rq, p)) { 1343 if (match_state && unlikely(p->state != match_state)) 1344 return 0; 1345 cpu_relax(); 1346 } 1347 1348 /* 1349 * Ok, time to look more closely! We need the rq 1350 * lock now, to be *sure*. If we're wrong, we'll 1351 * just go back and repeat. 1352 */ 1353 rq = task_rq_lock(p, &rf); 1354 trace_sched_wait_task(p); 1355 running = task_running(rq, p); 1356 queued = task_on_rq_queued(p); 1357 ncsw = 0; 1358 if (!match_state || p->state == match_state) 1359 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1360 task_rq_unlock(rq, p, &rf); 1361 1362 /* 1363 * If it changed from the expected state, bail out now. 1364 */ 1365 if (unlikely(!ncsw)) 1366 break; 1367 1368 /* 1369 * Was it really running after all now that we 1370 * checked with the proper locks actually held? 1371 * 1372 * Oops. Go back and try again.. 1373 */ 1374 if (unlikely(running)) { 1375 cpu_relax(); 1376 continue; 1377 } 1378 1379 /* 1380 * It's not enough that it's not actively running, 1381 * it must be off the runqueue _entirely_, and not 1382 * preempted! 1383 * 1384 * So if it was still runnable (but just not actively 1385 * running right now), it's preempted, and we should 1386 * yield - it could be a while. 1387 */ 1388 if (unlikely(queued)) { 1389 ktime_t to = NSEC_PER_SEC / HZ; 1390 1391 set_current_state(TASK_UNINTERRUPTIBLE); 1392 schedule_hrtimeout(&to, HRTIMER_MODE_REL); 1393 continue; 1394 } 1395 1396 /* 1397 * Ahh, all good. It wasn't running, and it wasn't 1398 * runnable, which means that it will never become 1399 * running in the future either. We're all done! 1400 */ 1401 break; 1402 } 1403 1404 return ncsw; 1405 } 1406 1407 /*** 1408 * kick_process - kick a running thread to enter/exit the kernel 1409 * @p: the to-be-kicked thread 1410 * 1411 * Cause a process which is running on another CPU to enter 1412 * kernel-mode, without any delay. (to get signals handled.) 1413 * 1414 * NOTE: this function doesn't have to take the runqueue lock, 1415 * because all it wants to ensure is that the remote task enters 1416 * the kernel. If the IPI races and the task has been migrated 1417 * to another CPU then no harm is done and the purpose has been 1418 * achieved as well. 1419 */ 1420 void kick_process(struct task_struct *p) 1421 { 1422 int cpu; 1423 1424 preempt_disable(); 1425 cpu = task_cpu(p); 1426 if ((cpu != smp_processor_id()) && task_curr(p)) 1427 smp_send_reschedule(cpu); 1428 preempt_enable(); 1429 } 1430 EXPORT_SYMBOL_GPL(kick_process); 1431 1432 /* 1433 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1434 * 1435 * A few notes on cpu_active vs cpu_online: 1436 * 1437 * - cpu_active must be a subset of cpu_online 1438 * 1439 * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, 1440 * see __set_cpus_allowed_ptr(). At this point the newly online 1441 * CPU isn't yet part of the sched domains, and balancing will not 1442 * see it. 
1443 * 1444 * - on CPU-down we clear cpu_active() to mask the sched domains and 1445 * avoid the load balancer to place new tasks on the to be removed 1446 * CPU. Existing tasks will remain running there and will be taken 1447 * off. 1448 * 1449 * This means that fallback selection must not select !active CPUs. 1450 * And can assume that any active CPU must be online. Conversely 1451 * select_task_rq() below may allow selection of !active CPUs in order 1452 * to satisfy the above rules. 1453 */ 1454 static int select_fallback_rq(int cpu, struct task_struct *p) 1455 { 1456 int nid = cpu_to_node(cpu); 1457 const struct cpumask *nodemask = NULL; 1458 enum { cpuset, possible, fail } state = cpuset; 1459 int dest_cpu; 1460 1461 /* 1462 * If the node that the CPU is on has been offlined, cpu_to_node() 1463 * will return -1. There is no CPU on the node, and we should 1464 * select the CPU on the other node. 1465 */ 1466 if (nid != -1) { 1467 nodemask = cpumask_of_node(nid); 1468 1469 /* Look for allowed, online CPU in same node. */ 1470 for_each_cpu(dest_cpu, nodemask) { 1471 if (!cpu_active(dest_cpu)) 1472 continue; 1473 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 1474 return dest_cpu; 1475 } 1476 } 1477 1478 for (;;) { 1479 /* Any allowed, online CPU? */ 1480 for_each_cpu(dest_cpu, &p->cpus_allowed) { 1481 if (!is_cpu_allowed(p, dest_cpu)) 1482 continue; 1483 1484 goto out; 1485 } 1486 1487 /* No more Mr. Nice Guy. */ 1488 switch (state) { 1489 case cpuset: 1490 if (IS_ENABLED(CONFIG_CPUSETS)) { 1491 cpuset_cpus_allowed_fallback(p); 1492 state = possible; 1493 break; 1494 } 1495 /* Fall-through */ 1496 case possible: 1497 do_set_cpus_allowed(p, cpu_possible_mask); 1498 state = fail; 1499 break; 1500 1501 case fail: 1502 BUG(); 1503 break; 1504 } 1505 } 1506 1507 out: 1508 if (state != cpuset) { 1509 /* 1510 * Don't tell them about moving exiting tasks or 1511 * kernel threads (both mm NULL), since they never 1512 * leave kernel. 1513 */ 1514 if (p->mm && printk_ratelimit()) { 1515 printk_deferred("process %d (%s) no longer affine to cpu%d\n", 1516 task_pid_nr(p), p->comm, cpu); 1517 } 1518 } 1519 1520 return dest_cpu; 1521 } 1522 1523 /* 1524 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1525 */ 1526 static inline 1527 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1528 { 1529 lockdep_assert_held(&p->pi_lock); 1530 1531 if (p->nr_cpus_allowed > 1) 1532 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1533 else 1534 cpu = cpumask_any(&p->cpus_allowed); 1535 1536 /* 1537 * In order not to call set_task_cpu() on a blocking task we need 1538 * to rely on ttwu() to place the task on a valid ->cpus_allowed 1539 * CPU. 1540 * 1541 * Since this is common to all placement strategies, this lives here. 1542 * 1543 * [ this allows ->select_task() to simply return task_cpu(p) and 1544 * not worry about this generic constraint ] 1545 */ 1546 if (unlikely(!is_cpu_allowed(p, cpu))) 1547 cpu = select_fallback_rq(task_cpu(p), p); 1548 1549 return cpu; 1550 } 1551 1552 static void update_avg(u64 *avg, u64 sample) 1553 { 1554 s64 diff = sample - *avg; 1555 *avg += diff >> 3; 1556 } 1557 1558 void sched_set_stop_task(int cpu, struct task_struct *stop) 1559 { 1560 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 1561 struct task_struct *old_stop = cpu_rq(cpu)->stop; 1562 1563 if (stop) { 1564 /* 1565 * Make it appear like a SCHED_FIFO task, its something 1566 * userspace knows about and won't get confused about. 
1567 * 1568 * Also, it will make PI more or less work without too 1569 * much confusion -- but then, stop work should not 1570 * rely on PI working anyway. 1571 */ 1572 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); 1573 1574 stop->sched_class = &stop_sched_class; 1575 } 1576 1577 cpu_rq(cpu)->stop = stop; 1578 1579 if (old_stop) { 1580 /* 1581 * Reset it back to a normal scheduling class so that 1582 * it can die in pieces. 1583 */ 1584 old_stop->sched_class = &rt_sched_class; 1585 } 1586 } 1587 1588 #else 1589 1590 static inline int __set_cpus_allowed_ptr(struct task_struct *p, 1591 const struct cpumask *new_mask, bool check) 1592 { 1593 return set_cpus_allowed_ptr(p, new_mask); 1594 } 1595 1596 #endif /* CONFIG_SMP */ 1597 1598 static void 1599 ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 1600 { 1601 struct rq *rq; 1602 1603 if (!schedstat_enabled()) 1604 return; 1605 1606 rq = this_rq(); 1607 1608 #ifdef CONFIG_SMP 1609 if (cpu == rq->cpu) { 1610 __schedstat_inc(rq->ttwu_local); 1611 __schedstat_inc(p->se.statistics.nr_wakeups_local); 1612 } else { 1613 struct sched_domain *sd; 1614 1615 __schedstat_inc(p->se.statistics.nr_wakeups_remote); 1616 rcu_read_lock(); 1617 for_each_domain(rq->cpu, sd) { 1618 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 1619 __schedstat_inc(sd->ttwu_wake_remote); 1620 break; 1621 } 1622 } 1623 rcu_read_unlock(); 1624 } 1625 1626 if (wake_flags & WF_MIGRATED) 1627 __schedstat_inc(p->se.statistics.nr_wakeups_migrate); 1628 #endif /* CONFIG_SMP */ 1629 1630 __schedstat_inc(rq->ttwu_count); 1631 __schedstat_inc(p->se.statistics.nr_wakeups); 1632 1633 if (wake_flags & WF_SYNC) 1634 __schedstat_inc(p->se.statistics.nr_wakeups_sync); 1635 } 1636 1637 static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1638 { 1639 activate_task(rq, p, en_flags); 1640 p->on_rq = TASK_ON_RQ_QUEUED; 1641 1642 /* If a worker is waking up, notify the workqueue: */ 1643 if (p->flags & PF_WQ_WORKER) 1644 wq_worker_waking_up(p, cpu_of(rq)); 1645 } 1646 1647 /* 1648 * Mark the task runnable and perform wakeup-preemption. 1649 */ 1650 static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, 1651 struct rq_flags *rf) 1652 { 1653 check_preempt_curr(rq, p, wake_flags); 1654 p->state = TASK_RUNNING; 1655 trace_sched_wakeup(p); 1656 1657 #ifdef CONFIG_SMP 1658 if (p->sched_class->task_woken) { 1659 /* 1660 * Our task @p is fully woken up and running; so it's safe to 1661 * drop the rq->lock, hereafter rq is only used for statistics.
1662 */ 1663 rq_unpin_lock(rq, rf); 1664 p->sched_class->task_woken(rq, p); 1665 rq_repin_lock(rq, rf); 1666 } 1667 1668 if (rq->idle_stamp) { 1669 u64 delta = rq_clock(rq) - rq->idle_stamp; 1670 u64 max = 2*rq->max_idle_balance_cost; 1671 1672 update_avg(&rq->avg_idle, delta); 1673 1674 if (rq->avg_idle > max) 1675 rq->avg_idle = max; 1676 1677 rq->idle_stamp = 0; 1678 } 1679 #endif 1680 } 1681 1682 static void 1683 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, 1684 struct rq_flags *rf) 1685 { 1686 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; 1687 1688 lockdep_assert_held(&rq->lock); 1689 1690 #ifdef CONFIG_SMP 1691 if (p->sched_contributes_to_load) 1692 rq->nr_uninterruptible--; 1693 1694 if (wake_flags & WF_MIGRATED) 1695 en_flags |= ENQUEUE_MIGRATED; 1696 #endif 1697 1698 ttwu_activate(rq, p, en_flags); 1699 ttwu_do_wakeup(rq, p, wake_flags, rf); 1700 } 1701 1702 /* 1703 * Called in case the task @p isn't fully descheduled from its runqueue, 1704 * in this case we must do a remote wakeup. Its a 'light' wakeup though, 1705 * since all we need to do is flip p->state to TASK_RUNNING, since 1706 * the task is still ->on_rq. 1707 */ 1708 static int ttwu_remote(struct task_struct *p, int wake_flags) 1709 { 1710 struct rq_flags rf; 1711 struct rq *rq; 1712 int ret = 0; 1713 1714 rq = __task_rq_lock(p, &rf); 1715 if (task_on_rq_queued(p)) { 1716 /* check_preempt_curr() may use rq clock */ 1717 update_rq_clock(rq); 1718 ttwu_do_wakeup(rq, p, wake_flags, &rf); 1719 ret = 1; 1720 } 1721 __task_rq_unlock(rq, &rf); 1722 1723 return ret; 1724 } 1725 1726 #ifdef CONFIG_SMP 1727 void sched_ttwu_pending(void) 1728 { 1729 struct rq *rq = this_rq(); 1730 struct llist_node *llist = llist_del_all(&rq->wake_list); 1731 struct task_struct *p, *t; 1732 struct rq_flags rf; 1733 1734 if (!llist) 1735 return; 1736 1737 rq_lock_irqsave(rq, &rf); 1738 update_rq_clock(rq); 1739 1740 llist_for_each_entry_safe(p, t, llist, wake_entry) 1741 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); 1742 1743 rq_unlock_irqrestore(rq, &rf); 1744 } 1745 1746 void scheduler_ipi(void) 1747 { 1748 /* 1749 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting 1750 * TIF_NEED_RESCHED remotely (for the first time) will also send 1751 * this IPI. 1752 */ 1753 preempt_fold_need_resched(); 1754 1755 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 1756 return; 1757 1758 /* 1759 * Not all reschedule IPI handlers call irq_enter/irq_exit, since 1760 * traditionally all their work was done from the interrupt return 1761 * path. Now that we actually do some work, we need to make sure 1762 * we do call them. 1763 * 1764 * Some archs already do call them, luckily irq_enter/exit nest 1765 * properly. 1766 * 1767 * Arguably we should visit all archs and update all handlers, 1768 * however a fair share of IPIs are still resched only so this would 1769 * somewhat pessimize the simple resched case. 1770 */ 1771 irq_enter(); 1772 sched_ttwu_pending(); 1773 1774 /* 1775 * Check if someone kicked us for doing the nohz idle load balance. 
1776 */ 1777 if (unlikely(got_nohz_idle_kick())) { 1778 this_rq()->idle_balance = 1; 1779 raise_softirq_irqoff(SCHED_SOFTIRQ); 1780 } 1781 irq_exit(); 1782 } 1783 1784 static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) 1785 { 1786 struct rq *rq = cpu_rq(cpu); 1787 1788 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); 1789 1790 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { 1791 if (!set_nr_if_polling(rq->idle)) 1792 smp_send_reschedule(cpu); 1793 else 1794 trace_sched_wake_idle_without_ipi(cpu); 1795 } 1796 } 1797 1798 void wake_up_if_idle(int cpu) 1799 { 1800 struct rq *rq = cpu_rq(cpu); 1801 struct rq_flags rf; 1802 1803 rcu_read_lock(); 1804 1805 if (!is_idle_task(rcu_dereference(rq->curr))) 1806 goto out; 1807 1808 if (set_nr_if_polling(rq->idle)) { 1809 trace_sched_wake_idle_without_ipi(cpu); 1810 } else { 1811 rq_lock_irqsave(rq, &rf); 1812 if (is_idle_task(rq->curr)) 1813 smp_send_reschedule(cpu); 1814 /* Else CPU is not idle, do nothing here: */ 1815 rq_unlock_irqrestore(rq, &rf); 1816 } 1817 1818 out: 1819 rcu_read_unlock(); 1820 } 1821 1822 bool cpus_share_cache(int this_cpu, int that_cpu) 1823 { 1824 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1825 } 1826 #endif /* CONFIG_SMP */ 1827 1828 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) 1829 { 1830 struct rq *rq = cpu_rq(cpu); 1831 struct rq_flags rf; 1832 1833 #if defined(CONFIG_SMP) 1834 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 1835 sched_clock_cpu(cpu); /* Sync clocks across CPUs */ 1836 ttwu_queue_remote(p, cpu, wake_flags); 1837 return; 1838 } 1839 #endif 1840 1841 rq_lock(rq, &rf); 1842 update_rq_clock(rq); 1843 ttwu_do_activate(rq, p, wake_flags, &rf); 1844 rq_unlock(rq, &rf); 1845 } 1846 1847 /* 1848 * Notes on Program-Order guarantees on SMP systems. 1849 * 1850 * MIGRATION 1851 * 1852 * The basic program-order guarantee on SMP systems is that when a task [t] 1853 * migrates, all its activity on its old CPU [c0] happens-before any subsequent 1854 * execution on its new CPU [c1]. 1855 * 1856 * For migration (of runnable tasks) this is provided by the following means: 1857 * 1858 * A) UNLOCK of the rq(c0)->lock scheduling out task t 1859 * B) migration for t is required to synchronize *both* rq(c0)->lock and 1860 * rq(c1)->lock (if not at the same time, then in that order). 1861 * C) LOCK of the rq(c1)->lock scheduling in task 1862 * 1863 * Release/acquire chaining guarantees that B happens after A and C after B. 1864 * Note: the CPU doing B need not be c0 or c1 1865 * 1866 * Example: 1867 * 1868 * CPU0 CPU1 CPU2 1869 * 1870 * LOCK rq(0)->lock 1871 * sched-out X 1872 * sched-in Y 1873 * UNLOCK rq(0)->lock 1874 * 1875 * LOCK rq(0)->lock // orders against CPU0 1876 * dequeue X 1877 * UNLOCK rq(0)->lock 1878 * 1879 * LOCK rq(1)->lock 1880 * enqueue X 1881 * UNLOCK rq(1)->lock 1882 * 1883 * LOCK rq(1)->lock // orders against CPU2 1884 * sched-out Z 1885 * sched-in X 1886 * UNLOCK rq(1)->lock 1887 * 1888 * 1889 * BLOCKING -- aka. SLEEP + WAKEUP 1890 * 1891 * For blocking we (obviously) need to provide the same guarantee as for 1892 * migration. However the means are completely different as there is no lock 1893 * chain to provide order. 
Instead we do: 1894 * 1895 * 1) smp_store_release(X->on_cpu, 0) 1896 * 2) smp_cond_load_acquire(!X->on_cpu) 1897 * 1898 * Example: 1899 * 1900 * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) 1901 * 1902 * LOCK rq(0)->lock LOCK X->pi_lock 1903 * dequeue X 1904 * sched-out X 1905 * smp_store_release(X->on_cpu, 0); 1906 * 1907 * smp_cond_load_acquire(&X->on_cpu, !VAL); 1908 * X->state = WAKING 1909 * set_task_cpu(X,2) 1910 * 1911 * LOCK rq(2)->lock 1912 * enqueue X 1913 * X->state = RUNNING 1914 * UNLOCK rq(2)->lock 1915 * 1916 * LOCK rq(2)->lock // orders against CPU1 1917 * sched-out Z 1918 * sched-in X 1919 * UNLOCK rq(2)->lock 1920 * 1921 * UNLOCK X->pi_lock 1922 * UNLOCK rq(0)->lock 1923 * 1924 * 1925 * However, for wakeups there is a second guarantee we must provide, namely we 1926 * must ensure that CONDITION=1 done by the caller can not be reordered with 1927 * accesses to the task state; see try_to_wake_up() and set_current_state(). 1928 */ 1929 1930 /** 1931 * try_to_wake_up - wake up a thread 1932 * @p: the thread to be awakened 1933 * @state: the mask of task states that can be woken 1934 * @wake_flags: wake modifier flags (WF_*) 1935 * 1936 * If (@state & @p->state) @p->state = TASK_RUNNING. 1937 * 1938 * If the task was not queued/runnable, also place it back on a runqueue. 1939 * 1940 * Atomic against schedule() which would dequeue a task, also see 1941 * set_current_state(). 1942 * 1943 * This function executes a full memory barrier before accessing the task 1944 * state; see set_current_state(). 1945 * 1946 * Return: %true if @p->state changes (an actual wakeup was done), 1947 * %false otherwise. 1948 */ 1949 static int 1950 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 1951 { 1952 unsigned long flags; 1953 int cpu, success = 0; 1954 1955 /* 1956 * If we are going to wake up a thread waiting for CONDITION we 1957 * need to ensure that CONDITION=1 done by the caller can not be 1958 * reordered with p->state check below. This pairs with mb() in 1959 * set_current_state() the waiting thread does. 1960 */ 1961 raw_spin_lock_irqsave(&p->pi_lock, flags); 1962 smp_mb__after_spinlock(); 1963 if (!(p->state & state)) 1964 goto out; 1965 1966 trace_sched_waking(p); 1967 1968 /* We're going to change ->state: */ 1969 success = 1; 1970 cpu = task_cpu(p); 1971 1972 /* 1973 * Ensure we load p->on_rq _after_ p->state, otherwise it would 1974 * be possible to, falsely, observe p->on_rq == 0 and get stuck 1975 * in smp_cond_load_acquire() below. 1976 * 1977 * sched_ttwu_pending() try_to_wake_up() 1978 * STORE p->on_rq = 1 LOAD p->state 1979 * UNLOCK rq->lock 1980 * 1981 * __schedule() (switch to task 'p') 1982 * LOCK rq->lock smp_rmb(); 1983 * smp_mb__after_spinlock(); 1984 * UNLOCK rq->lock 1985 * 1986 * [task p] 1987 * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq 1988 * 1989 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in 1990 * __schedule(). See the comment for smp_mb__after_spinlock(). 1991 */ 1992 smp_rmb(); 1993 if (p->on_rq && ttwu_remote(p, wake_flags)) 1994 goto stat; 1995 1996 #ifdef CONFIG_SMP 1997 /* 1998 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be 1999 * possible to, falsely, observe p->on_cpu == 0. 2000 * 2001 * One must be running (->on_cpu == 1) in order to remove oneself 2002 * from the runqueue. 
2003 * 2004 * __schedule() (switch to task 'p') try_to_wake_up() 2005 * STORE p->on_cpu = 1 LOAD p->on_rq 2006 * UNLOCK rq->lock 2007 * 2008 * __schedule() (put 'p' to sleep) 2009 * LOCK rq->lock smp_rmb(); 2010 * smp_mb__after_spinlock(); 2011 * STORE p->on_rq = 0 LOAD p->on_cpu 2012 * 2013 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in 2014 * __schedule(). See the comment for smp_mb__after_spinlock(). 2015 */ 2016 smp_rmb(); 2017 2018 /* 2019 * If the owning (remote) CPU is still in the middle of schedule() with 2020 * this task as prev, wait until its done referencing the task. 2021 * 2022 * Pairs with the smp_store_release() in finish_task(). 2023 * 2024 * This ensures that tasks getting woken will be fully ordered against 2025 * their previous state and preserve Program Order. 2026 */ 2027 smp_cond_load_acquire(&p->on_cpu, !VAL); 2028 2029 p->sched_contributes_to_load = !!task_contributes_to_load(p); 2030 p->state = TASK_WAKING; 2031 2032 if (p->in_iowait) { 2033 delayacct_blkio_end(p); 2034 atomic_dec(&task_rq(p)->nr_iowait); 2035 } 2036 2037 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); 2038 if (task_cpu(p) != cpu) { 2039 wake_flags |= WF_MIGRATED; 2040 set_task_cpu(p, cpu); 2041 } 2042 2043 #else /* CONFIG_SMP */ 2044 2045 if (p->in_iowait) { 2046 delayacct_blkio_end(p); 2047 atomic_dec(&task_rq(p)->nr_iowait); 2048 } 2049 2050 #endif /* CONFIG_SMP */ 2051 2052 ttwu_queue(p, cpu, wake_flags); 2053 stat: 2054 ttwu_stat(p, cpu, wake_flags); 2055 out: 2056 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2057 2058 return success; 2059 } 2060 2061 /** 2062 * try_to_wake_up_local - try to wake up a local task with rq lock held 2063 * @p: the thread to be awakened 2064 * @rf: request-queue flags for pinning 2065 * 2066 * Put @p on the run-queue if it's not already there. The caller must 2067 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2068 * the current task. 2069 */ 2070 static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) 2071 { 2072 struct rq *rq = task_rq(p); 2073 2074 if (WARN_ON_ONCE(rq != this_rq()) || 2075 WARN_ON_ONCE(p == current)) 2076 return; 2077 2078 lockdep_assert_held(&rq->lock); 2079 2080 if (!raw_spin_trylock(&p->pi_lock)) { 2081 /* 2082 * This is OK, because current is on_cpu, which avoids it being 2083 * picked for load-balance and preemption/IRQs are still 2084 * disabled avoiding further scheduler activity on it and we've 2085 * not yet picked a replacement task. 2086 */ 2087 rq_unlock(rq, rf); 2088 raw_spin_lock(&p->pi_lock); 2089 rq_relock(rq, rf); 2090 } 2091 2092 if (!(p->state & TASK_NORMAL)) 2093 goto out; 2094 2095 trace_sched_waking(p); 2096 2097 if (!task_on_rq_queued(p)) { 2098 if (p->in_iowait) { 2099 delayacct_blkio_end(p); 2100 atomic_dec(&rq->nr_iowait); 2101 } 2102 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); 2103 } 2104 2105 ttwu_do_wakeup(rq, p, 0, rf); 2106 ttwu_stat(p, smp_processor_id(), 0); 2107 out: 2108 raw_spin_unlock(&p->pi_lock); 2109 } 2110 2111 /** 2112 * wake_up_process - Wake up a specific process 2113 * @p: The process to be woken up. 2114 * 2115 * Attempt to wake up the nominated process and move it to the set of runnable 2116 * processes. 2117 * 2118 * Return: 1 if the process was woken up, 0 if it was already running. 2119 * 2120 * This function executes a full memory barrier before accessing the task state. 
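 *
 * The usual pairing with a sleeper looks roughly like this (a sketch of the
 * canonical pattern, typically run in a loop; CONDITION stands for the
 * caller's wakeup condition):
 *
 *	sleeper:
 *		set_current_state(TASK_UNINTERRUPTIBLE);
 *		if (!CONDITION)
 *			schedule();
 *		__set_current_state(TASK_RUNNING);
 *
 *	waker:
 *		CONDITION = 1;
 *		wake_up_process(p);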
2121 */ 2122 int wake_up_process(struct task_struct *p) 2123 { 2124 return try_to_wake_up(p, TASK_NORMAL, 0); 2125 } 2126 EXPORT_SYMBOL(wake_up_process); 2127 2128 int wake_up_state(struct task_struct *p, unsigned int state) 2129 { 2130 return try_to_wake_up(p, state, 0); 2131 } 2132 2133 /* 2134 * Perform scheduler related setup for a newly forked process p. 2135 * p is forked by current. 2136 * 2137 * __sched_fork() is basic setup used by init_idle() too: 2138 */ 2139 static void __sched_fork(unsigned long clone_flags, struct task_struct *p) 2140 { 2141 p->on_rq = 0; 2142 2143 p->se.on_rq = 0; 2144 p->se.exec_start = 0; 2145 p->se.sum_exec_runtime = 0; 2146 p->se.prev_sum_exec_runtime = 0; 2147 p->se.nr_migrations = 0; 2148 p->se.vruntime = 0; 2149 INIT_LIST_HEAD(&p->se.group_node); 2150 2151 #ifdef CONFIG_FAIR_GROUP_SCHED 2152 p->se.cfs_rq = NULL; 2153 #endif 2154 2155 #ifdef CONFIG_SCHEDSTATS 2156 /* Even if schedstat is disabled, there should not be garbage */ 2157 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2158 #endif 2159 2160 RB_CLEAR_NODE(&p->dl.rb_node); 2161 init_dl_task_timer(&p->dl); 2162 init_dl_inactive_task_timer(&p->dl); 2163 __dl_clear_params(p); 2164 2165 INIT_LIST_HEAD(&p->rt.run_list); 2166 p->rt.timeout = 0; 2167 p->rt.time_slice = sched_rr_timeslice; 2168 p->rt.on_rq = 0; 2169 p->rt.on_list = 0; 2170 2171 #ifdef CONFIG_PREEMPT_NOTIFIERS 2172 INIT_HLIST_HEAD(&p->preempt_notifiers); 2173 #endif 2174 2175 init_numa_balancing(clone_flags, p); 2176 } 2177 2178 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); 2179 2180 #ifdef CONFIG_NUMA_BALANCING 2181 2182 void set_numabalancing_state(bool enabled) 2183 { 2184 if (enabled) 2185 static_branch_enable(&sched_numa_balancing); 2186 else 2187 static_branch_disable(&sched_numa_balancing); 2188 } 2189 2190 #ifdef CONFIG_PROC_SYSCTL 2191 int sysctl_numa_balancing(struct ctl_table *table, int write, 2192 void __user *buffer, size_t *lenp, loff_t *ppos) 2193 { 2194 struct ctl_table t; 2195 int err; 2196 int state = static_branch_likely(&sched_numa_balancing); 2197 2198 if (write && !capable(CAP_SYS_ADMIN)) 2199 return -EPERM; 2200 2201 t = *table; 2202 t.data = &state; 2203 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 2204 if (err < 0) 2205 return err; 2206 if (write) 2207 set_numabalancing_state(state); 2208 return err; 2209 } 2210 #endif 2211 #endif 2212 2213 #ifdef CONFIG_SCHEDSTATS 2214 2215 DEFINE_STATIC_KEY_FALSE(sched_schedstats); 2216 static bool __initdata __sched_schedstats = false; 2217 2218 static void set_schedstats(bool enabled) 2219 { 2220 if (enabled) 2221 static_branch_enable(&sched_schedstats); 2222 else 2223 static_branch_disable(&sched_schedstats); 2224 } 2225 2226 void force_schedstat_enabled(void) 2227 { 2228 if (!schedstat_enabled()) { 2229 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); 2230 static_branch_enable(&sched_schedstats); 2231 } 2232 } 2233 2234 static int __init setup_schedstats(char *str) 2235 { 2236 int ret = 0; 2237 if (!str) 2238 goto out; 2239 2240 /* 2241 * This code is called before jump labels have been set up, so we can't 2242 * change the static branch directly just yet. Instead set a temporary 2243 * variable so init_schedstats() can do it later. 
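 * For example, booting with "schedstats=enable" on the kernel command line
 * reaches this parser via the __setup("schedstats=", ...) hook below, long
 * before jump labels are functional; init_schedstats() later applies the
 * value recorded here.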
2244 */ 2245 if (!strcmp(str, "enable")) { 2246 __sched_schedstats = true; 2247 ret = 1; 2248 } else if (!strcmp(str, "disable")) { 2249 __sched_schedstats = false; 2250 ret = 1; 2251 } 2252 out: 2253 if (!ret) 2254 pr_warn("Unable to parse schedstats=\n"); 2255 2256 return ret; 2257 } 2258 __setup("schedstats=", setup_schedstats); 2259 2260 static void __init init_schedstats(void) 2261 { 2262 set_schedstats(__sched_schedstats); 2263 } 2264 2265 #ifdef CONFIG_PROC_SYSCTL 2266 int sysctl_schedstats(struct ctl_table *table, int write, 2267 void __user *buffer, size_t *lenp, loff_t *ppos) 2268 { 2269 struct ctl_table t; 2270 int err; 2271 int state = static_branch_likely(&sched_schedstats); 2272 2273 if (write && !capable(CAP_SYS_ADMIN)) 2274 return -EPERM; 2275 2276 t = *table; 2277 t.data = &state; 2278 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 2279 if (err < 0) 2280 return err; 2281 if (write) 2282 set_schedstats(state); 2283 return err; 2284 } 2285 #endif /* CONFIG_PROC_SYSCTL */ 2286 #else /* !CONFIG_SCHEDSTATS */ 2287 static inline void init_schedstats(void) {} 2288 #endif /* CONFIG_SCHEDSTATS */ 2289 2290 /* 2291 * fork()/clone()-time setup: 2292 */ 2293 int sched_fork(unsigned long clone_flags, struct task_struct *p) 2294 { 2295 unsigned long flags; 2296 2297 __sched_fork(clone_flags, p); 2298 /* 2299 * We mark the process as NEW here. This guarantees that 2300 * nobody will actually run it, and a signal or other external 2301 * event cannot wake it up and insert it on the runqueue either. 2302 */ 2303 p->state = TASK_NEW; 2304 2305 /* 2306 * Make sure we do not leak PI boosting priority to the child. 2307 */ 2308 p->prio = current->normal_prio; 2309 2310 /* 2311 * Revert to default priority/policy on fork if requested. 2312 */ 2313 if (unlikely(p->sched_reset_on_fork)) { 2314 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 2315 p->policy = SCHED_NORMAL; 2316 p->static_prio = NICE_TO_PRIO(0); 2317 p->rt_priority = 0; 2318 } else if (PRIO_TO_NICE(p->static_prio) < 0) 2319 p->static_prio = NICE_TO_PRIO(0); 2320 2321 p->prio = p->normal_prio = __normal_prio(p); 2322 set_load_weight(p, false); 2323 2324 /* 2325 * We don't need the reset flag anymore after the fork. It has 2326 * fulfilled its duty: 2327 */ 2328 p->sched_reset_on_fork = 0; 2329 } 2330 2331 if (dl_prio(p->prio)) 2332 return -EAGAIN; 2333 else if (rt_prio(p->prio)) 2334 p->sched_class = &rt_sched_class; 2335 else 2336 p->sched_class = &fair_sched_class; 2337 2338 init_entity_runnable_average(&p->se); 2339 2340 /* 2341 * The child is not yet in the pid-hash so no cgroup attach races, 2342 * and the cgroup is pinned to this child due to cgroup_fork() 2343 * is ran before sched_fork(). 2344 * 2345 * Silence PROVE_RCU. 2346 */ 2347 raw_spin_lock_irqsave(&p->pi_lock, flags); 2348 /* 2349 * We're setting the CPU for the first time, we don't migrate, 2350 * so use __set_task_cpu(). 
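 * (Same reasoning as in wake_up_new_task() further down: the raw helper
 * skips the sched_class migration callbacks, which is fine because a
 * TASK_NEW task is not on any runqueue yet.)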
2351 */ 2352 __set_task_cpu(p, smp_processor_id()); 2353 if (p->sched_class->task_fork) 2354 p->sched_class->task_fork(p); 2355 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2356 2357 #ifdef CONFIG_SCHED_INFO 2358 if (likely(sched_info_on())) 2359 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2360 #endif 2361 #if defined(CONFIG_SMP) 2362 p->on_cpu = 0; 2363 #endif 2364 init_task_preempt_count(p); 2365 #ifdef CONFIG_SMP 2366 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2367 RB_CLEAR_NODE(&p->pushable_dl_tasks); 2368 #endif 2369 return 0; 2370 } 2371 2372 unsigned long to_ratio(u64 period, u64 runtime) 2373 { 2374 if (runtime == RUNTIME_INF) 2375 return BW_UNIT; 2376 2377 /* 2378 * Doing this here saves a lot of checks in all 2379 * the calling paths, and returning zero seems 2380 * safe for them anyway. 2381 */ 2382 if (period == 0) 2383 return 0; 2384 2385 return div64_u64(runtime << BW_SHIFT, period); 2386 } 2387 2388 /* 2389 * wake_up_new_task - wake up a newly created task for the first time. 2390 * 2391 * This function will do some initial scheduler statistics housekeeping 2392 * that must be done for every newly created context, then puts the task 2393 * on the runqueue and wakes it. 2394 */ 2395 void wake_up_new_task(struct task_struct *p) 2396 { 2397 struct rq_flags rf; 2398 struct rq *rq; 2399 2400 raw_spin_lock_irqsave(&p->pi_lock, rf.flags); 2401 p->state = TASK_RUNNING; 2402 #ifdef CONFIG_SMP 2403 /* 2404 * Fork balancing, do it here and not earlier because: 2405 * - cpus_allowed can change in the fork path 2406 * - any previously selected CPU might disappear through hotplug 2407 * 2408 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, 2409 * as we're not fully set-up yet. 2410 */ 2411 p->recent_used_cpu = task_cpu(p); 2412 __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 2413 #endif 2414 rq = __task_rq_lock(p, &rf); 2415 update_rq_clock(rq); 2416 post_init_entity_util_avg(&p->se); 2417 2418 activate_task(rq, p, ENQUEUE_NOCLOCK); 2419 p->on_rq = TASK_ON_RQ_QUEUED; 2420 trace_sched_wakeup_new(p); 2421 check_preempt_curr(rq, p, WF_FORK); 2422 #ifdef CONFIG_SMP 2423 if (p->sched_class->task_woken) { 2424 /* 2425 * Nothing relies on rq->lock after this, so its fine to 2426 * drop it. 
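 * The rq_unpin_lock()/rq_repin_lock() pair around the callback keeps the
 * lock-pinning bookkeeping consistent should ->task_woken() drop and retake
 * rq->lock.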
2427 */ 2428 rq_unpin_lock(rq, &rf); 2429 p->sched_class->task_woken(rq, p); 2430 rq_repin_lock(rq, &rf); 2431 } 2432 #endif 2433 task_rq_unlock(rq, p, &rf); 2434 } 2435 2436 #ifdef CONFIG_PREEMPT_NOTIFIERS 2437 2438 static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); 2439 2440 void preempt_notifier_inc(void) 2441 { 2442 static_branch_inc(&preempt_notifier_key); 2443 } 2444 EXPORT_SYMBOL_GPL(preempt_notifier_inc); 2445 2446 void preempt_notifier_dec(void) 2447 { 2448 static_branch_dec(&preempt_notifier_key); 2449 } 2450 EXPORT_SYMBOL_GPL(preempt_notifier_dec); 2451 2452 /** 2453 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2454 * @notifier: notifier struct to register 2455 */ 2456 void preempt_notifier_register(struct preempt_notifier *notifier) 2457 { 2458 if (!static_branch_unlikely(&preempt_notifier_key)) 2459 WARN(1, "registering preempt_notifier while notifiers disabled\n"); 2460 2461 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); 2462 } 2463 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2464 2465 /** 2466 * preempt_notifier_unregister - no longer interested in preemption notifications 2467 * @notifier: notifier struct to unregister 2468 * 2469 * This is *not* safe to call from within a preemption notifier. 2470 */ 2471 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2472 { 2473 hlist_del(¬ifier->link); 2474 } 2475 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2476 2477 static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) 2478 { 2479 struct preempt_notifier *notifier; 2480 2481 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2482 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2483 } 2484 2485 static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2486 { 2487 if (static_branch_unlikely(&preempt_notifier_key)) 2488 __fire_sched_in_preempt_notifiers(curr); 2489 } 2490 2491 static void 2492 __fire_sched_out_preempt_notifiers(struct task_struct *curr, 2493 struct task_struct *next) 2494 { 2495 struct preempt_notifier *notifier; 2496 2497 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2498 notifier->ops->sched_out(notifier, next); 2499 } 2500 2501 static __always_inline void 2502 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2503 struct task_struct *next) 2504 { 2505 if (static_branch_unlikely(&preempt_notifier_key)) 2506 __fire_sched_out_preempt_notifiers(curr, next); 2507 } 2508 2509 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2510 2511 static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2512 { 2513 } 2514 2515 static inline void 2516 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2517 struct task_struct *next) 2518 { 2519 } 2520 2521 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2522 2523 static inline void prepare_task(struct task_struct *next) 2524 { 2525 #ifdef CONFIG_SMP 2526 /* 2527 * Claim the task as running, we do this before switching to it 2528 * such that any running task will have this set. 2529 */ 2530 next->on_cpu = 1; 2531 #endif 2532 } 2533 2534 static inline void finish_task(struct task_struct *prev) 2535 { 2536 #ifdef CONFIG_SMP 2537 /* 2538 * After ->on_cpu is cleared, the task can be moved to a different CPU. 2539 * We must ensure this doesn't happen until the switch is completely 2540 * finished. 2541 * 2542 * In particular, the load of prev->state in finish_task_switch() must 2543 * happen before this. 
2544 * 2545 * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 2546 */ 2547 smp_store_release(&prev->on_cpu, 0); 2548 #endif 2549 } 2550 2551 static inline void 2552 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) 2553 { 2554 /* 2555 * Since the runqueue lock will be released by the next 2556 * task (which is an invalid locking op but in the case 2557 * of the scheduler it's an obvious special-case), so we 2558 * do an early lockdep release here: 2559 */ 2560 rq_unpin_lock(rq, rf); 2561 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2562 #ifdef CONFIG_DEBUG_SPINLOCK 2563 /* this is a valid case when another task releases the spinlock */ 2564 rq->lock.owner = next; 2565 #endif 2566 } 2567 2568 static inline void finish_lock_switch(struct rq *rq) 2569 { 2570 /* 2571 * If we are tracking spinlock dependencies then we have to 2572 * fix up the runqueue lock - which gets 'carried over' from 2573 * prev into current: 2574 */ 2575 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 2576 raw_spin_unlock_irq(&rq->lock); 2577 } 2578 2579 /* 2580 * NOP if the arch has not defined these: 2581 */ 2582 2583 #ifndef prepare_arch_switch 2584 # define prepare_arch_switch(next) do { } while (0) 2585 #endif 2586 2587 #ifndef finish_arch_post_lock_switch 2588 # define finish_arch_post_lock_switch() do { } while (0) 2589 #endif 2590 2591 /** 2592 * prepare_task_switch - prepare to switch tasks 2593 * @rq: the runqueue preparing to switch 2594 * @prev: the current task that is being switched out 2595 * @next: the task we are going to switch to. 2596 * 2597 * This is called with the rq lock held and interrupts off. It must 2598 * be paired with a subsequent finish_task_switch after the context 2599 * switch. 2600 * 2601 * prepare_task_switch sets up locking and calls architecture specific 2602 * hooks. 2603 */ 2604 static inline void 2605 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2606 struct task_struct *next) 2607 { 2608 kcov_prepare_switch(prev); 2609 sched_info_switch(rq, prev, next); 2610 perf_event_task_sched_out(prev, next); 2611 rseq_preempt(prev); 2612 fire_sched_out_preempt_notifiers(prev, next); 2613 prepare_task(next); 2614 prepare_arch_switch(next); 2615 } 2616 2617 /** 2618 * finish_task_switch - clean up after a task-switch 2619 * @prev: the thread we just switched away from. 2620 * 2621 * finish_task_switch must be called after the context switch, paired 2622 * with a prepare_task_switch call before the context switch. 2623 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2624 * and do any other architecture-specific cleanup actions. 2625 * 2626 * Note that we may have delayed dropping an mm in context_switch(). If 2627 * so, we finish that here outside of the runqueue lock. (Doing it 2628 * with the lock held can cause deadlocks; see schedule() for 2629 * details.) 2630 * 2631 * The context switch have flipped the stack from under us and restored the 2632 * local variables which were saved when this task called schedule() in the 2633 * past. prev == current is still correct but we need to recalculate this_rq 2634 * because prev may have moved to another CPU. 
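 * That recalculated runqueue is also the return value, which is why
 * schedule_tail() and context_switch() use what finish_task_switch() hands
 * back rather than a pointer captured before switch_to().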
2635 */ 2636 static struct rq *finish_task_switch(struct task_struct *prev) 2637 __releases(rq->lock) 2638 { 2639 struct rq *rq = this_rq(); 2640 struct mm_struct *mm = rq->prev_mm; 2641 long prev_state; 2642 2643 /* 2644 * The previous task will have left us with a preempt_count of 2 2645 * because it left us after: 2646 * 2647 * schedule() 2648 * preempt_disable(); // 1 2649 * __schedule() 2650 * raw_spin_lock_irq(&rq->lock) // 2 2651 * 2652 * Also, see FORK_PREEMPT_COUNT. 2653 */ 2654 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, 2655 "corrupted preempt_count: %s/%d/0x%x\n", 2656 current->comm, current->pid, preempt_count())) 2657 preempt_count_set(FORK_PREEMPT_COUNT); 2658 2659 rq->prev_mm = NULL; 2660 2661 /* 2662 * A task struct has one reference for the use as "current". 2663 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2664 * schedule one last time. The schedule call will never return, and 2665 * the scheduled task must drop that reference. 2666 * 2667 * We must observe prev->state before clearing prev->on_cpu (in 2668 * finish_task), otherwise a concurrent wakeup can get prev 2669 * running on another CPU and we could race with its RUNNING -> DEAD 2670 * transition, resulting in a double drop. 2671 */ 2672 prev_state = prev->state; 2673 vtime_task_switch(prev); 2674 perf_event_task_sched_in(prev, current); 2675 finish_task(prev); 2676 finish_lock_switch(rq); 2677 finish_arch_post_lock_switch(); 2678 kcov_finish_switch(current); 2679 2680 fire_sched_in_preempt_notifiers(current); 2681 /* 2682 * When switching through a kernel thread, the loop in 2683 * membarrier_{private,global}_expedited() may have observed that 2684 * kernel thread and not issued an IPI. It is therefore possible to 2685 * schedule between user->kernel->user threads without passing through 2686 * switch_mm(). Membarrier requires a barrier after storing to 2687 * rq->curr, before returning to userspace, so provide them here: 2688 * 2689 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly 2690 * provided by mmdrop(), 2691 * - a sync_core for SYNC_CORE. 2692 */ 2693 if (mm) { 2694 membarrier_mm_sync_core_before_usermode(mm); 2695 mmdrop(mm); 2696 } 2697 if (unlikely(prev_state == TASK_DEAD)) { 2698 if (prev->sched_class->task_dead) 2699 prev->sched_class->task_dead(prev); 2700 2701 /* 2702 * Remove function-return probe instances associated with this 2703 * task and put them back on the free list. 2704 */ 2705 kprobe_flush_task(prev); 2706 2707 /* Task is done with its stack.
*/ 2708 put_task_stack(prev); 2709 2710 put_task_struct(prev); 2711 } 2712 2713 tick_nohz_task_switch(); 2714 return rq; 2715 } 2716 2717 #ifdef CONFIG_SMP 2718 2719 /* rq->lock is NOT held, but preemption is disabled */ 2720 static void __balance_callback(struct rq *rq) 2721 { 2722 struct callback_head *head, *next; 2723 void (*func)(struct rq *rq); 2724 unsigned long flags; 2725 2726 raw_spin_lock_irqsave(&rq->lock, flags); 2727 head = rq->balance_callback; 2728 rq->balance_callback = NULL; 2729 while (head) { 2730 func = (void (*)(struct rq *))head->func; 2731 next = head->next; 2732 head->next = NULL; 2733 head = next; 2734 2735 func(rq); 2736 } 2737 raw_spin_unlock_irqrestore(&rq->lock, flags); 2738 } 2739 2740 static inline void balance_callback(struct rq *rq) 2741 { 2742 if (unlikely(rq->balance_callback)) 2743 __balance_callback(rq); 2744 } 2745 2746 #else 2747 2748 static inline void balance_callback(struct rq *rq) 2749 { 2750 } 2751 2752 #endif 2753 2754 /** 2755 * schedule_tail - first thing a freshly forked thread must call. 2756 * @prev: the thread we just switched away from. 2757 */ 2758 asmlinkage __visible void schedule_tail(struct task_struct *prev) 2759 __releases(rq->lock) 2760 { 2761 struct rq *rq; 2762 2763 /* 2764 * New tasks start with FORK_PREEMPT_COUNT, see there and 2765 * finish_task_switch() for details. 2766 * 2767 * finish_task_switch() will drop rq->lock() and lower preempt_count 2768 * and the preempt_enable() will end up enabling preemption (on 2769 * PREEMPT_COUNT kernels). 2770 */ 2771 2772 rq = finish_task_switch(prev); 2773 balance_callback(rq); 2774 preempt_enable(); 2775 2776 if (current->set_child_tid) 2777 put_user(task_pid_vnr(current), current->set_child_tid); 2778 2779 calculate_sigpending(); 2780 } 2781 2782 /* 2783 * context_switch - switch to the new MM and the new thread's register state. 2784 */ 2785 static __always_inline struct rq * 2786 context_switch(struct rq *rq, struct task_struct *prev, 2787 struct task_struct *next, struct rq_flags *rf) 2788 { 2789 struct mm_struct *mm, *oldmm; 2790 2791 prepare_task_switch(rq, prev, next); 2792 2793 mm = next->mm; 2794 oldmm = prev->active_mm; 2795 /* 2796 * For paravirt, this is coupled with an exit in switch_to to 2797 * combine the page table reload and the switch backend into 2798 * one hypercall. 2799 */ 2800 arch_start_context_switch(prev); 2801 2802 /* 2803 * If mm is non-NULL, we pass through switch_mm(). If mm is 2804 * NULL, we will pass through mmdrop() in finish_task_switch(). 2805 * Both of these contain the full memory barrier required by 2806 * membarrier after storing to rq->curr, before returning to 2807 * user-space. 2808 */ 2809 if (!mm) { 2810 next->active_mm = oldmm; 2811 mmgrab(oldmm); 2812 enter_lazy_tlb(oldmm, next); 2813 } else 2814 switch_mm_irqs_off(oldmm, mm, next); 2815 2816 if (!prev->mm) { 2817 prev->active_mm = NULL; 2818 rq->prev_mm = oldmm; 2819 } 2820 2821 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); 2822 2823 prepare_lock_switch(rq, next, rf); 2824 2825 /* Here we just switch the register state and the stack. */ 2826 switch_to(prev, next, prev); 2827 barrier(); 2828 2829 return finish_task_switch(prev); 2830 } 2831 2832 /* 2833 * nr_running and nr_context_switches: 2834 * 2835 * externally visible scheduler statistics: current number of runnable 2836 * threads, total number of context switches performed since bootup. 
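 * Note the different iteration scopes below: nr_running() sums over online
 * CPUs only, whereas nr_context_switches() walks every possible CPU so that
 * switches performed on CPUs which have since gone offline remain counted.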
2837 */ 2838 unsigned long nr_running(void) 2839 { 2840 unsigned long i, sum = 0; 2841 2842 for_each_online_cpu(i) 2843 sum += cpu_rq(i)->nr_running; 2844 2845 return sum; 2846 } 2847 2848 /* 2849 * Check if only the current task is running on the CPU. 2850 * 2851 * Caution: this function does not check that the caller has disabled 2852 * preemption, thus the result might have a time-of-check-to-time-of-use 2853 * race. The caller is responsible to use it correctly, for example: 2854 * 2855 * - from a non-preemptable section (of course) 2856 * 2857 * - from a thread that is bound to a single CPU 2858 * 2859 * - in a loop with very short iterations (e.g. a polling loop) 2860 */ 2861 bool single_task_running(void) 2862 { 2863 return raw_rq()->nr_running == 1; 2864 } 2865 EXPORT_SYMBOL(single_task_running); 2866 2867 unsigned long long nr_context_switches(void) 2868 { 2869 int i; 2870 unsigned long long sum = 0; 2871 2872 for_each_possible_cpu(i) 2873 sum += cpu_rq(i)->nr_switches; 2874 2875 return sum; 2876 } 2877 2878 /* 2879 * IO-wait accounting, and how its mostly bollocks (on SMP). 2880 * 2881 * The idea behind IO-wait account is to account the idle time that we could 2882 * have spend running if it were not for IO. That is, if we were to improve the 2883 * storage performance, we'd have a proportional reduction in IO-wait time. 2884 * 2885 * This all works nicely on UP, where, when a task blocks on IO, we account 2886 * idle time as IO-wait, because if the storage were faster, it could've been 2887 * running and we'd not be idle. 2888 * 2889 * This has been extended to SMP, by doing the same for each CPU. This however 2890 * is broken. 2891 * 2892 * Imagine for instance the case where two tasks block on one CPU, only the one 2893 * CPU will have IO-wait accounted, while the other has regular idle. Even 2894 * though, if the storage were faster, both could've ran at the same time, 2895 * utilising both CPUs. 2896 * 2897 * This means, that when looking globally, the current IO-wait accounting on 2898 * SMP is a lower bound, by reason of under accounting. 2899 * 2900 * Worse, since the numbers are provided per CPU, they are sometimes 2901 * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly 2902 * associated with any one particular CPU, it can wake to another CPU than it 2903 * blocked on. This means the per CPU IO-wait number is meaningless. 2904 * 2905 * Task CPU affinities can make all that even more 'interesting'. 2906 */ 2907 2908 unsigned long nr_iowait(void) 2909 { 2910 unsigned long i, sum = 0; 2911 2912 for_each_possible_cpu(i) 2913 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2914 2915 return sum; 2916 } 2917 2918 /* 2919 * Consumers of these two interfaces, like for example the cpuidle menu 2920 * governor, are using nonsensical data. Preferring shallow idle state selection 2921 * for a CPU that has IO-wait which might not even end up running the task when 2922 * it does become runnable. 2923 */ 2924 2925 unsigned long nr_iowait_cpu(int cpu) 2926 { 2927 struct rq *this = cpu_rq(cpu); 2928 return atomic_read(&this->nr_iowait); 2929 } 2930 2931 void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) 2932 { 2933 struct rq *rq = this_rq(); 2934 *nr_waiters = atomic_read(&rq->nr_iowait); 2935 *load = rq->load.weight; 2936 } 2937 2938 #ifdef CONFIG_SMP 2939 2940 /* 2941 * sched_exec - execve() is a valuable balancing opportunity, because at 2942 * this point the task has the smallest effective memory and cache footprint. 
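 * The move itself is delegated to the stopper machinery: ->select_task_rq()
 * with SD_BALANCE_EXEC picks a destination and, if it differs from the
 * current CPU, stop_one_cpu() runs migration_cpu_stop() to carry the task
 * over.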
2943 */ 2944 void sched_exec(void) 2945 { 2946 struct task_struct *p = current; 2947 unsigned long flags; 2948 int dest_cpu; 2949 2950 raw_spin_lock_irqsave(&p->pi_lock, flags); 2951 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 2952 if (dest_cpu == smp_processor_id()) 2953 goto unlock; 2954 2955 if (likely(cpu_active(dest_cpu))) { 2956 struct migration_arg arg = { p, dest_cpu }; 2957 2958 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2959 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 2960 return; 2961 } 2962 unlock: 2963 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2964 } 2965 2966 #endif 2967 2968 DEFINE_PER_CPU(struct kernel_stat, kstat); 2969 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); 2970 2971 EXPORT_PER_CPU_SYMBOL(kstat); 2972 EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 2973 2974 /* 2975 * The function fair_sched_class.update_curr accesses the struct curr 2976 * and its field curr->exec_start; when called from task_sched_runtime(), 2977 * we observe a high rate of cache misses in practice. 2978 * Prefetching this data results in improved performance. 2979 */ 2980 static inline void prefetch_curr_exec_start(struct task_struct *p) 2981 { 2982 #ifdef CONFIG_FAIR_GROUP_SCHED 2983 struct sched_entity *curr = (&p->se)->cfs_rq->curr; 2984 #else 2985 struct sched_entity *curr = (&task_rq(p)->cfs)->curr; 2986 #endif 2987 prefetch(curr); 2988 prefetch(&curr->exec_start); 2989 } 2990 2991 /* 2992 * Return accounted runtime for the task. 2993 * In case the task is currently running, return the runtime plus current's 2994 * pending runtime that have not been accounted yet. 2995 */ 2996 unsigned long long task_sched_runtime(struct task_struct *p) 2997 { 2998 struct rq_flags rf; 2999 struct rq *rq; 3000 u64 ns; 3001 3002 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 3003 /* 3004 * 64-bit doesn't need locks to atomically read a 64-bit value. 3005 * So we have a optimization chance when the task's delta_exec is 0. 3006 * Reading ->on_cpu is racy, but this is ok. 3007 * 3008 * If we race with it leaving CPU, we'll take a lock. So we're correct. 3009 * If we race with it entering CPU, unaccounted time is 0. This is 3010 * indistinguishable from the read occurring a few cycles earlier. 3011 * If we see ->on_cpu without ->on_rq, the task is leaving, and has 3012 * been accounted, so we're correct here as well. 3013 */ 3014 if (!p->on_cpu || !task_on_rq_queued(p)) 3015 return p->se.sum_exec_runtime; 3016 #endif 3017 3018 rq = task_rq_lock(p, &rf); 3019 /* 3020 * Must be ->curr _and_ ->on_rq. If dequeued, we would 3021 * project cycles that may never be accounted to this 3022 * thread, breaking clock_gettime(). 3023 */ 3024 if (task_current(rq, p) && task_on_rq_queued(p)) { 3025 prefetch_curr_exec_start(p); 3026 update_rq_clock(rq); 3027 p->sched_class->update_curr(rq); 3028 } 3029 ns = p->se.sum_exec_runtime; 3030 task_rq_unlock(rq, p, &rf); 3031 3032 return ns; 3033 } 3034 3035 /* 3036 * This function gets called by the timer code, with HZ frequency. 3037 * We call it with interrupts disabled. 
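 * Under rq->lock it advances the rq clock, calls the running class's
 * ->task_tick() hook and refreshes the CPU and global load statistics;
 * load balancing is only triggered from here (trigger_load_balance()),
 * not performed synchronously.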
3038 */ 3039 void scheduler_tick(void) 3040 { 3041 int cpu = smp_processor_id(); 3042 struct rq *rq = cpu_rq(cpu); 3043 struct task_struct *curr = rq->curr; 3044 struct rq_flags rf; 3045 3046 sched_clock_tick(); 3047 3048 rq_lock(rq, &rf); 3049 3050 update_rq_clock(rq); 3051 curr->sched_class->task_tick(rq, curr, 0); 3052 cpu_load_update_active(rq); 3053 calc_global_load_tick(rq); 3054 3055 rq_unlock(rq, &rf); 3056 3057 perf_event_task_tick(); 3058 3059 #ifdef CONFIG_SMP 3060 rq->idle_balance = idle_cpu(cpu); 3061 trigger_load_balance(rq); 3062 #endif 3063 } 3064 3065 #ifdef CONFIG_NO_HZ_FULL 3066 3067 struct tick_work { 3068 int cpu; 3069 struct delayed_work work; 3070 }; 3071 3072 static struct tick_work __percpu *tick_work_cpu; 3073 3074 static void sched_tick_remote(struct work_struct *work) 3075 { 3076 struct delayed_work *dwork = to_delayed_work(work); 3077 struct tick_work *twork = container_of(dwork, struct tick_work, work); 3078 int cpu = twork->cpu; 3079 struct rq *rq = cpu_rq(cpu); 3080 struct task_struct *curr; 3081 struct rq_flags rf; 3082 u64 delta; 3083 3084 /* 3085 * Handle the tick only if it appears the remote CPU is running in full 3086 * dynticks mode. The check is racy by nature, but missing a tick or 3087 * having one too much is no big deal because the scheduler tick updates 3088 * statistics and checks timeslices in a time-independent way, regardless 3089 * of when exactly it is running. 3090 */ 3091 if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) 3092 goto out_requeue; 3093 3094 rq_lock_irq(rq, &rf); 3095 curr = rq->curr; 3096 if (is_idle_task(curr)) 3097 goto out_unlock; 3098 3099 update_rq_clock(rq); 3100 delta = rq_clock_task(rq) - curr->se.exec_start; 3101 3102 /* 3103 * Make sure the next tick runs within a reasonable 3104 * amount of time. 3105 */ 3106 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); 3107 curr->sched_class->task_tick(rq, curr, 0); 3108 3109 out_unlock: 3110 rq_unlock_irq(rq, &rf); 3111 3112 out_requeue: 3113 /* 3114 * Run the remote tick once per second (1Hz). This arbitrary 3115 * frequency is large enough to avoid overload but short enough 3116 * to keep scheduler internal stats reasonably up to date. 
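 * (A delay of HZ jiffies is exactly one second, hence the value passed to
 * queue_delayed_work() below.)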
3117 */ 3118 queue_delayed_work(system_unbound_wq, dwork, HZ); 3119 } 3120 3121 static void sched_tick_start(int cpu) 3122 { 3123 struct tick_work *twork; 3124 3125 if (housekeeping_cpu(cpu, HK_FLAG_TICK)) 3126 return; 3127 3128 WARN_ON_ONCE(!tick_work_cpu); 3129 3130 twork = per_cpu_ptr(tick_work_cpu, cpu); 3131 twork->cpu = cpu; 3132 INIT_DELAYED_WORK(&twork->work, sched_tick_remote); 3133 queue_delayed_work(system_unbound_wq, &twork->work, HZ); 3134 } 3135 3136 #ifdef CONFIG_HOTPLUG_CPU 3137 static void sched_tick_stop(int cpu) 3138 { 3139 struct tick_work *twork; 3140 3141 if (housekeeping_cpu(cpu, HK_FLAG_TICK)) 3142 return; 3143 3144 WARN_ON_ONCE(!tick_work_cpu); 3145 3146 twork = per_cpu_ptr(tick_work_cpu, cpu); 3147 cancel_delayed_work_sync(&twork->work); 3148 } 3149 #endif /* CONFIG_HOTPLUG_CPU */ 3150 3151 int __init sched_tick_offload_init(void) 3152 { 3153 tick_work_cpu = alloc_percpu(struct tick_work); 3154 BUG_ON(!tick_work_cpu); 3155 3156 return 0; 3157 } 3158 3159 #else /* !CONFIG_NO_HZ_FULL */ 3160 static inline void sched_tick_start(int cpu) { } 3161 static inline void sched_tick_stop(int cpu) { } 3162 #endif 3163 3164 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 3165 defined(CONFIG_TRACE_PREEMPT_TOGGLE)) 3166 /* 3167 * If the value passed in is equal to the current preempt count 3168 * then we just disabled preemption. Start timing the latency. 3169 */ 3170 static inline void preempt_latency_start(int val) 3171 { 3172 if (preempt_count() == val) { 3173 unsigned long ip = get_lock_parent_ip(); 3174 #ifdef CONFIG_DEBUG_PREEMPT 3175 current->preempt_disable_ip = ip; 3176 #endif 3177 trace_preempt_off(CALLER_ADDR0, ip); 3178 } 3179 } 3180 3181 void preempt_count_add(int val) 3182 { 3183 #ifdef CONFIG_DEBUG_PREEMPT 3184 /* 3185 * Underflow? 3186 */ 3187 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 3188 return; 3189 #endif 3190 __preempt_count_add(val); 3191 #ifdef CONFIG_DEBUG_PREEMPT 3192 /* 3193 * Spinlock count overflowing soon? 3194 */ 3195 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 3196 PREEMPT_MASK - 10); 3197 #endif 3198 preempt_latency_start(val); 3199 } 3200 EXPORT_SYMBOL(preempt_count_add); 3201 NOKPROBE_SYMBOL(preempt_count_add); 3202 3203 /* 3204 * If the value passed in equals to the current preempt count 3205 * then we just enabled preemption. Stop timing the latency. 3206 */ 3207 static inline void preempt_latency_stop(int val) 3208 { 3209 if (preempt_count() == val) 3210 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); 3211 } 3212 3213 void preempt_count_sub(int val) 3214 { 3215 #ifdef CONFIG_DEBUG_PREEMPT 3216 /* 3217 * Underflow? 3218 */ 3219 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 3220 return; 3221 /* 3222 * Is the spinlock portion underflowing? 
3223 */ 3224 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 3225 !(preempt_count() & PREEMPT_MASK))) 3226 return; 3227 #endif 3228 3229 preempt_latency_stop(val); 3230 __preempt_count_sub(val); 3231 } 3232 EXPORT_SYMBOL(preempt_count_sub); 3233 NOKPROBE_SYMBOL(preempt_count_sub); 3234 3235 #else 3236 static inline void preempt_latency_start(int val) { } 3237 static inline void preempt_latency_stop(int val) { } 3238 #endif 3239 3240 static inline unsigned long get_preempt_disable_ip(struct task_struct *p) 3241 { 3242 #ifdef CONFIG_DEBUG_PREEMPT 3243 return p->preempt_disable_ip; 3244 #else 3245 return 0; 3246 #endif 3247 } 3248 3249 /* 3250 * Print scheduling while atomic bug: 3251 */ 3252 static noinline void __schedule_bug(struct task_struct *prev) 3253 { 3254 /* Save this before calling printk(), since that will clobber it */ 3255 unsigned long preempt_disable_ip = get_preempt_disable_ip(current); 3256 3257 if (oops_in_progress) 3258 return; 3259 3260 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3261 prev->comm, prev->pid, preempt_count()); 3262 3263 debug_show_held_locks(prev); 3264 print_modules(); 3265 if (irqs_disabled()) 3266 print_irqtrace_events(prev); 3267 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) 3268 && in_atomic_preempt_off()) { 3269 pr_err("Preemption disabled at:"); 3270 print_ip_sym(preempt_disable_ip); 3271 pr_cont("\n"); 3272 } 3273 if (panic_on_warn) 3274 panic("scheduling while atomic\n"); 3275 3276 dump_stack(); 3277 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 3278 } 3279 3280 /* 3281 * Various schedule()-time debugging checks and statistics: 3282 */ 3283 static inline void schedule_debug(struct task_struct *prev) 3284 { 3285 #ifdef CONFIG_SCHED_STACK_END_CHECK 3286 if (task_stack_end_corrupted(prev)) 3287 panic("corrupted stack end detected inside scheduler\n"); 3288 #endif 3289 3290 if (unlikely(in_atomic_preempt_off())) { 3291 __schedule_bug(prev); 3292 preempt_count_set(PREEMPT_DISABLED); 3293 } 3294 rcu_sleep_check(); 3295 3296 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3297 3298 schedstat_inc(this_rq()->sched_count); 3299 } 3300 3301 /* 3302 * Pick up the highest-prio task: 3303 */ 3304 static inline struct task_struct * 3305 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 3306 { 3307 const struct sched_class *class; 3308 struct task_struct *p; 3309 3310 /* 3311 * Optimization: we know that if all tasks are in the fair class we can 3312 * call that function directly, but only if the @prev task wasn't of a 3313 * higher scheduling class, because otherwise those loose the 3314 * opportunity to pull in more work from other CPUs. 3315 */ 3316 if (likely((prev->sched_class == &idle_sched_class || 3317 prev->sched_class == &fair_sched_class) && 3318 rq->nr_running == rq->cfs.h_nr_running)) { 3319 3320 p = fair_sched_class.pick_next_task(rq, prev, rf); 3321 if (unlikely(p == RETRY_TASK)) 3322 goto again; 3323 3324 /* Assumes fair_sched_class->next == idle_sched_class */ 3325 if (unlikely(!p)) 3326 p = idle_sched_class.pick_next_task(rq, prev, rf); 3327 3328 return p; 3329 } 3330 3331 again: 3332 for_each_class(class) { 3333 p = class->pick_next_task(rq, prev, rf); 3334 if (p) { 3335 if (unlikely(p == RETRY_TASK)) 3336 goto again; 3337 return p; 3338 } 3339 } 3340 3341 /* The idle class should always have a runnable task: */ 3342 BUG(); 3343 } 3344 3345 /* 3346 * __schedule() is the main scheduler function. 3347 * 3348 * The main means of driving the scheduler and thus entering this function are: 3349 * 3350 * 1. 
Explicit blocking: mutex, semaphore, waitqueue, etc. 3351 * 3352 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return 3353 * paths. For example, see arch/x86/entry_64.S. 3354 * 3355 * To drive preemption between tasks, the scheduler sets the flag in timer 3356 * interrupt handler scheduler_tick(). 3357 * 3358 * 3. Wakeups don't really cause entry into schedule(). They add a 3359 * task to the run-queue and that's it. 3360 * 3361 * Now, if the new task added to the run-queue preempts the current 3362 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 3363 * called on the nearest possible occasion: 3364 * 3365 * - If the kernel is preemptible (CONFIG_PREEMPT=y): 3366 * 3367 * - in syscall or exception context, at the next outmost 3368 * preempt_enable(). (this might be as soon as the wake_up()'s 3369 * spin_unlock()!) 3370 * 3371 * - in IRQ context, return from interrupt-handler to 3372 * preemptible context 3373 * 3374 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) 3375 * then at the next: 3376 * 3377 * - cond_resched() call 3378 * - explicit schedule() call 3379 * - return from syscall or exception to user-space 3380 * - return from interrupt-handler to user-space 3381 * 3382 * WARNING: must be called with preemption disabled! 3383 */ 3384 static void __sched notrace __schedule(bool preempt) 3385 { 3386 struct task_struct *prev, *next; 3387 unsigned long *switch_count; 3388 struct rq_flags rf; 3389 struct rq *rq; 3390 int cpu; 3391 3392 cpu = smp_processor_id(); 3393 rq = cpu_rq(cpu); 3394 prev = rq->curr; 3395 3396 schedule_debug(prev); 3397 3398 if (sched_feat(HRTICK)) 3399 hrtick_clear(rq); 3400 3401 local_irq_disable(); 3402 rcu_note_context_switch(preempt); 3403 3404 /* 3405 * Make sure that signal_pending_state()->signal_pending() below 3406 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) 3407 * done by the caller to avoid the race with signal_wake_up(). 3408 * 3409 * The membarrier system call requires a full memory barrier 3410 * after coming from user-space, before storing to rq->curr. 3411 */ 3412 rq_lock(rq, &rf); 3413 smp_mb__after_spinlock(); 3414 3415 /* Promote REQ to ACT */ 3416 rq->clock_update_flags <<= 1; 3417 update_rq_clock(rq); 3418 3419 switch_count = &prev->nivcsw; 3420 if (!preempt && prev->state) { 3421 if (unlikely(signal_pending_state(prev->state, prev))) { 3422 prev->state = TASK_RUNNING; 3423 } else { 3424 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); 3425 prev->on_rq = 0; 3426 3427 if (prev->in_iowait) { 3428 atomic_inc(&rq->nr_iowait); 3429 delayacct_blkio_start(); 3430 } 3431 3432 /* 3433 * If a worker went to sleep, notify and ask workqueue 3434 * whether it wants to wake up a task to maintain 3435 * concurrency. 3436 */ 3437 if (prev->flags & PF_WQ_WORKER) { 3438 struct task_struct *to_wakeup; 3439 3440 to_wakeup = wq_worker_sleeping(prev); 3441 if (to_wakeup) 3442 try_to_wake_up_local(to_wakeup, &rf); 3443 } 3444 } 3445 switch_count = &prev->nvcsw; 3446 } 3447 3448 next = pick_next_task(rq, prev, &rf); 3449 clear_tsk_need_resched(prev); 3450 clear_preempt_need_resched(); 3451 3452 if (likely(prev != next)) { 3453 rq->nr_switches++; 3454 rq->curr = next; 3455 /* 3456 * The membarrier system call requires each architecture 3457 * to have a full memory barrier after updating 3458 * rq->curr, before returning to user-space. 3459 * 3460 * Here are the schemes providing that barrier on the 3461 * various architectures: 3462 * - mm ? 
switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. 3463 * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 3464 * - finish_lock_switch() for weakly-ordered 3465 * architectures where spin_unlock is a full barrier, 3466 * - switch_to() for arm64 (weakly-ordered, spin_unlock 3467 * is a RELEASE barrier), 3468 */ 3469 ++*switch_count; 3470 3471 trace_sched_switch(preempt, prev, next); 3472 3473 /* Also unlocks the rq: */ 3474 rq = context_switch(rq, prev, next, &rf); 3475 } else { 3476 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); 3477 rq_unlock_irq(rq, &rf); 3478 } 3479 3480 balance_callback(rq); 3481 } 3482 3483 void __noreturn do_task_dead(void) 3484 { 3485 /* Causes final put_task_struct in finish_task_switch(): */ 3486 set_special_state(TASK_DEAD); 3487 3488 /* Tell freezer to ignore us: */ 3489 current->flags |= PF_NOFREEZE; 3490 3491 __schedule(false); 3492 BUG(); 3493 3494 /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ 3495 for (;;) 3496 cpu_relax(); 3497 } 3498 3499 static inline void sched_submit_work(struct task_struct *tsk) 3500 { 3501 if (!tsk->state || tsk_is_pi_blocked(tsk)) 3502 return; 3503 /* 3504 * If we are going to sleep and we have plugged IO queued, 3505 * make sure to submit it to avoid deadlocks. 3506 */ 3507 if (blk_needs_flush_plug(tsk)) 3508 blk_schedule_flush_plug(tsk); 3509 } 3510 3511 asmlinkage __visible void __sched schedule(void) 3512 { 3513 struct task_struct *tsk = current; 3514 3515 sched_submit_work(tsk); 3516 do { 3517 preempt_disable(); 3518 __schedule(false); 3519 sched_preempt_enable_no_resched(); 3520 } while (need_resched()); 3521 } 3522 EXPORT_SYMBOL(schedule); 3523 3524 /* 3525 * synchronize_rcu_tasks() makes sure that no task is stuck in preempted 3526 * state (have scheduled out non-voluntarily) by making sure that all 3527 * tasks have either left the run queue or have gone into user space. 3528 * As idle tasks do not do either, they must not ever be preempted 3529 * (schedule out non-voluntarily). 3530 * 3531 * schedule_idle() is similar to schedule_preempt_disable() except that it 3532 * never enables preemption because it does not call sched_submit_work(). 3533 */ 3534 void __sched schedule_idle(void) 3535 { 3536 /* 3537 * As this skips calling sched_submit_work(), which the idle task does 3538 * regardless because that function is a nop when the task is in a 3539 * TASK_RUNNING state, make sure this isn't used someplace that the 3540 * current task can be in any other state. Note, idle is always in the 3541 * TASK_RUNNING state. 3542 */ 3543 WARN_ON_ONCE(current->state); 3544 do { 3545 __schedule(false); 3546 } while (need_resched()); 3547 } 3548 3549 #ifdef CONFIG_CONTEXT_TRACKING 3550 asmlinkage __visible void __sched schedule_user(void) 3551 { 3552 /* 3553 * If we come here after a random call to set_need_resched(), 3554 * or we have been woken up remotely but the IPI has not yet arrived, 3555 * we haven't yet exited the RCU idle mode. Do it here manually until 3556 * we find a better solution. 3557 * 3558 * NB: There are buggy callers of this function. Ideally we 3559 * should warn if prev_state != CONTEXT_USER, but that will trigger 3560 * too frequently to make sense yet. 3561 */ 3562 enum ctx_state prev_state = exception_enter(); 3563 schedule(); 3564 exception_exit(prev_state); 3565 } 3566 #endif 3567 3568 /** 3569 * schedule_preempt_disabled - called with preemption disabled 3570 * 3571 * Returns with preemption disabled. 
Note: preempt_count must be 1 3572 */ 3573 void __sched schedule_preempt_disabled(void) 3574 { 3575 sched_preempt_enable_no_resched(); 3576 schedule(); 3577 preempt_disable(); 3578 } 3579 3580 static void __sched notrace preempt_schedule_common(void) 3581 { 3582 do { 3583 /* 3584 * Because the function tracer can trace preempt_count_sub() 3585 * and it also uses preempt_enable/disable_notrace(), if 3586 * NEED_RESCHED is set, the preempt_enable_notrace() called 3587 * by the function tracer will call this function again and 3588 * cause infinite recursion. 3589 * 3590 * Preemption must be disabled here before the function 3591 * tracer can trace. Break up preempt_disable() into two 3592 * calls. One to disable preemption without fear of being 3593 * traced. The other to still record the preemption latency, 3594 * which can also be traced by the function tracer. 3595 */ 3596 preempt_disable_notrace(); 3597 preempt_latency_start(1); 3598 __schedule(true); 3599 preempt_latency_stop(1); 3600 preempt_enable_no_resched_notrace(); 3601 3602 /* 3603 * Check again in case we missed a preemption opportunity 3604 * between schedule and now. 3605 */ 3606 } while (need_resched()); 3607 } 3608 3609 #ifdef CONFIG_PREEMPT 3610 /* 3611 * this is the entry point to schedule() from in-kernel preemption 3612 * off of preempt_enable. Kernel preemptions off return from interrupt 3613 * occur there and call schedule directly. 3614 */ 3615 asmlinkage __visible void __sched notrace preempt_schedule(void) 3616 { 3617 /* 3618 * If there is a non-zero preempt_count or interrupts are disabled, 3619 * we do not want to preempt the current task. Just return.. 3620 */ 3621 if (likely(!preemptible())) 3622 return; 3623 3624 preempt_schedule_common(); 3625 } 3626 NOKPROBE_SYMBOL(preempt_schedule); 3627 EXPORT_SYMBOL(preempt_schedule); 3628 3629 /** 3630 * preempt_schedule_notrace - preempt_schedule called by tracing 3631 * 3632 * The tracing infrastructure uses preempt_enable_notrace to prevent 3633 * recursion and tracing preempt enabling caused by the tracing 3634 * infrastructure itself. But as tracing can happen in areas coming 3635 * from userspace or just about to enter userspace, a preempt enable 3636 * can occur before user_exit() is called. This will cause the scheduler 3637 * to be called when the system is still in usermode. 3638 * 3639 * To prevent this, the preempt_enable_notrace will use this function 3640 * instead of preempt_schedule() to exit user context if needed before 3641 * calling the scheduler. 3642 */ 3643 asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) 3644 { 3645 enum ctx_state prev_ctx; 3646 3647 if (likely(!preemptible())) 3648 return; 3649 3650 do { 3651 /* 3652 * Because the function tracer can trace preempt_count_sub() 3653 * and it also uses preempt_enable/disable_notrace(), if 3654 * NEED_RESCHED is set, the preempt_enable_notrace() called 3655 * by the function tracer will call this function again and 3656 * cause infinite recursion. 3657 * 3658 * Preemption must be disabled here before the function 3659 * tracer can trace. Break up preempt_disable() into two 3660 * calls. One to disable preemption without fear of being 3661 * traced. The other to still record the preemption latency, 3662 * which can also be traced by the function tracer. 3663 */ 3664 preempt_disable_notrace(); 3665 preempt_latency_start(1); 3666 /* 3667 * Needs preempt disabled in case user_exit() is traced 3668 * and the tracer calls preempt_enable_notrace() causing 3669 * an infinite recursion. 
3670 */ 3671 prev_ctx = exception_enter(); 3672 __schedule(true); 3673 exception_exit(prev_ctx); 3674 3675 preempt_latency_stop(1); 3676 preempt_enable_no_resched_notrace(); 3677 } while (need_resched()); 3678 } 3679 EXPORT_SYMBOL_GPL(preempt_schedule_notrace); 3680 3681 #endif /* CONFIG_PREEMPT */ 3682 3683 /* 3684 * this is the entry point to schedule() from kernel preemption 3685 * off of irq context. 3686 * Note, that this is called and return with irqs disabled. This will 3687 * protect us against recursive calling from irq. 3688 */ 3689 asmlinkage __visible void __sched preempt_schedule_irq(void) 3690 { 3691 enum ctx_state prev_state; 3692 3693 /* Catch callers which need to be fixed */ 3694 BUG_ON(preempt_count() || !irqs_disabled()); 3695 3696 prev_state = exception_enter(); 3697 3698 do { 3699 preempt_disable(); 3700 local_irq_enable(); 3701 __schedule(true); 3702 local_irq_disable(); 3703 sched_preempt_enable_no_resched(); 3704 } while (need_resched()); 3705 3706 exception_exit(prev_state); 3707 } 3708 3709 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, 3710 void *key) 3711 { 3712 return try_to_wake_up(curr->private, mode, wake_flags); 3713 } 3714 EXPORT_SYMBOL(default_wake_function); 3715 3716 #ifdef CONFIG_RT_MUTEXES 3717 3718 static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) 3719 { 3720 if (pi_task) 3721 prio = min(prio, pi_task->prio); 3722 3723 return prio; 3724 } 3725 3726 static inline int rt_effective_prio(struct task_struct *p, int prio) 3727 { 3728 struct task_struct *pi_task = rt_mutex_get_top_task(p); 3729 3730 return __rt_effective_prio(pi_task, prio); 3731 } 3732 3733 /* 3734 * rt_mutex_setprio - set the current priority of a task 3735 * @p: task to boost 3736 * @pi_task: donor task 3737 * 3738 * This function changes the 'effective' priority of a task. It does 3739 * not touch ->normal_prio like __setscheduler(). 3740 * 3741 * Used by the rt_mutex code to implement priority inheritance 3742 * logic. Call site only calls if the priority of the task changed. 3743 */ 3744 void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) 3745 { 3746 int prio, oldprio, queued, running, queue_flag = 3747 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 3748 const struct sched_class *prev_class; 3749 struct rq_flags rf; 3750 struct rq *rq; 3751 3752 /* XXX used to be waiter->prio, not waiter->task->prio */ 3753 prio = __rt_effective_prio(pi_task, p->normal_prio); 3754 3755 /* 3756 * If nothing changed; bail early. 3757 */ 3758 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio)) 3759 return; 3760 3761 rq = __task_rq_lock(p, &rf); 3762 update_rq_clock(rq); 3763 /* 3764 * Set under pi_lock && rq->lock, such that the value can be used under 3765 * either lock. 3766 * 3767 * Note that there is loads of tricky to make this pointer cache work 3768 * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to 3769 * ensure a task is de-boosted (pi_task is set to NULL) before the 3770 * task is allowed to run again (and can exit). This ensures the pointer 3771 * points to a blocked task -- which guaratees the task is present. 3772 */ 3773 p->pi_top_task = pi_task; 3774 3775 /* 3776 * For FIFO/RR we only need to set prio, if that matches we're done. 3777 */ 3778 if (prio == p->prio && !dl_prio(prio)) 3779 goto out_unlock; 3780 3781 /* 3782 * Idle task boosting is a nono in general. 
There is one 3783 * exception, when PREEMPT_RT and NOHZ is active: 3784 * 3785 * The idle task calls get_next_timer_interrupt() and holds 3786 * the timer wheel base->lock on the CPU and another CPU wants 3787 * to access the timer (probably to cancel it). We can safely 3788 * ignore the boosting request, as the idle CPU runs this code 3789 * with interrupts disabled and will complete the lock 3790 * protected section without being interrupted. So there is no 3791 * real need to boost. 3792 */ 3793 if (unlikely(p == rq->idle)) { 3794 WARN_ON(p != rq->curr); 3795 WARN_ON(p->pi_blocked_on); 3796 goto out_unlock; 3797 } 3798 3799 trace_sched_pi_setprio(p, pi_task); 3800 oldprio = p->prio; 3801 3802 if (oldprio == prio) 3803 queue_flag &= ~DEQUEUE_MOVE; 3804 3805 prev_class = p->sched_class; 3806 queued = task_on_rq_queued(p); 3807 running = task_current(rq, p); 3808 if (queued) 3809 dequeue_task(rq, p, queue_flag); 3810 if (running) 3811 put_prev_task(rq, p); 3812 3813 /* 3814 * Boosting condition are: 3815 * 1. -rt task is running and holds mutex A 3816 * --> -dl task blocks on mutex A 3817 * 3818 * 2. -dl task is running and holds mutex A 3819 * --> -dl task blocks on mutex A and could preempt the 3820 * running task 3821 */ 3822 if (dl_prio(prio)) { 3823 if (!dl_prio(p->normal_prio) || 3824 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { 3825 p->dl.dl_boosted = 1; 3826 queue_flag |= ENQUEUE_REPLENISH; 3827 } else 3828 p->dl.dl_boosted = 0; 3829 p->sched_class = &dl_sched_class; 3830 } else if (rt_prio(prio)) { 3831 if (dl_prio(oldprio)) 3832 p->dl.dl_boosted = 0; 3833 if (oldprio < prio) 3834 queue_flag |= ENQUEUE_HEAD; 3835 p->sched_class = &rt_sched_class; 3836 } else { 3837 if (dl_prio(oldprio)) 3838 p->dl.dl_boosted = 0; 3839 if (rt_prio(oldprio)) 3840 p->rt.timeout = 0; 3841 p->sched_class = &fair_sched_class; 3842 } 3843 3844 p->prio = prio; 3845 3846 if (queued) 3847 enqueue_task(rq, p, queue_flag); 3848 if (running) 3849 set_curr_task(rq, p); 3850 3851 check_class_changed(rq, p, prev_class, oldprio); 3852 out_unlock: 3853 /* Avoid rq from going away on us: */ 3854 preempt_disable(); 3855 __task_rq_unlock(rq, &rf); 3856 3857 balance_callback(rq); 3858 preempt_enable(); 3859 } 3860 #else 3861 static inline int rt_effective_prio(struct task_struct *p, int prio) 3862 { 3863 return prio; 3864 } 3865 #endif 3866 3867 void set_user_nice(struct task_struct *p, long nice) 3868 { 3869 bool queued, running; 3870 int old_prio, delta; 3871 struct rq_flags rf; 3872 struct rq *rq; 3873 3874 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) 3875 return; 3876 /* 3877 * We have to be careful, if called from sys_setpriority(), 3878 * the task might be in the middle of scheduling on another CPU. 
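 * Taking task_rq_lock() below serializes against that: once it returns, the
 * task's runqueue is stable and its priority fields can be updated safely.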
3879 */ 3880 rq = task_rq_lock(p, &rf); 3881 update_rq_clock(rq); 3882 3883 /* 3884 * The RT priorities are set via sched_setscheduler(), but we still 3885 * allow the 'normal' nice value to be set - but as expected 3886 * it wont have any effect on scheduling until the task is 3887 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: 3888 */ 3889 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 3890 p->static_prio = NICE_TO_PRIO(nice); 3891 goto out_unlock; 3892 } 3893 queued = task_on_rq_queued(p); 3894 running = task_current(rq, p); 3895 if (queued) 3896 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); 3897 if (running) 3898 put_prev_task(rq, p); 3899 3900 p->static_prio = NICE_TO_PRIO(nice); 3901 set_load_weight(p, true); 3902 old_prio = p->prio; 3903 p->prio = effective_prio(p); 3904 delta = p->prio - old_prio; 3905 3906 if (queued) { 3907 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); 3908 /* 3909 * If the task increased its priority or is running and 3910 * lowered its priority, then reschedule its CPU: 3911 */ 3912 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3913 resched_curr(rq); 3914 } 3915 if (running) 3916 set_curr_task(rq, p); 3917 out_unlock: 3918 task_rq_unlock(rq, p, &rf); 3919 } 3920 EXPORT_SYMBOL(set_user_nice); 3921 3922 /* 3923 * can_nice - check if a task can reduce its nice value 3924 * @p: task 3925 * @nice: nice value 3926 */ 3927 int can_nice(const struct task_struct *p, const int nice) 3928 { 3929 /* Convert nice value [19,-20] to rlimit style value [1,40]: */ 3930 int nice_rlim = nice_to_rlimit(nice); 3931 3932 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3933 capable(CAP_SYS_NICE)); 3934 } 3935 3936 #ifdef __ARCH_WANT_SYS_NICE 3937 3938 /* 3939 * sys_nice - change the priority of the current process. 3940 * @increment: priority increment 3941 * 3942 * sys_setpriority is a more generic, but much slower function that 3943 * does similar things. 3944 */ 3945 SYSCALL_DEFINE1(nice, int, increment) 3946 { 3947 long nice, retval; 3948 3949 /* 3950 * Setpriority might change our priority at the same moment. 3951 * We don't have to worry. Conceptually one call occurs first 3952 * and we have a single winner. 3953 */ 3954 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); 3955 nice = task_nice(current) + increment; 3956 3957 nice = clamp_val(nice, MIN_NICE, MAX_NICE); 3958 if (increment < 0 && !can_nice(current, nice)) 3959 return -EPERM; 3960 3961 retval = security_task_setnice(current, nice); 3962 if (retval) 3963 return retval; 3964 3965 set_user_nice(current, nice); 3966 return 0; 3967 } 3968 3969 #endif 3970 3971 /** 3972 * task_prio - return the priority value of a given task. 3973 * @p: the task in question. 3974 * 3975 * Return: The priority value as seen by users in /proc. 3976 * RT tasks are offset by -200. Normal tasks are centered 3977 * around 0, value goes from -16 to +15. 3978 */ 3979 int task_prio(const struct task_struct *p) 3980 { 3981 return p->prio - MAX_RT_PRIO; 3982 } 3983 3984 /** 3985 * idle_cpu - is a given CPU idle currently? 3986 * @cpu: the processor in question. 3987 * 3988 * Return: 1 if the CPU is currently idle. 0 otherwise. 3989 */ 3990 int idle_cpu(int cpu) 3991 { 3992 struct rq *rq = cpu_rq(cpu); 3993 3994 if (rq->curr != rq->idle) 3995 return 0; 3996 3997 if (rq->nr_running) 3998 return 0; 3999 4000 #ifdef CONFIG_SMP 4001 if (!llist_empty(&rq->wake_list)) 4002 return 0; 4003 #endif 4004 4005 return 1; 4006 } 4007 4008 /** 4009 * available_idle_cpu - is a given CPU idle for enqueuing work. 
4010 * @cpu: the CPU in question. 4011 * 4012 * Return: 1 if the CPU is currently idle. 0 otherwise. 4013 */ 4014 int available_idle_cpu(int cpu) 4015 { 4016 if (!idle_cpu(cpu)) 4017 return 0; 4018 4019 if (vcpu_is_preempted(cpu)) 4020 return 0; 4021 4022 return 1; 4023 } 4024 4025 /** 4026 * idle_task - return the idle task for a given CPU. 4027 * @cpu: the processor in question. 4028 * 4029 * Return: The idle task for the CPU @cpu. 4030 */ 4031 struct task_struct *idle_task(int cpu) 4032 { 4033 return cpu_rq(cpu)->idle; 4034 } 4035 4036 /** 4037 * find_process_by_pid - find a process with a matching PID value. 4038 * @pid: the pid in question. 4039 * 4040 * The task of @pid, if found. %NULL otherwise. 4041 */ 4042 static struct task_struct *find_process_by_pid(pid_t pid) 4043 { 4044 return pid ? find_task_by_vpid(pid) : current; 4045 } 4046 4047 /* 4048 * sched_setparam() passes in -1 for its policy, to let the functions 4049 * it calls know not to change it. 4050 */ 4051 #define SETPARAM_POLICY -1 4052 4053 static void __setscheduler_params(struct task_struct *p, 4054 const struct sched_attr *attr) 4055 { 4056 int policy = attr->sched_policy; 4057 4058 if (policy == SETPARAM_POLICY) 4059 policy = p->policy; 4060 4061 p->policy = policy; 4062 4063 if (dl_policy(policy)) 4064 __setparam_dl(p, attr); 4065 else if (fair_policy(policy)) 4066 p->static_prio = NICE_TO_PRIO(attr->sched_nice); 4067 4068 /* 4069 * __sched_setscheduler() ensures attr->sched_priority == 0 when 4070 * !rt_policy. Always setting this ensures that things like 4071 * getparam()/getattr() don't report silly values for !rt tasks. 4072 */ 4073 p->rt_priority = attr->sched_priority; 4074 p->normal_prio = normal_prio(p); 4075 set_load_weight(p, true); 4076 } 4077 4078 /* Actually do priority change: must hold pi & rq lock. */ 4079 static void __setscheduler(struct rq *rq, struct task_struct *p, 4080 const struct sched_attr *attr, bool keep_boost) 4081 { 4082 __setscheduler_params(p, attr); 4083 4084 /* 4085 * Keep a potential priority boosting if called from 4086 * sched_setscheduler(). 4087 */ 4088 p->prio = normal_prio(p); 4089 if (keep_boost) 4090 p->prio = rt_effective_prio(p, p->prio); 4091 4092 if (dl_prio(p->prio)) 4093 p->sched_class = &dl_sched_class; 4094 else if (rt_prio(p->prio)) 4095 p->sched_class = &rt_sched_class; 4096 else 4097 p->sched_class = &fair_sched_class; 4098 } 4099 4100 /* 4101 * Check the target process has a UID that matches the current process's: 4102 */ 4103 static bool check_same_owner(struct task_struct *p) 4104 { 4105 const struct cred *cred = current_cred(), *pcred; 4106 bool match; 4107 4108 rcu_read_lock(); 4109 pcred = __task_cred(p); 4110 match = (uid_eq(cred->euid, pcred->euid) || 4111 uid_eq(cred->euid, pcred->uid)); 4112 rcu_read_unlock(); 4113 return match; 4114 } 4115 4116 static int __sched_setscheduler(struct task_struct *p, 4117 const struct sched_attr *attr, 4118 bool user, bool pi) 4119 { 4120 int newprio = dl_policy(attr->sched_policy) ? 
MAX_DL_PRIO - 1 : 4121 MAX_RT_PRIO - 1 - attr->sched_priority; 4122 int retval, oldprio, oldpolicy = -1, queued, running; 4123 int new_effective_prio, policy = attr->sched_policy; 4124 const struct sched_class *prev_class; 4125 struct rq_flags rf; 4126 int reset_on_fork; 4127 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 4128 struct rq *rq; 4129 4130 /* The pi code expects interrupts enabled */ 4131 BUG_ON(pi && in_interrupt()); 4132 recheck: 4133 /* Double check policy once rq lock held: */ 4134 if (policy < 0) { 4135 reset_on_fork = p->sched_reset_on_fork; 4136 policy = oldpolicy = p->policy; 4137 } else { 4138 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); 4139 4140 if (!valid_policy(policy)) 4141 return -EINVAL; 4142 } 4143 4144 if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) 4145 return -EINVAL; 4146 4147 /* 4148 * Valid priorities for SCHED_FIFO and SCHED_RR are 4149 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 4150 * SCHED_BATCH and SCHED_IDLE is 0. 4151 */ 4152 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || 4153 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) 4154 return -EINVAL; 4155 if ((dl_policy(policy) && !__checkparam_dl(attr)) || 4156 (rt_policy(policy) != (attr->sched_priority != 0))) 4157 return -EINVAL; 4158 4159 /* 4160 * Allow unprivileged RT tasks to decrease priority: 4161 */ 4162 if (user && !capable(CAP_SYS_NICE)) { 4163 if (fair_policy(policy)) { 4164 if (attr->sched_nice < task_nice(p) && 4165 !can_nice(p, attr->sched_nice)) 4166 return -EPERM; 4167 } 4168 4169 if (rt_policy(policy)) { 4170 unsigned long rlim_rtprio = 4171 task_rlimit(p, RLIMIT_RTPRIO); 4172 4173 /* Can't set/change the rt policy: */ 4174 if (policy != p->policy && !rlim_rtprio) 4175 return -EPERM; 4176 4177 /* Can't increase priority: */ 4178 if (attr->sched_priority > p->rt_priority && 4179 attr->sched_priority > rlim_rtprio) 4180 return -EPERM; 4181 } 4182 4183 /* 4184 * Can't set/change SCHED_DEADLINE policy at all for now 4185 * (safest behavior); in the future we would like to allow 4186 * unprivileged DL tasks to increase their relative deadline 4187 * or reduce their runtime (both ways reducing utilization) 4188 */ 4189 if (dl_policy(policy)) 4190 return -EPERM; 4191 4192 /* 4193 * Treat SCHED_IDLE as nice 20. Only allow a switch to 4194 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 4195 */ 4196 if (idle_policy(p->policy) && !idle_policy(policy)) { 4197 if (!can_nice(p, task_nice(p))) 4198 return -EPERM; 4199 } 4200 4201 /* Can't change other user's priorities: */ 4202 if (!check_same_owner(p)) 4203 return -EPERM; 4204 4205 /* Normal users shall not reset the sched_reset_on_fork flag: */ 4206 if (p->sched_reset_on_fork && !reset_on_fork) 4207 return -EPERM; 4208 } 4209 4210 if (user) { 4211 if (attr->sched_flags & SCHED_FLAG_SUGOV) 4212 return -EINVAL; 4213 4214 retval = security_task_setscheduler(p); 4215 if (retval) 4216 return retval; 4217 } 4218 4219 /* 4220 * Make sure no PI-waiters arrive (or leave) while we are 4221 * changing the priority of the task: 4222 * 4223 * To be able to change p->policy safely, the appropriate 4224 * runqueue lock must be held. 
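 *
 * A minimal sketch of the locking pattern used below (illustrative
 * only; the real code also handles all the error unwinding):
 *
 *	rq = task_rq_lock(p, &rf);	// p->pi_lock + rq->lock held
 *	update_rq_clock(rq);
 *	... validate attr, dequeue/requeue p and switch its sched_class ...
 *	task_rq_unlock(rq, p, &rf);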
4225 */ 4226 rq = task_rq_lock(p, &rf); 4227 update_rq_clock(rq); 4228 4229 /* 4230 * Changing the policy of the stop threads is a very bad idea: 4231 */ 4232 if (p == rq->stop) { 4233 task_rq_unlock(rq, p, &rf); 4234 return -EINVAL; 4235 } 4236 4237 /* 4238 * If not changing anything there's no need to proceed further, 4239 * but store a possible modification of reset_on_fork. 4240 */ 4241 if (unlikely(policy == p->policy)) { 4242 if (fair_policy(policy) && attr->sched_nice != task_nice(p)) 4243 goto change; 4244 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 4245 goto change; 4246 if (dl_policy(policy) && dl_param_changed(p, attr)) 4247 goto change; 4248 4249 p->sched_reset_on_fork = reset_on_fork; 4250 task_rq_unlock(rq, p, &rf); 4251 return 0; 4252 } 4253 change: 4254 4255 if (user) { 4256 #ifdef CONFIG_RT_GROUP_SCHED 4257 /* 4258 * Do not allow realtime tasks into groups that have no runtime 4259 * assigned. 4260 */ 4261 if (rt_bandwidth_enabled() && rt_policy(policy) && 4262 task_group(p)->rt_bandwidth.rt_runtime == 0 && 4263 !task_group_is_autogroup(task_group(p))) { 4264 task_rq_unlock(rq, p, &rf); 4265 return -EPERM; 4266 } 4267 #endif 4268 #ifdef CONFIG_SMP 4269 if (dl_bandwidth_enabled() && dl_policy(policy) && 4270 !(attr->sched_flags & SCHED_FLAG_SUGOV)) { 4271 cpumask_t *span = rq->rd->span; 4272 4273 /* 4274 * Don't allow tasks with an affinity mask smaller than 4275 * the entire root_domain to become SCHED_DEADLINE. We 4276 * will also fail if there's no bandwidth available. 4277 */ 4278 if (!cpumask_subset(span, &p->cpus_allowed) || 4279 rq->rd->dl_bw.bw == 0) { 4280 task_rq_unlock(rq, p, &rf); 4281 return -EPERM; 4282 } 4283 } 4284 #endif 4285 } 4286 4287 /* Re-check policy now with rq lock held: */ 4288 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4289 policy = oldpolicy = -1; 4290 task_rq_unlock(rq, p, &rf); 4291 goto recheck; 4292 } 4293 4294 /* 4295 * If setscheduling to SCHED_DEADLINE (or changing the parameters 4296 * of a SCHED_DEADLINE task) we need to check if enough bandwidth 4297 * is available. 4298 */ 4299 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { 4300 task_rq_unlock(rq, p, &rf); 4301 return -EBUSY; 4302 } 4303 4304 p->sched_reset_on_fork = reset_on_fork; 4305 oldprio = p->prio; 4306 4307 if (pi) { 4308 /* 4309 * Take priority boosted tasks into account. If the new 4310 * effective priority is unchanged, we just store the new 4311 * normal parameters and do not touch the scheduler class and 4312 * the runqueue. This will be done when the task deboosts 4313 * itself. 4314 */ 4315 new_effective_prio = rt_effective_prio(p, newprio); 4316 if (new_effective_prio == oldprio) 4317 queue_flags &= ~DEQUEUE_MOVE; 4318 } 4319 4320 queued = task_on_rq_queued(p); 4321 running = task_current(rq, p); 4322 if (queued) 4323 dequeue_task(rq, p, queue_flags); 4324 if (running) 4325 put_prev_task(rq, p); 4326 4327 prev_class = p->sched_class; 4328 __setscheduler(rq, p, attr, pi); 4329 4330 if (queued) { 4331 /* 4332 * We enqueue to tail when the priority of a task is 4333 * increased (user space view).
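 *
 * (A larger ->prio value means a lower priority, so the
 * "oldprio < p->prio" test below is the priority-lowered case and
 * re-queues @p at the head; the default tail enqueue covers the
 * case where the priority was raised.)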
4334 */ 4335 if (oldprio < p->prio) 4336 queue_flags |= ENQUEUE_HEAD; 4337 4338 enqueue_task(rq, p, queue_flags); 4339 } 4340 if (running) 4341 set_curr_task(rq, p); 4342 4343 check_class_changed(rq, p, prev_class, oldprio); 4344 4345 /* Avoid rq from going away on us: */ 4346 preempt_disable(); 4347 task_rq_unlock(rq, p, &rf); 4348 4349 if (pi) 4350 rt_mutex_adjust_pi(p); 4351 4352 /* Run balance callbacks after we've adjusted the PI chain: */ 4353 balance_callback(rq); 4354 preempt_enable(); 4355 4356 return 0; 4357 } 4358 4359 static int _sched_setscheduler(struct task_struct *p, int policy, 4360 const struct sched_param *param, bool check) 4361 { 4362 struct sched_attr attr = { 4363 .sched_policy = policy, 4364 .sched_priority = param->sched_priority, 4365 .sched_nice = PRIO_TO_NICE(p->static_prio), 4366 }; 4367 4368 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ 4369 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { 4370 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 4371 policy &= ~SCHED_RESET_ON_FORK; 4372 attr.sched_policy = policy; 4373 } 4374 4375 return __sched_setscheduler(p, &attr, check, true); 4376 } 4377 /** 4378 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4379 * @p: the task in question. 4380 * @policy: new policy. 4381 * @param: structure containing the new RT priority. 4382 * 4383 * Return: 0 on success. An error code otherwise. 4384 * 4385 * NOTE that the task may be already dead. 4386 */ 4387 int sched_setscheduler(struct task_struct *p, int policy, 4388 const struct sched_param *param) 4389 { 4390 return _sched_setscheduler(p, policy, param, true); 4391 } 4392 EXPORT_SYMBOL_GPL(sched_setscheduler); 4393 4394 int sched_setattr(struct task_struct *p, const struct sched_attr *attr) 4395 { 4396 return __sched_setscheduler(p, attr, true, true); 4397 } 4398 EXPORT_SYMBOL_GPL(sched_setattr); 4399 4400 int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) 4401 { 4402 return __sched_setscheduler(p, attr, false, true); 4403 } 4404 4405 /** 4406 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 4407 * @p: the task in question. 4408 * @policy: new policy. 4409 * @param: structure containing the new RT priority. 4410 * 4411 * Just like sched_setscheduler, only don't bother checking if the 4412 * current context has permission. For example, this is needed in 4413 * stop_machine(): we create temporary high priority worker threads, 4414 * but our caller might not have that capability. 4415 * 4416 * Return: 0 on success. An error code otherwise. 4417 */ 4418 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4419 const struct sched_param *param) 4420 { 4421 return _sched_setscheduler(p, policy, param, false); 4422 } 4423 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); 4424 4425 static int 4426 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4427 { 4428 struct sched_param lparam; 4429 struct task_struct *p; 4430 int retval; 4431 4432 if (!param || pid < 0) 4433 return -EINVAL; 4434 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4435 return -EFAULT; 4436 4437 rcu_read_lock(); 4438 retval = -ESRCH; 4439 p = find_process_by_pid(pid); 4440 if (p != NULL) 4441 retval = sched_setscheduler(p, policy, &lparam); 4442 rcu_read_unlock(); 4443 4444 return retval; 4445 } 4446 4447 /* 4448 * Mimics kernel/events/core.c perf_copy_attr(). 
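 *
 * The copy is size-versioned: user space stores its sizeof(struct
 * sched_attr) in uattr->size and the kernel reconciles the two sizes.
 * A rough summary of the handling below (illustrative only):
 *
 *	size == 0               -> treated as SCHED_ATTR_SIZE_VER0
 *	size <  SIZE_VER0 or
 *	size >  PAGE_SIZE       -> -E2BIG
 *	size >  sizeof(*attr)   -> trailing bytes must be zero, else -E2BIG
 *	otherwise               -> copy_from_user() of @size bytes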
4449 */ 4450 static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) 4451 { 4452 u32 size; 4453 int ret; 4454 4455 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) 4456 return -EFAULT; 4457 4458 /* Zero the full structure, so that a short copy will be nice: */ 4459 memset(attr, 0, sizeof(*attr)); 4460 4461 ret = get_user(size, &uattr->size); 4462 if (ret) 4463 return ret; 4464 4465 /* Bail out on silly large: */ 4466 if (size > PAGE_SIZE) 4467 goto err_size; 4468 4469 /* ABI compatibility quirk: */ 4470 if (!size) 4471 size = SCHED_ATTR_SIZE_VER0; 4472 4473 if (size < SCHED_ATTR_SIZE_VER0) 4474 goto err_size; 4475 4476 /* 4477 * If we're handed a bigger struct than we know of, 4478 * ensure all the unknown bits are 0 - i.e. new 4479 * user-space does not rely on any kernel feature 4480 * extensions we dont know about yet. 4481 */ 4482 if (size > sizeof(*attr)) { 4483 unsigned char __user *addr; 4484 unsigned char __user *end; 4485 unsigned char val; 4486 4487 addr = (void __user *)uattr + sizeof(*attr); 4488 end = (void __user *)uattr + size; 4489 4490 for (; addr < end; addr++) { 4491 ret = get_user(val, addr); 4492 if (ret) 4493 return ret; 4494 if (val) 4495 goto err_size; 4496 } 4497 size = sizeof(*attr); 4498 } 4499 4500 ret = copy_from_user(attr, uattr, size); 4501 if (ret) 4502 return -EFAULT; 4503 4504 /* 4505 * XXX: Do we want to be lenient like existing syscalls; or do we want 4506 * to be strict and return an error on out-of-bounds values? 4507 */ 4508 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 4509 4510 return 0; 4511 4512 err_size: 4513 put_user(sizeof(*attr), &uattr->size); 4514 return -E2BIG; 4515 } 4516 4517 /** 4518 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4519 * @pid: the pid in question. 4520 * @policy: new policy. 4521 * @param: structure containing the new RT priority. 4522 * 4523 * Return: 0 on success. An error code otherwise. 4524 */ 4525 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) 4526 { 4527 if (policy < 0) 4528 return -EINVAL; 4529 4530 return do_sched_setscheduler(pid, policy, param); 4531 } 4532 4533 /** 4534 * sys_sched_setparam - set/change the RT priority of a thread 4535 * @pid: the pid in question. 4536 * @param: structure containing the new RT priority. 4537 * 4538 * Return: 0 on success. An error code otherwise. 4539 */ 4540 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 4541 { 4542 return do_sched_setscheduler(pid, SETPARAM_POLICY, param); 4543 } 4544 4545 /** 4546 * sys_sched_setattr - same as above, but with extended sched_attr 4547 * @pid: the pid in question. 4548 * @uattr: structure containing the extended parameters. 4549 * @flags: for future extension. 
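 *
 * Return: 0 on success. An error code otherwise.
 *
 * There is usually no libc wrapper for this syscall, so a userspace
 * caller would typically go through syscall(2); an illustrative
 * sketch (pid 0 means the calling thread, see find_process_by_pid()):
 *
 *	struct sched_attr attr = {
 *		.size           = sizeof(attr),
 *		.sched_policy   = SCHED_FIFO,
 *		.sched_priority = 10,
 *	};
 *	syscall(__NR_sched_setattr, 0, &attr, 0);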
4550 */ 4551 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, 4552 unsigned int, flags) 4553 { 4554 struct sched_attr attr; 4555 struct task_struct *p; 4556 int retval; 4557 4558 if (!uattr || pid < 0 || flags) 4559 return -EINVAL; 4560 4561 retval = sched_copy_attr(uattr, &attr); 4562 if (retval) 4563 return retval; 4564 4565 if ((int)attr.sched_policy < 0) 4566 return -EINVAL; 4567 4568 rcu_read_lock(); 4569 retval = -ESRCH; 4570 p = find_process_by_pid(pid); 4571 if (p != NULL) 4572 retval = sched_setattr(p, &attr); 4573 rcu_read_unlock(); 4574 4575 return retval; 4576 } 4577 4578 /** 4579 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4580 * @pid: the pid in question. 4581 * 4582 * Return: On success, the policy of the thread. Otherwise, a negative error 4583 * code. 4584 */ 4585 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 4586 { 4587 struct task_struct *p; 4588 int retval; 4589 4590 if (pid < 0) 4591 return -EINVAL; 4592 4593 retval = -ESRCH; 4594 rcu_read_lock(); 4595 p = find_process_by_pid(pid); 4596 if (p) { 4597 retval = security_task_getscheduler(p); 4598 if (!retval) 4599 retval = p->policy 4600 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4601 } 4602 rcu_read_unlock(); 4603 return retval; 4604 } 4605 4606 /** 4607 * sys_sched_getparam - get the RT priority of a thread 4608 * @pid: the pid in question. 4609 * @param: structure containing the RT priority. 4610 * 4611 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error 4612 * code. 4613 */ 4614 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 4615 { 4616 struct sched_param lp = { .sched_priority = 0 }; 4617 struct task_struct *p; 4618 int retval; 4619 4620 if (!param || pid < 0) 4621 return -EINVAL; 4622 4623 rcu_read_lock(); 4624 p = find_process_by_pid(pid); 4625 retval = -ESRCH; 4626 if (!p) 4627 goto out_unlock; 4628 4629 retval = security_task_getscheduler(p); 4630 if (retval) 4631 goto out_unlock; 4632 4633 if (task_has_rt_policy(p)) 4634 lp.sched_priority = p->rt_priority; 4635 rcu_read_unlock(); 4636 4637 /* 4638 * This one might sleep, we cannot do it with a spinlock held ... 4639 */ 4640 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4641 4642 return retval; 4643 4644 out_unlock: 4645 rcu_read_unlock(); 4646 return retval; 4647 } 4648 4649 static int sched_read_attr(struct sched_attr __user *uattr, 4650 struct sched_attr *attr, 4651 unsigned int usize) 4652 { 4653 int ret; 4654 4655 if (!access_ok(VERIFY_WRITE, uattr, usize)) 4656 return -EFAULT; 4657 4658 /* 4659 * If we're handed a smaller struct than we know of, 4660 * ensure all the unknown bits are 0 - i.e. old 4661 * user-space does not get uncomplete information. 4662 */ 4663 if (usize < sizeof(*attr)) { 4664 unsigned char *addr; 4665 unsigned char *end; 4666 4667 addr = (void *)attr + usize; 4668 end = (void *)attr + sizeof(*attr); 4669 4670 for (; addr < end; addr++) { 4671 if (*addr) 4672 return -EFBIG; 4673 } 4674 4675 attr->size = usize; 4676 } 4677 4678 ret = copy_to_user(uattr, attr, attr->size); 4679 if (ret) 4680 return -EFAULT; 4681 4682 return 0; 4683 } 4684 4685 /** 4686 * sys_sched_getattr - similar to sched_getparam, but with sched_attr 4687 * @pid: the pid in question. 4688 * @uattr: structure containing the extended parameters. 4689 * @size: sizeof(attr) for fwd/bwd comp. 4690 * @flags: for future extension. 
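 *
 * Return: 0 on success. An error code otherwise.
 *
 * As with sys_sched_setattr(), userspace usually goes through
 * syscall(2) directly; an illustrative sketch:
 *
 *	struct sched_attr attr;
 *	syscall(__NR_sched_getattr, 0, &attr, sizeof(attr), 0);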
4691 */ 4692 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 4693 unsigned int, size, unsigned int, flags) 4694 { 4695 struct sched_attr attr = { 4696 .size = sizeof(struct sched_attr), 4697 }; 4698 struct task_struct *p; 4699 int retval; 4700 4701 if (!uattr || pid < 0 || size > PAGE_SIZE || 4702 size < SCHED_ATTR_SIZE_VER0 || flags) 4703 return -EINVAL; 4704 4705 rcu_read_lock(); 4706 p = find_process_by_pid(pid); 4707 retval = -ESRCH; 4708 if (!p) 4709 goto out_unlock; 4710 4711 retval = security_task_getscheduler(p); 4712 if (retval) 4713 goto out_unlock; 4714 4715 attr.sched_policy = p->policy; 4716 if (p->sched_reset_on_fork) 4717 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 4718 if (task_has_dl_policy(p)) 4719 __getparam_dl(p, &attr); 4720 else if (task_has_rt_policy(p)) 4721 attr.sched_priority = p->rt_priority; 4722 else 4723 attr.sched_nice = task_nice(p); 4724 4725 rcu_read_unlock(); 4726 4727 retval = sched_read_attr(uattr, &attr, size); 4728 return retval; 4729 4730 out_unlock: 4731 rcu_read_unlock(); 4732 return retval; 4733 } 4734 4735 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 4736 { 4737 cpumask_var_t cpus_allowed, new_mask; 4738 struct task_struct *p; 4739 int retval; 4740 4741 rcu_read_lock(); 4742 4743 p = find_process_by_pid(pid); 4744 if (!p) { 4745 rcu_read_unlock(); 4746 return -ESRCH; 4747 } 4748 4749 /* Prevent p going away */ 4750 get_task_struct(p); 4751 rcu_read_unlock(); 4752 4753 if (p->flags & PF_NO_SETAFFINITY) { 4754 retval = -EINVAL; 4755 goto out_put_task; 4756 } 4757 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4758 retval = -ENOMEM; 4759 goto out_put_task; 4760 } 4761 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 4762 retval = -ENOMEM; 4763 goto out_free_cpus_allowed; 4764 } 4765 retval = -EPERM; 4766 if (!check_same_owner(p)) { 4767 rcu_read_lock(); 4768 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 4769 rcu_read_unlock(); 4770 goto out_free_new_mask; 4771 } 4772 rcu_read_unlock(); 4773 } 4774 4775 retval = security_task_setscheduler(p); 4776 if (retval) 4777 goto out_free_new_mask; 4778 4779 4780 cpuset_cpus_allowed(p, cpus_allowed); 4781 cpumask_and(new_mask, in_mask, cpus_allowed); 4782 4783 /* 4784 * Since bandwidth control happens on root_domain basis, 4785 * if admission test is enabled, we only admit -deadline 4786 * tasks allowed to run on all the CPUs in the task's 4787 * root_domain. 4788 */ 4789 #ifdef CONFIG_SMP 4790 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { 4791 rcu_read_lock(); 4792 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { 4793 retval = -EBUSY; 4794 rcu_read_unlock(); 4795 goto out_free_new_mask; 4796 } 4797 rcu_read_unlock(); 4798 } 4799 #endif 4800 again: 4801 retval = __set_cpus_allowed_ptr(p, new_mask, true); 4802 4803 if (!retval) { 4804 cpuset_cpus_allowed(p, cpus_allowed); 4805 if (!cpumask_subset(new_mask, cpus_allowed)) { 4806 /* 4807 * We must have raced with a concurrent cpuset 4808 * update. 
Just reset the cpus_allowed to the 4809 * cpuset's cpus_allowed 4810 */ 4811 cpumask_copy(new_mask, cpus_allowed); 4812 goto again; 4813 } 4814 } 4815 out_free_new_mask: 4816 free_cpumask_var(new_mask); 4817 out_free_cpus_allowed: 4818 free_cpumask_var(cpus_allowed); 4819 out_put_task: 4820 put_task_struct(p); 4821 return retval; 4822 } 4823 4824 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4825 struct cpumask *new_mask) 4826 { 4827 if (len < cpumask_size()) 4828 cpumask_clear(new_mask); 4829 else if (len > cpumask_size()) 4830 len = cpumask_size(); 4831 4832 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4833 } 4834 4835 /** 4836 * sys_sched_setaffinity - set the CPU affinity of a process 4837 * @pid: pid of the process 4838 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4839 * @user_mask_ptr: user-space pointer to the new CPU mask 4840 * 4841 * Return: 0 on success. An error code otherwise. 4842 */ 4843 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 4844 unsigned long __user *, user_mask_ptr) 4845 { 4846 cpumask_var_t new_mask; 4847 int retval; 4848 4849 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4850 return -ENOMEM; 4851 4852 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 4853 if (retval == 0) 4854 retval = sched_setaffinity(pid, new_mask); 4855 free_cpumask_var(new_mask); 4856 return retval; 4857 } 4858 4859 long sched_getaffinity(pid_t pid, struct cpumask *mask) 4860 { 4861 struct task_struct *p; 4862 unsigned long flags; 4863 int retval; 4864 4865 rcu_read_lock(); 4866 4867 retval = -ESRCH; 4868 p = find_process_by_pid(pid); 4869 if (!p) 4870 goto out_unlock; 4871 4872 retval = security_task_getscheduler(p); 4873 if (retval) 4874 goto out_unlock; 4875 4876 raw_spin_lock_irqsave(&p->pi_lock, flags); 4877 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); 4878 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4879 4880 out_unlock: 4881 rcu_read_unlock(); 4882 4883 return retval; 4884 } 4885 4886 /** 4887 * sys_sched_getaffinity - get the CPU affinity of a process 4888 * @pid: pid of the process 4889 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4890 * @user_mask_ptr: user-space pointer to hold the current CPU mask 4891 * 4892 * Return: size of CPU mask copied to user_mask_ptr on success. An 4893 * error code otherwise. 4894 */ 4895 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4896 unsigned long __user *, user_mask_ptr) 4897 { 4898 int ret; 4899 cpumask_var_t mask; 4900 4901 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 4902 return -EINVAL; 4903 if (len & (sizeof(unsigned long)-1)) 4904 return -EINVAL; 4905 4906 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4907 return -ENOMEM; 4908 4909 ret = sched_getaffinity(pid, mask); 4910 if (ret == 0) { 4911 unsigned int retlen = min(len, cpumask_size()); 4912 4913 if (copy_to_user(user_mask_ptr, mask, retlen)) 4914 ret = -EFAULT; 4915 else 4916 ret = retlen; 4917 } 4918 free_cpumask_var(mask); 4919 4920 return ret; 4921 } 4922 4923 /** 4924 * sys_sched_yield - yield the current processor to other threads. 4925 * 4926 * This function yields the current CPU to other tasks. If there are no 4927 * other threads running on this CPU then this function will return. 4928 * 4929 * Return: 0. 
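 *
 * The corresponding userspace call is simply sched_yield(); note the
 * comment above the kernel-internal yield() further down, which
 * explains why wait loops built on yielding are almost always broken.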
4930 */ 4931 static void do_sched_yield(void) 4932 { 4933 struct rq_flags rf; 4934 struct rq *rq; 4935 4936 local_irq_disable(); 4937 rq = this_rq(); 4938 rq_lock(rq, &rf); 4939 4940 schedstat_inc(rq->yld_count); 4941 current->sched_class->yield_task(rq); 4942 4943 /* 4944 * Since we are going to call schedule() anyway, there's 4945 * no need to preempt or enable interrupts: 4946 */ 4947 preempt_disable(); 4948 rq_unlock(rq, &rf); 4949 sched_preempt_enable_no_resched(); 4950 4951 schedule(); 4952 } 4953 4954 SYSCALL_DEFINE0(sched_yield) 4955 { 4956 do_sched_yield(); 4957 return 0; 4958 } 4959 4960 #ifndef CONFIG_PREEMPT 4961 int __sched _cond_resched(void) 4962 { 4963 if (should_resched(0)) { 4964 preempt_schedule_common(); 4965 return 1; 4966 } 4967 rcu_all_qs(); 4968 return 0; 4969 } 4970 EXPORT_SYMBOL(_cond_resched); 4971 #endif 4972 4973 /* 4974 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4975 * call schedule, and on return reacquire the lock. 4976 * 4977 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4978 * operations here to prevent schedule() from being called twice (once via 4979 * spin_unlock(), once by hand). 4980 */ 4981 int __cond_resched_lock(spinlock_t *lock) 4982 { 4983 int resched = should_resched(PREEMPT_LOCK_OFFSET); 4984 int ret = 0; 4985 4986 lockdep_assert_held(lock); 4987 4988 if (spin_needbreak(lock) || resched) { 4989 spin_unlock(lock); 4990 if (resched) 4991 preempt_schedule_common(); 4992 else 4993 cpu_relax(); 4994 ret = 1; 4995 spin_lock(lock); 4996 } 4997 return ret; 4998 } 4999 EXPORT_SYMBOL(__cond_resched_lock); 5000 5001 /** 5002 * yield - yield the current processor to other threads. 5003 * 5004 * Do not ever use this function, there's a 99% chance you're doing it wrong. 5005 * 5006 * The scheduler is at all times free to pick the calling task as the most 5007 * eligible task to run, if removing the yield() call from your code breaks 5008 * it, its already broken. 5009 * 5010 * Typical broken usage is: 5011 * 5012 * while (!event) 5013 * yield(); 5014 * 5015 * where one assumes that yield() will let 'the other' process run that will 5016 * make event true. If the current task is a SCHED_FIFO task that will never 5017 * happen. Never use yield() as a progress guarantee!! 5018 * 5019 * If you want to use yield() to wait for something, use wait_event(). 5020 * If you want to use yield() to be 'nice' for others, use cond_resched(). 5021 * If you still want to use yield(), do not! 5022 */ 5023 void __sched yield(void) 5024 { 5025 set_current_state(TASK_RUNNING); 5026 do_sched_yield(); 5027 } 5028 EXPORT_SYMBOL(yield); 5029 5030 /** 5031 * yield_to - yield the current processor to another thread in 5032 * your thread group, or accelerate that thread toward the 5033 * processor it's on. 5034 * @p: target task 5035 * @preempt: whether task preemption is allowed or not 5036 * 5037 * It's the caller's job to ensure that the target task struct 5038 * can't go away on us before we can do any checks. 5039 * 5040 * Return: 5041 * true (>0) if we indeed boosted the target task. 5042 * false (0) if we failed to boost the target. 5043 * -ESRCH if there's no task to yield to. 
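 *
 * A caller-side sketch (illustrative only; the reference keeping @p
 * alive could, for example, come from get_task_struct()):
 *
 *	get_task_struct(p);
 *	yielded = yield_to(p, false);
 *	put_task_struct(p);
 *	// yielded > 0: @p got our slot and we have since been rescheduled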
5044 */ 5045 int __sched yield_to(struct task_struct *p, bool preempt) 5046 { 5047 struct task_struct *curr = current; 5048 struct rq *rq, *p_rq; 5049 unsigned long flags; 5050 int yielded = 0; 5051 5052 local_irq_save(flags); 5053 rq = this_rq(); 5054 5055 again: 5056 p_rq = task_rq(p); 5057 /* 5058 * If we're the only runnable task on the rq and target rq also 5059 * has only one task, there's absolutely no point in yielding. 5060 */ 5061 if (rq->nr_running == 1 && p_rq->nr_running == 1) { 5062 yielded = -ESRCH; 5063 goto out_irq; 5064 } 5065 5066 double_rq_lock(rq, p_rq); 5067 if (task_rq(p) != p_rq) { 5068 double_rq_unlock(rq, p_rq); 5069 goto again; 5070 } 5071 5072 if (!curr->sched_class->yield_to_task) 5073 goto out_unlock; 5074 5075 if (curr->sched_class != p->sched_class) 5076 goto out_unlock; 5077 5078 if (task_running(p_rq, p) || p->state) 5079 goto out_unlock; 5080 5081 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 5082 if (yielded) { 5083 schedstat_inc(rq->yld_count); 5084 /* 5085 * Make p's CPU reschedule; pick_next_entity takes care of 5086 * fairness. 5087 */ 5088 if (preempt && rq != p_rq) 5089 resched_curr(p_rq); 5090 } 5091 5092 out_unlock: 5093 double_rq_unlock(rq, p_rq); 5094 out_irq: 5095 local_irq_restore(flags); 5096 5097 if (yielded > 0) 5098 schedule(); 5099 5100 return yielded; 5101 } 5102 EXPORT_SYMBOL_GPL(yield_to); 5103 5104 int io_schedule_prepare(void) 5105 { 5106 int old_iowait = current->in_iowait; 5107 5108 current->in_iowait = 1; 5109 blk_schedule_flush_plug(current); 5110 5111 return old_iowait; 5112 } 5113 5114 void io_schedule_finish(int token) 5115 { 5116 current->in_iowait = token; 5117 } 5118 5119 /* 5120 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5121 * that process accounting knows that this is a task in IO wait state. 5122 */ 5123 long __sched io_schedule_timeout(long timeout) 5124 { 5125 int token; 5126 long ret; 5127 5128 token = io_schedule_prepare(); 5129 ret = schedule_timeout(timeout); 5130 io_schedule_finish(token); 5131 5132 return ret; 5133 } 5134 EXPORT_SYMBOL(io_schedule_timeout); 5135 5136 void io_schedule(void) 5137 { 5138 int token; 5139 5140 token = io_schedule_prepare(); 5141 schedule(); 5142 io_schedule_finish(token); 5143 } 5144 EXPORT_SYMBOL(io_schedule); 5145 5146 /** 5147 * sys_sched_get_priority_max - return maximum RT priority. 5148 * @policy: scheduling class. 5149 * 5150 * Return: On success, this syscall returns the maximum 5151 * rt_priority that can be used by a given scheduling class. 5152 * On failure, a negative error code is returned. 5153 */ 5154 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 5155 { 5156 int ret = -EINVAL; 5157 5158 switch (policy) { 5159 case SCHED_FIFO: 5160 case SCHED_RR: 5161 ret = MAX_USER_RT_PRIO-1; 5162 break; 5163 case SCHED_DEADLINE: 5164 case SCHED_NORMAL: 5165 case SCHED_BATCH: 5166 case SCHED_IDLE: 5167 ret = 0; 5168 break; 5169 } 5170 return ret; 5171 } 5172 5173 /** 5174 * sys_sched_get_priority_min - return minimum RT priority. 5175 * @policy: scheduling class. 5176 * 5177 * Return: On success, this syscall returns the minimum 5178 * rt_priority that can be used by a given scheduling class. 5179 * On failure, a negative error code is returned. 
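 *
 * Together with sched_get_priority_max() this lets userspace discover
 * the valid static priority range for a policy, e.g. (illustrative):
 *
 *	int lo = sched_get_priority_min(SCHED_FIFO);	// 1
 *	int hi = sched_get_priority_max(SCHED_FIFO);	// MAX_USER_RT_PRIO-1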
5180 */ 5181 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 5182 { 5183 int ret = -EINVAL; 5184 5185 switch (policy) { 5186 case SCHED_FIFO: 5187 case SCHED_RR: 5188 ret = 1; 5189 break; 5190 case SCHED_DEADLINE: 5191 case SCHED_NORMAL: 5192 case SCHED_BATCH: 5193 case SCHED_IDLE: 5194 ret = 0; 5195 } 5196 return ret; 5197 } 5198 5199 static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) 5200 { 5201 struct task_struct *p; 5202 unsigned int time_slice; 5203 struct rq_flags rf; 5204 struct rq *rq; 5205 int retval; 5206 5207 if (pid < 0) 5208 return -EINVAL; 5209 5210 retval = -ESRCH; 5211 rcu_read_lock(); 5212 p = find_process_by_pid(pid); 5213 if (!p) 5214 goto out_unlock; 5215 5216 retval = security_task_getscheduler(p); 5217 if (retval) 5218 goto out_unlock; 5219 5220 rq = task_rq_lock(p, &rf); 5221 time_slice = 0; 5222 if (p->sched_class->get_rr_interval) 5223 time_slice = p->sched_class->get_rr_interval(rq, p); 5224 task_rq_unlock(rq, p, &rf); 5225 5226 rcu_read_unlock(); 5227 jiffies_to_timespec64(time_slice, t); 5228 return 0; 5229 5230 out_unlock: 5231 rcu_read_unlock(); 5232 return retval; 5233 } 5234 5235 /** 5236 * sys_sched_rr_get_interval - return the default timeslice of a process. 5237 * @pid: pid of the process. 5238 * @interval: userspace pointer to the timeslice value. 5239 * 5240 * this syscall writes the default timeslice value of a given process 5241 * into the user-space timespec buffer. A value of '0' means infinity. 5242 * 5243 * Return: On success, 0 and the timeslice is in @interval. Otherwise, 5244 * an error code. 5245 */ 5246 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 5247 struct __kernel_timespec __user *, interval) 5248 { 5249 struct timespec64 t; 5250 int retval = sched_rr_get_interval(pid, &t); 5251 5252 if (retval == 0) 5253 retval = put_timespec64(&t, interval); 5254 5255 return retval; 5256 } 5257 5258 #ifdef CONFIG_COMPAT_32BIT_TIME 5259 COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, 5260 compat_pid_t, pid, 5261 struct old_timespec32 __user *, interval) 5262 { 5263 struct timespec64 t; 5264 int retval = sched_rr_get_interval(pid, &t); 5265 5266 if (retval == 0) 5267 retval = put_old_timespec32(&t, interval); 5268 return retval; 5269 } 5270 #endif 5271 5272 void sched_show_task(struct task_struct *p) 5273 { 5274 unsigned long free = 0; 5275 int ppid; 5276 5277 if (!try_get_task_stack(p)) 5278 return; 5279 5280 printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); 5281 5282 if (p->state == TASK_RUNNING) 5283 printk(KERN_CONT " running task "); 5284 #ifdef CONFIG_DEBUG_STACK_USAGE 5285 free = stack_not_used(p); 5286 #endif 5287 ppid = 0; 5288 rcu_read_lock(); 5289 if (pid_alive(p)) 5290 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 5291 rcu_read_unlock(); 5292 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 5293 task_pid_nr(p), ppid, 5294 (unsigned long)task_thread_info(p)->flags); 5295 5296 print_worker_info(KERN_INFO, p); 5297 show_stack(p, NULL); 5298 put_task_stack(p); 5299 } 5300 EXPORT_SYMBOL_GPL(sched_show_task); 5301 5302 static inline bool 5303 state_filter_match(unsigned long state_filter, struct task_struct *p) 5304 { 5305 /* no filter, everything matches */ 5306 if (!state_filter) 5307 return true; 5308 5309 /* filter, but doesn't match */ 5310 if (!(p->state & state_filter)) 5311 return false; 5312 5313 /* 5314 * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows 5315 * TASK_KILLABLE). 
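 *
 * (TASK_IDLE is TASK_UNINTERRUPTIBLE | TASK_NOLOAD, so it passes the
 * bitwise filter test above; the explicit check below weeds it out,
 * while TASK_KILLABLE still matches through its TASK_UNINTERRUPTIBLE
 * bit.)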
5316 */ 5317 if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) 5318 return false; 5319 5320 return true; 5321 } 5322 5323 5324 void show_state_filter(unsigned long state_filter) 5325 { 5326 struct task_struct *g, *p; 5327 5328 #if BITS_PER_LONG == 32 5329 printk(KERN_INFO 5330 " task PC stack pid father\n"); 5331 #else 5332 printk(KERN_INFO 5333 " task PC stack pid father\n"); 5334 #endif 5335 rcu_read_lock(); 5336 for_each_process_thread(g, p) { 5337 /* 5338 * reset the NMI-timeout, listing all files on a slow 5339 * console might take a lot of time: 5340 * Also, reset softlockup watchdogs on all CPUs, because 5341 * another CPU might be blocked waiting for us to process 5342 * an IPI. 5343 */ 5344 touch_nmi_watchdog(); 5345 touch_all_softlockup_watchdogs(); 5346 if (state_filter_match(state_filter, p)) 5347 sched_show_task(p); 5348 } 5349 5350 #ifdef CONFIG_SCHED_DEBUG 5351 if (!state_filter) 5352 sysrq_sched_debug_show(); 5353 #endif 5354 rcu_read_unlock(); 5355 /* 5356 * Only show locks if all tasks are dumped: 5357 */ 5358 if (!state_filter) 5359 debug_show_all_locks(); 5360 } 5361 5362 /** 5363 * init_idle - set up an idle thread for a given CPU 5364 * @idle: task in question 5365 * @cpu: CPU the idle task belongs to 5366 * 5367 * NOTE: this function does not set the idle thread's NEED_RESCHED 5368 * flag, to make booting more robust. 5369 */ 5370 void init_idle(struct task_struct *idle, int cpu) 5371 { 5372 struct rq *rq = cpu_rq(cpu); 5373 unsigned long flags; 5374 5375 raw_spin_lock_irqsave(&idle->pi_lock, flags); 5376 raw_spin_lock(&rq->lock); 5377 5378 __sched_fork(0, idle); 5379 idle->state = TASK_RUNNING; 5380 idle->se.exec_start = sched_clock(); 5381 idle->flags |= PF_IDLE; 5382 5383 kasan_unpoison_task_stack(idle); 5384 5385 #ifdef CONFIG_SMP 5386 /* 5387 * Its possible that init_idle() gets called multiple times on a task, 5388 * in that case do_set_cpus_allowed() will not do the right thing. 5389 * 5390 * And since this is boot we can forgo the serialization. 5391 */ 5392 set_cpus_allowed_common(idle, cpumask_of(cpu)); 5393 #endif 5394 /* 5395 * We're having a chicken and egg problem, even though we are 5396 * holding rq->lock, the CPU isn't yet set to this CPU so the 5397 * lockdep check in task_group() will fail. 5398 * 5399 * Similar case to sched_fork(). / Alternatively we could 5400 * use task_rq_lock() here and obtain the other rq->lock. 5401 * 5402 * Silence PROVE_RCU 5403 */ 5404 rcu_read_lock(); 5405 __set_task_cpu(idle, cpu); 5406 rcu_read_unlock(); 5407 5408 rq->curr = rq->idle = idle; 5409 idle->on_rq = TASK_ON_RQ_QUEUED; 5410 #ifdef CONFIG_SMP 5411 idle->on_cpu = 1; 5412 #endif 5413 raw_spin_unlock(&rq->lock); 5414 raw_spin_unlock_irqrestore(&idle->pi_lock, flags); 5415 5416 /* Set the preempt count _outside_ the spinlocks! 
*/ 5417 init_idle_preempt_count(idle, cpu); 5418 5419 /* 5420 * The idle tasks have their own, simple scheduling class: 5421 */ 5422 idle->sched_class = &idle_sched_class; 5423 ftrace_graph_init_idle_task(idle, cpu); 5424 vtime_init_idle(idle, cpu); 5425 #ifdef CONFIG_SMP 5426 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 5427 #endif 5428 } 5429 5430 #ifdef CONFIG_SMP 5431 5432 int cpuset_cpumask_can_shrink(const struct cpumask *cur, 5433 const struct cpumask *trial) 5434 { 5435 int ret = 1; 5436 5437 if (!cpumask_weight(cur)) 5438 return ret; 5439 5440 ret = dl_cpuset_cpumask_can_shrink(cur, trial); 5441 5442 return ret; 5443 } 5444 5445 int task_can_attach(struct task_struct *p, 5446 const struct cpumask *cs_cpus_allowed) 5447 { 5448 int ret = 0; 5449 5450 /* 5451 * Kthreads which disallow setaffinity shouldn't be moved 5452 * to a new cpuset; we don't want to change their CPU 5453 * affinity and isolating such threads by their set of 5454 * allowed nodes is unnecessary. Thus, cpusets are not 5455 * applicable for such threads. This prevents checking for 5456 * success of set_cpus_allowed_ptr() on all attached tasks 5457 * before cpus_allowed may be changed. 5458 */ 5459 if (p->flags & PF_NO_SETAFFINITY) { 5460 ret = -EINVAL; 5461 goto out; 5462 } 5463 5464 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, 5465 cs_cpus_allowed)) 5466 ret = dl_task_can_attach(p, cs_cpus_allowed); 5467 5468 out: 5469 return ret; 5470 } 5471 5472 bool sched_smp_initialized __read_mostly; 5473 5474 #ifdef CONFIG_NUMA_BALANCING 5475 /* Migrate current task p to target_cpu */ 5476 int migrate_task_to(struct task_struct *p, int target_cpu) 5477 { 5478 struct migration_arg arg = { p, target_cpu }; 5479 int curr_cpu = task_cpu(p); 5480 5481 if (curr_cpu == target_cpu) 5482 return 0; 5483 5484 if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) 5485 return -EINVAL; 5486 5487 /* TODO: This is not properly updating schedstats */ 5488 5489 trace_sched_move_numa(p, curr_cpu, target_cpu); 5490 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 5491 } 5492 5493 /* 5494 * Requeue a task on a given node and accurately track the number of NUMA 5495 * tasks on the runqueues 5496 */ 5497 void sched_setnuma(struct task_struct *p, int nid) 5498 { 5499 bool queued, running; 5500 struct rq_flags rf; 5501 struct rq *rq; 5502 5503 rq = task_rq_lock(p, &rf); 5504 queued = task_on_rq_queued(p); 5505 running = task_current(rq, p); 5506 5507 if (queued) 5508 dequeue_task(rq, p, DEQUEUE_SAVE); 5509 if (running) 5510 put_prev_task(rq, p); 5511 5512 p->numa_preferred_nid = nid; 5513 5514 if (queued) 5515 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); 5516 if (running) 5517 set_curr_task(rq, p); 5518 task_rq_unlock(rq, p, &rf); 5519 } 5520 #endif /* CONFIG_NUMA_BALANCING */ 5521 5522 #ifdef CONFIG_HOTPLUG_CPU 5523 /* 5524 * Ensure that the idle task is using init_mm right before its CPU goes 5525 * offline. 5526 */ 5527 void idle_task_exit(void) 5528 { 5529 struct mm_struct *mm = current->active_mm; 5530 5531 BUG_ON(cpu_online(smp_processor_id())); 5532 5533 if (mm != &init_mm) { 5534 switch_mm(mm, &init_mm, current); 5535 current->active_mm = &init_mm; 5536 finish_arch_post_lock_switch(); 5537 } 5538 mmdrop(mm); 5539 } 5540 5541 /* 5542 * Since this CPU is going 'away' for a while, fold any nr_active delta 5543 * we might have. Assumes we're called after migrate_tasks() so that the 5544 * nr_active count is stable. 
We need to take the teardown thread which 5545 * is calling this into account, so we hand in adjust = 1 to the load 5546 * calculation. 5547 * 5548 * Also see the comment "Global load-average calculations". 5549 */ 5550 static void calc_load_migrate(struct rq *rq) 5551 { 5552 long delta = calc_load_fold_active(rq, 1); 5553 if (delta) 5554 atomic_long_add(delta, &calc_load_tasks); 5555 } 5556 5557 static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) 5558 { 5559 } 5560 5561 static const struct sched_class fake_sched_class = { 5562 .put_prev_task = put_prev_task_fake, 5563 }; 5564 5565 static struct task_struct fake_task = { 5566 /* 5567 * Avoid pull_{rt,dl}_task() 5568 */ 5569 .prio = MAX_PRIO + 1, 5570 .sched_class = &fake_sched_class, 5571 }; 5572 5573 /* 5574 * Migrate all tasks from the rq; sleeping tasks will be migrated by 5575 * try_to_wake_up()->select_task_rq(). 5576 * 5577 * Called with rq->lock held even though we're in stop_machine() and 5578 * there's no concurrency possible; we hold the required locks anyway 5579 * because of lock validation efforts. 5580 */ 5581 static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) 5582 { 5583 struct rq *rq = dead_rq; 5584 struct task_struct *next, *stop = rq->stop; 5585 struct rq_flags orf = *rf; 5586 int dest_cpu; 5587 5588 /* 5589 * Fudge the rq selection such that the below task selection loop 5590 * doesn't get stuck on the currently eligible stop task. 5591 * 5592 * We're currently inside stop_machine() and the rq is either stuck 5593 * in the stop_machine_cpu_stop() loop, or we're executing this code, 5594 * either way we should never end up calling schedule() until we're 5595 * done here. 5596 */ 5597 rq->stop = NULL; 5598 5599 /* 5600 * put_prev_task() and pick_next_task() sched 5601 * class methods both need to have an up-to-date 5602 * value of rq->clock[_task] 5603 */ 5604 update_rq_clock(rq); 5605 5606 for (;;) { 5607 /* 5608 * There's this thread running, bail when that's the only 5609 * remaining thread: 5610 */ 5611 if (rq->nr_running == 1) 5612 break; 5613 5614 /* 5615 * pick_next_task() assumes pinned rq->lock: 5616 */ 5617 next = pick_next_task(rq, &fake_task, rf); 5618 BUG_ON(!next); 5619 put_prev_task(rq, next); 5620 5621 /* 5622 * Rules for changing task_struct::cpus_allowed are holding 5623 * both pi_lock and rq->lock, such that holding either 5624 * stabilizes the mask. 5625 * 5626 * Dropping rq->lock is not quite as disastrous as it usually is 5627 * because !cpu_active at this point, which means load-balance 5628 * will not interfere. Also, stop-machine. 5629 */ 5630 rq_unlock(rq, rf); 5631 raw_spin_lock(&next->pi_lock); 5632 rq_relock(rq, rf); 5633 5634 /* 5635 * Since we're inside stop-machine, _nothing_ should have 5636 * changed the task, WARN if weird stuff happened, because in 5637 * that case the above rq->lock drop is a fail too. 5638 */ 5639 if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { 5640 raw_spin_unlock(&next->pi_lock); 5641 continue; 5642 } 5643 5644 /* Find suitable destination for @next, with force if needed.
*/ 5645 dest_cpu = select_fallback_rq(dead_rq->cpu, next); 5646 rq = __migrate_task(rq, rf, next, dest_cpu); 5647 if (rq != dead_rq) { 5648 rq_unlock(rq, rf); 5649 rq = dead_rq; 5650 *rf = orf; 5651 rq_relock(rq, rf); 5652 } 5653 raw_spin_unlock(&next->pi_lock); 5654 } 5655 5656 rq->stop = stop; 5657 } 5658 #endif /* CONFIG_HOTPLUG_CPU */ 5659 5660 void set_rq_online(struct rq *rq) 5661 { 5662 if (!rq->online) { 5663 const struct sched_class *class; 5664 5665 cpumask_set_cpu(rq->cpu, rq->rd->online); 5666 rq->online = 1; 5667 5668 for_each_class(class) { 5669 if (class->rq_online) 5670 class->rq_online(rq); 5671 } 5672 } 5673 } 5674 5675 void set_rq_offline(struct rq *rq) 5676 { 5677 if (rq->online) { 5678 const struct sched_class *class; 5679 5680 for_each_class(class) { 5681 if (class->rq_offline) 5682 class->rq_offline(rq); 5683 } 5684 5685 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5686 rq->online = 0; 5687 } 5688 } 5689 5690 /* 5691 * used to mark begin/end of suspend/resume: 5692 */ 5693 static int num_cpus_frozen; 5694 5695 /* 5696 * Update cpusets according to cpu_active mask. If cpusets are 5697 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 5698 * around partition_sched_domains(). 5699 * 5700 * If we come here as part of a suspend/resume, don't touch cpusets because we 5701 * want to restore it back to its original state upon resume anyway. 5702 */ 5703 static void cpuset_cpu_active(void) 5704 { 5705 if (cpuhp_tasks_frozen) { 5706 /* 5707 * num_cpus_frozen tracks how many CPUs are involved in suspend 5708 * resume sequence. As long as this is not the last online 5709 * operation in the resume sequence, just build a single sched 5710 * domain, ignoring cpusets. 5711 */ 5712 partition_sched_domains(1, NULL, NULL); 5713 if (--num_cpus_frozen) 5714 return; 5715 /* 5716 * This is the last CPU online operation. So fall through and 5717 * restore the original sched domains by considering the 5718 * cpuset configurations. 5719 */ 5720 cpuset_force_rebuild(); 5721 } 5722 cpuset_update_active_cpus(); 5723 } 5724 5725 static int cpuset_cpu_inactive(unsigned int cpu) 5726 { 5727 if (!cpuhp_tasks_frozen) { 5728 if (dl_cpu_busy(cpu)) 5729 return -EBUSY; 5730 cpuset_update_active_cpus(); 5731 } else { 5732 num_cpus_frozen++; 5733 partition_sched_domains(1, NULL, NULL); 5734 } 5735 return 0; 5736 } 5737 5738 int sched_cpu_activate(unsigned int cpu) 5739 { 5740 struct rq *rq = cpu_rq(cpu); 5741 struct rq_flags rf; 5742 5743 #ifdef CONFIG_SCHED_SMT 5744 /* 5745 * The sched_smt_present static key needs to be evaluated on every 5746 * hotplug event because at boot time SMT might be disabled when 5747 * the number of booted CPUs is limited. 5748 * 5749 * If then later a sibling gets hotplugged, then the key would stay 5750 * off and SMT scheduling would never be functional. 5751 */ 5752 if (cpumask_weight(cpu_smt_mask(cpu)) > 1) 5753 static_branch_enable_cpuslocked(&sched_smt_present); 5754 #endif 5755 set_cpu_active(cpu, true); 5756 5757 if (sched_smp_initialized) { 5758 sched_domains_numa_masks_set(cpu); 5759 cpuset_cpu_active(); 5760 } 5761 5762 /* 5763 * Put the rq online, if not already. This happens: 5764 * 5765 * 1) In the early boot process, because we build the real domains 5766 * after all CPUs have been brought up. 5767 * 5768 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the 5769 * domains. 
5770 */ 5771 rq_lock_irqsave(rq, &rf); 5772 if (rq->rd) { 5773 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5774 set_rq_online(rq); 5775 } 5776 rq_unlock_irqrestore(rq, &rf); 5777 5778 update_max_interval(); 5779 5780 return 0; 5781 } 5782 5783 int sched_cpu_deactivate(unsigned int cpu) 5784 { 5785 int ret; 5786 5787 set_cpu_active(cpu, false); 5788 /* 5789 * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU 5790 * users of this state to go away such that all new such users will 5791 * observe it. 5792 * 5793 * Do sync before park smpboot threads to take care the rcu boost case. 5794 */ 5795 synchronize_rcu_mult(call_rcu, call_rcu_sched); 5796 5797 if (!sched_smp_initialized) 5798 return 0; 5799 5800 ret = cpuset_cpu_inactive(cpu); 5801 if (ret) { 5802 set_cpu_active(cpu, true); 5803 return ret; 5804 } 5805 sched_domains_numa_masks_clear(cpu); 5806 return 0; 5807 } 5808 5809 static void sched_rq_cpu_starting(unsigned int cpu) 5810 { 5811 struct rq *rq = cpu_rq(cpu); 5812 5813 rq->calc_load_update = calc_load_update; 5814 update_max_interval(); 5815 } 5816 5817 int sched_cpu_starting(unsigned int cpu) 5818 { 5819 sched_rq_cpu_starting(cpu); 5820 sched_tick_start(cpu); 5821 return 0; 5822 } 5823 5824 #ifdef CONFIG_HOTPLUG_CPU 5825 int sched_cpu_dying(unsigned int cpu) 5826 { 5827 struct rq *rq = cpu_rq(cpu); 5828 struct rq_flags rf; 5829 5830 /* Handle pending wakeups and then migrate everything off */ 5831 sched_ttwu_pending(); 5832 sched_tick_stop(cpu); 5833 5834 rq_lock_irqsave(rq, &rf); 5835 if (rq->rd) { 5836 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5837 set_rq_offline(rq); 5838 } 5839 migrate_tasks(rq, &rf); 5840 BUG_ON(rq->nr_running != 1); 5841 rq_unlock_irqrestore(rq, &rf); 5842 5843 calc_load_migrate(rq); 5844 update_max_interval(); 5845 nohz_balance_exit_idle(rq); 5846 hrtick_clear(rq); 5847 return 0; 5848 } 5849 #endif 5850 5851 void __init sched_init_smp(void) 5852 { 5853 sched_init_numa(); 5854 5855 /* 5856 * There's no userspace yet to cause hotplug operations; hence all the 5857 * CPU masks are stable and all blatant races in the below code cannot 5858 * happen. 5859 */ 5860 mutex_lock(&sched_domains_mutex); 5861 sched_init_domains(cpu_active_mask); 5862 mutex_unlock(&sched_domains_mutex); 5863 5864 /* Move init over to a non-isolated CPU */ 5865 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) 5866 BUG(); 5867 sched_init_granularity(); 5868 5869 init_sched_rt_class(); 5870 init_sched_dl_class(); 5871 5872 sched_smp_initialized = true; 5873 } 5874 5875 static int __init migration_init(void) 5876 { 5877 sched_rq_cpu_starting(smp_processor_id()); 5878 return 0; 5879 } 5880 early_initcall(migration_init); 5881 5882 #else 5883 void __init sched_init_smp(void) 5884 { 5885 sched_init_granularity(); 5886 } 5887 #endif /* CONFIG_SMP */ 5888 5889 int in_sched_functions(unsigned long addr) 5890 { 5891 return in_lock_functions(addr) || 5892 (addr >= (unsigned long)__sched_text_start 5893 && addr < (unsigned long)__sched_text_end); 5894 } 5895 5896 #ifdef CONFIG_CGROUP_SCHED 5897 /* 5898 * Default task group. 5899 * Every task in system belongs to this group at bootup. 
5900 */ 5901 struct task_group root_task_group; 5902 LIST_HEAD(task_groups); 5903 5904 /* Cacheline aligned slab cache for task_group */ 5905 static struct kmem_cache *task_group_cache __read_mostly; 5906 #endif 5907 5908 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 5909 DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); 5910 5911 void __init sched_init(void) 5912 { 5913 int i, j; 5914 unsigned long alloc_size = 0, ptr; 5915 5916 wait_bit_init(); 5917 5918 #ifdef CONFIG_FAIR_GROUP_SCHED 5919 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 5920 #endif 5921 #ifdef CONFIG_RT_GROUP_SCHED 5922 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 5923 #endif 5924 if (alloc_size) { 5925 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 5926 5927 #ifdef CONFIG_FAIR_GROUP_SCHED 5928 root_task_group.se = (struct sched_entity **)ptr; 5929 ptr += nr_cpu_ids * sizeof(void **); 5930 5931 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 5932 ptr += nr_cpu_ids * sizeof(void **); 5933 5934 #endif /* CONFIG_FAIR_GROUP_SCHED */ 5935 #ifdef CONFIG_RT_GROUP_SCHED 5936 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 5937 ptr += nr_cpu_ids * sizeof(void **); 5938 5939 root_task_group.rt_rq = (struct rt_rq **)ptr; 5940 ptr += nr_cpu_ids * sizeof(void **); 5941 5942 #endif /* CONFIG_RT_GROUP_SCHED */ 5943 } 5944 #ifdef CONFIG_CPUMASK_OFFSTACK 5945 for_each_possible_cpu(i) { 5946 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( 5947 cpumask_size(), GFP_KERNEL, cpu_to_node(i)); 5948 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( 5949 cpumask_size(), GFP_KERNEL, cpu_to_node(i)); 5950 } 5951 #endif /* CONFIG_CPUMASK_OFFSTACK */ 5952 5953 init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); 5954 init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime()); 5955 5956 #ifdef CONFIG_SMP 5957 init_defrootdomain(); 5958 #endif 5959 5960 #ifdef CONFIG_RT_GROUP_SCHED 5961 init_rt_bandwidth(&root_task_group.rt_bandwidth, 5962 global_rt_period(), global_rt_runtime()); 5963 #endif /* CONFIG_RT_GROUP_SCHED */ 5964 5965 #ifdef CONFIG_CGROUP_SCHED 5966 task_group_cache = KMEM_CACHE(task_group, 0); 5967 5968 list_add(&root_task_group.list, &task_groups); 5969 INIT_LIST_HEAD(&root_task_group.children); 5970 INIT_LIST_HEAD(&root_task_group.siblings); 5971 autogroup_init(&init_task); 5972 #endif /* CONFIG_CGROUP_SCHED */ 5973 5974 for_each_possible_cpu(i) { 5975 struct rq *rq; 5976 5977 rq = cpu_rq(i); 5978 raw_spin_lock_init(&rq->lock); 5979 rq->nr_running = 0; 5980 rq->calc_load_active = 0; 5981 rq->calc_load_update = jiffies + LOAD_FREQ; 5982 init_cfs_rq(&rq->cfs); 5983 init_rt_rq(&rq->rt); 5984 init_dl_rq(&rq->dl); 5985 #ifdef CONFIG_FAIR_GROUP_SCHED 5986 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 5987 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 5988 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; 5989 /* 5990 * How much CPU bandwidth does root_task_group get? 5991 * 5992 * In case of task-groups formed thr' the cgroup filesystem, it 5993 * gets 100% of the CPU resources in the system. This overall 5994 * system CPU resource is divided among the tasks of 5995 * root_task_group and its child task-groups in a fair manner, 5996 * based on each entity's (task or task-group's) weight 5997 * (se->load.weight). 
5998 * 5999 * In other words, if root_task_group has 10 tasks of weight 6000 * 1024) and two child groups A0 and A1 (of weight 1024 each), 6001 * then A0's share of the CPU resource is: 6002 * 6003 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 6004 * 6005 * We achieve this by letting root_task_group's tasks sit 6006 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 6007 */ 6008 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 6009 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 6010 #endif /* CONFIG_FAIR_GROUP_SCHED */ 6011 6012 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6013 #ifdef CONFIG_RT_GROUP_SCHED 6014 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6015 #endif 6016 6017 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6018 rq->cpu_load[j] = 0; 6019 6020 #ifdef CONFIG_SMP 6021 rq->sd = NULL; 6022 rq->rd = NULL; 6023 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; 6024 rq->balance_callback = NULL; 6025 rq->active_balance = 0; 6026 rq->next_balance = jiffies; 6027 rq->push_cpu = 0; 6028 rq->cpu = i; 6029 rq->online = 0; 6030 rq->idle_stamp = 0; 6031 rq->avg_idle = 2*sysctl_sched_migration_cost; 6032 rq->max_idle_balance_cost = sysctl_sched_migration_cost; 6033 6034 INIT_LIST_HEAD(&rq->cfs_tasks); 6035 6036 rq_attach_root(rq, &def_root_domain); 6037 #ifdef CONFIG_NO_HZ_COMMON 6038 rq->last_load_update_tick = jiffies; 6039 rq->last_blocked_load_update_tick = jiffies; 6040 atomic_set(&rq->nohz_flags, 0); 6041 #endif 6042 #endif /* CONFIG_SMP */ 6043 hrtick_rq_init(rq); 6044 atomic_set(&rq->nr_iowait, 0); 6045 } 6046 6047 set_load_weight(&init_task, false); 6048 6049 /* 6050 * The boot idle thread does lazy MMU switching as well: 6051 */ 6052 mmgrab(&init_mm); 6053 enter_lazy_tlb(&init_mm, current); 6054 6055 /* 6056 * Make us the idle thread. Technically, schedule() should not be 6057 * called from this thread, however somewhere below it might be, 6058 * but because we are the idle thread, we just pick up running again 6059 * when this runqueue becomes "idle". 6060 */ 6061 init_idle(current, smp_processor_id()); 6062 6063 calc_load_update = jiffies + LOAD_FREQ; 6064 6065 #ifdef CONFIG_SMP 6066 idle_thread_set_boot_cpu(); 6067 #endif 6068 init_sched_fair_class(); 6069 6070 init_schedstats(); 6071 6072 scheduler_running = 1; 6073 } 6074 6075 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 6076 static inline int preempt_count_equals(int preempt_offset) 6077 { 6078 int nested = preempt_count() + rcu_preempt_depth(); 6079 6080 return (nested == preempt_offset); 6081 } 6082 6083 void __might_sleep(const char *file, int line, int preempt_offset) 6084 { 6085 /* 6086 * Blocking primitives will set (and therefore destroy) current->state, 6087 * since we will exit with TASK_RUNNING make sure we enter with it, 6088 * otherwise we will destroy state. 
6089 */ 6090 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, 6091 "do not call blocking ops when !TASK_RUNNING; " 6092 "state=%lx set at [<%p>] %pS\n", 6093 current->state, 6094 (void *)current->task_state_change, 6095 (void *)current->task_state_change); 6096 6097 ___might_sleep(file, line, preempt_offset); 6098 } 6099 EXPORT_SYMBOL(__might_sleep); 6100 6101 void ___might_sleep(const char *file, int line, int preempt_offset) 6102 { 6103 /* Ratelimiting timestamp: */ 6104 static unsigned long prev_jiffy; 6105 6106 unsigned long preempt_disable_ip; 6107 6108 /* WARN_ON_ONCE() by default, no rate limit required: */ 6109 rcu_sleep_check(); 6110 6111 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && 6112 !is_idle_task(current)) || 6113 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || 6114 oops_in_progress) 6115 return; 6116 6117 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6118 return; 6119 prev_jiffy = jiffies; 6120 6121 /* Save this before calling printk(), since that will clobber it: */ 6122 preempt_disable_ip = get_preempt_disable_ip(current); 6123 6124 printk(KERN_ERR 6125 "BUG: sleeping function called from invalid context at %s:%d\n", 6126 file, line); 6127 printk(KERN_ERR 6128 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 6129 in_atomic(), irqs_disabled(), 6130 current->pid, current->comm); 6131 6132 if (task_stack_end_corrupted(current)) 6133 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); 6134 6135 debug_show_held_locks(current); 6136 if (irqs_disabled()) 6137 print_irqtrace_events(current); 6138 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) 6139 && !preempt_count_equals(preempt_offset)) { 6140 pr_err("Preemption disabled at:"); 6141 print_ip_sym(preempt_disable_ip); 6142 pr_cont("\n"); 6143 } 6144 dump_stack(); 6145 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 6146 } 6147 EXPORT_SYMBOL(___might_sleep); 6148 #endif 6149 6150 #ifdef CONFIG_MAGIC_SYSRQ 6151 void normalize_rt_tasks(void) 6152 { 6153 struct task_struct *g, *p; 6154 struct sched_attr attr = { 6155 .sched_policy = SCHED_NORMAL, 6156 }; 6157 6158 read_lock(&tasklist_lock); 6159 for_each_process_thread(g, p) { 6160 /* 6161 * Only normalize user tasks: 6162 */ 6163 if (p->flags & PF_KTHREAD) 6164 continue; 6165 6166 p->se.exec_start = 0; 6167 schedstat_set(p->se.statistics.wait_start, 0); 6168 schedstat_set(p->se.statistics.sleep_start, 0); 6169 schedstat_set(p->se.statistics.block_start, 0); 6170 6171 if (!dl_task(p) && !rt_task(p)) { 6172 /* 6173 * Renice negative nice level userspace 6174 * tasks back to 0: 6175 */ 6176 if (task_nice(p) < 0) 6177 set_user_nice(p, 0); 6178 continue; 6179 } 6180 6181 __sched_setscheduler(p, &attr, false, false); 6182 } 6183 read_unlock(&tasklist_lock); 6184 } 6185 6186 #endif /* CONFIG_MAGIC_SYSRQ */ 6187 6188 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 6189 /* 6190 * These functions are only useful for the IA64 MCA handling, or kdb. 6191 * 6192 * They can only be called when the whole system has been 6193 * stopped - every CPU needs to be quiescent, and no scheduling 6194 * activity can take place. Using them for anything else would 6195 * be a serious bug, and as a result, they aren't even visible 6196 * under any other configuration. 6197 */ 6198 6199 /** 6200 * curr_task - return the current task for a given CPU. 6201 * @cpu: the processor in question. 6202 * 6203 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6204 * 6205 * Return: The current task for @cpu. 
6206 */ 6207 struct task_struct *curr_task(int cpu) 6208 { 6209 return cpu_curr(cpu); 6210 } 6211 6212 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 6213 6214 #ifdef CONFIG_IA64 6215 /** 6216 * set_curr_task - set the current task for a given CPU. 6217 * @cpu: the processor in question. 6218 * @p: the task pointer to set. 6219 * 6220 * Description: This function must only be used when non-maskable interrupts 6221 * are serviced on a separate stack. It allows the architecture to switch the 6222 * notion of the current task on a CPU in a non-blocking manner. This function 6223 * must be called with all CPU's synchronized, and interrupts disabled; the 6224 * caller must save the original value of the current task (see 6225 * curr_task() above) and restore that value before reenabling interrupts and 6226 * re-starting the system. 6227 * 6228 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6229 */ 6230 void ia64_set_curr_task(int cpu, struct task_struct *p) 6231 { 6232 cpu_curr(cpu) = p; 6233 } 6234 6235 #endif 6236 6237 #ifdef CONFIG_CGROUP_SCHED 6238 /* task_group_lock serializes the addition/removal of task groups */ 6239 static DEFINE_SPINLOCK(task_group_lock); 6240 6241 static void sched_free_group(struct task_group *tg) 6242 { 6243 free_fair_sched_group(tg); 6244 free_rt_sched_group(tg); 6245 autogroup_free(tg); 6246 kmem_cache_free(task_group_cache, tg); 6247 } 6248 6249 /* allocate runqueue etc for a new task group */ 6250 struct task_group *sched_create_group(struct task_group *parent) 6251 { 6252 struct task_group *tg; 6253 6254 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); 6255 if (!tg) 6256 return ERR_PTR(-ENOMEM); 6257 6258 if (!alloc_fair_sched_group(tg, parent)) 6259 goto err; 6260 6261 if (!alloc_rt_sched_group(tg, parent)) 6262 goto err; 6263 6264 return tg; 6265 6266 err: 6267 sched_free_group(tg); 6268 return ERR_PTR(-ENOMEM); 6269 } 6270 6271 void sched_online_group(struct task_group *tg, struct task_group *parent) 6272 { 6273 unsigned long flags; 6274 6275 spin_lock_irqsave(&task_group_lock, flags); 6276 list_add_rcu(&tg->list, &task_groups); 6277 6278 /* Root should already exist: */ 6279 WARN_ON(!parent); 6280 6281 tg->parent = parent; 6282 INIT_LIST_HEAD(&tg->children); 6283 list_add_rcu(&tg->siblings, &parent->children); 6284 spin_unlock_irqrestore(&task_group_lock, flags); 6285 6286 online_fair_sched_group(tg); 6287 } 6288 6289 /* rcu callback to free various structures associated with a task group */ 6290 static void sched_free_group_rcu(struct rcu_head *rhp) 6291 { 6292 /* Now it should be safe to free those cfs_rqs: */ 6293 sched_free_group(container_of(rhp, struct task_group, rcu)); 6294 } 6295 6296 void sched_destroy_group(struct task_group *tg) 6297 { 6298 /* Wait for possible concurrent references to cfs_rqs to complete: */ 6299 call_rcu(&tg->rcu, sched_free_group_rcu); 6300 } 6301 6302 void sched_offline_group(struct task_group *tg) 6303 { 6304 unsigned long flags; 6305 6306 /* End participation in shares distribution: */ 6307 unregister_fair_sched_group(tg); 6308 6309 spin_lock_irqsave(&task_group_lock, flags); 6310 list_del_rcu(&tg->list); 6311 list_del_rcu(&tg->siblings); 6312 spin_unlock_irqrestore(&task_group_lock, flags); 6313 } 6314 6315 static void sched_change_group(struct task_struct *tsk, int type) 6316 { 6317 struct task_group *tg; 6318 6319 /* 6320 * All callers are synchronized by task_rq_lock(); we do not use RCU 6321 * which is pointless here.
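 * (Both callers below - sched_move_task() and cpu_cgroup_fork() - hold
 * p->pi_lock and rq->lock via task_rq_lock(), and that same lock is what
 * serializes changes to tsk->sched_task_group, so an RCU read-side
 * critical section would add nothing here.)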
Thus, we pass "true" to task_css_check() 6322 * to prevent lockdep warnings. 6323 */ 6324 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), 6325 struct task_group, css); 6326 tg = autogroup_task_group(tsk, tg); 6327 tsk->sched_task_group = tg; 6328 6329 #ifdef CONFIG_FAIR_GROUP_SCHED 6330 if (tsk->sched_class->task_change_group) 6331 tsk->sched_class->task_change_group(tsk, type); 6332 else 6333 #endif 6334 set_task_rq(tsk, task_cpu(tsk)); 6335 } 6336 6337 /* 6338 * Change task's runqueue when it moves between groups. 6339 * 6340 * The caller of this function should have put the task in its new group by 6341 * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect 6342 * its new group. 6343 */ 6344 void sched_move_task(struct task_struct *tsk) 6345 { 6346 int queued, running, queue_flags = 6347 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 6348 struct rq_flags rf; 6349 struct rq *rq; 6350 6351 rq = task_rq_lock(tsk, &rf); 6352 update_rq_clock(rq); 6353 6354 running = task_current(rq, tsk); 6355 queued = task_on_rq_queued(tsk); 6356 6357 if (queued) 6358 dequeue_task(rq, tsk, queue_flags); 6359 if (running) 6360 put_prev_task(rq, tsk); 6361 6362 sched_change_group(tsk, TASK_MOVE_GROUP); 6363 6364 if (queued) 6365 enqueue_task(rq, tsk, queue_flags); 6366 if (running) 6367 set_curr_task(rq, tsk); 6368 6369 task_rq_unlock(rq, tsk, &rf); 6370 } 6371 6372 static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 6373 { 6374 return css ? container_of(css, struct task_group, css) : NULL; 6375 } 6376 6377 static struct cgroup_subsys_state * 6378 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 6379 { 6380 struct task_group *parent = css_tg(parent_css); 6381 struct task_group *tg; 6382 6383 if (!parent) { 6384 /* This is early initialization for the top cgroup */ 6385 return &root_task_group.css; 6386 } 6387 6388 tg = sched_create_group(parent); 6389 if (IS_ERR(tg)) 6390 return ERR_PTR(-ENOMEM); 6391 6392 return &tg->css; 6393 } 6394 6395 /* Expose task group only after completing cgroup initialization */ 6396 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) 6397 { 6398 struct task_group *tg = css_tg(css); 6399 struct task_group *parent = css_tg(css->parent); 6400 6401 if (parent) 6402 sched_online_group(tg, parent); 6403 return 0; 6404 } 6405 6406 static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) 6407 { 6408 struct task_group *tg = css_tg(css); 6409 6410 sched_offline_group(tg); 6411 } 6412 6413 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) 6414 { 6415 struct task_group *tg = css_tg(css); 6416 6417 /* 6418 * Relies on the RCU grace period between css_released() and this. 6419 */ 6420 sched_free_group(tg); 6421 } 6422 6423 /* 6424 * This is called before wake_up_new_task(), therefore we really only 6425 * have to set its group bits, all the other stuff does not apply. 
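 *
 * Rough sketch of the ordering relied upon here (illustrative, following
 * the usual fork path):
 *
 *	copy_process()
 *	    sched_fork()                    p->state = TASK_NEW
 *	    cgroup_post_fork()
 *	        cpu_cgroup_fork()           sched_change_group(TASK_SET_GROUP)
 *	wake_up_new_task()                  new task becomes runnable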
6426 */ 6427 static void cpu_cgroup_fork(struct task_struct *task) 6428 { 6429 struct rq_flags rf; 6430 struct rq *rq; 6431 6432 rq = task_rq_lock(task, &rf); 6433 6434 update_rq_clock(rq); 6435 sched_change_group(task, TASK_SET_GROUP); 6436 6437 task_rq_unlock(rq, task, &rf); 6438 } 6439 6440 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) 6441 { 6442 struct task_struct *task; 6443 struct cgroup_subsys_state *css; 6444 int ret = 0; 6445 6446 cgroup_taskset_for_each(task, css, tset) { 6447 #ifdef CONFIG_RT_GROUP_SCHED 6448 if (!sched_rt_can_attach(css_tg(css), task)) 6449 return -EINVAL; 6450 #else 6451 /* We don't support RT-tasks being in separate groups */ 6452 if (task->sched_class != &fair_sched_class) 6453 return -EINVAL; 6454 #endif 6455 /* 6456 * Serialize against wake_up_new_task() such that if its 6457 * running, we're sure to observe its full state. 6458 */ 6459 raw_spin_lock_irq(&task->pi_lock); 6460 /* 6461 * Avoid calling sched_move_task() before wake_up_new_task() 6462 * has happened. This would lead to problems with PELT, due to 6463 * move wanting to detach+attach while we're not attached yet. 6464 */ 6465 if (task->state == TASK_NEW) 6466 ret = -EINVAL; 6467 raw_spin_unlock_irq(&task->pi_lock); 6468 6469 if (ret) 6470 break; 6471 } 6472 return ret; 6473 } 6474 6475 static void cpu_cgroup_attach(struct cgroup_taskset *tset) 6476 { 6477 struct task_struct *task; 6478 struct cgroup_subsys_state *css; 6479 6480 cgroup_taskset_for_each(task, css, tset) 6481 sched_move_task(task); 6482 } 6483 6484 #ifdef CONFIG_FAIR_GROUP_SCHED 6485 static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 6486 struct cftype *cftype, u64 shareval) 6487 { 6488 return sched_group_set_shares(css_tg(css), scale_load(shareval)); 6489 } 6490 6491 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, 6492 struct cftype *cft) 6493 { 6494 struct task_group *tg = css_tg(css); 6495 6496 return (u64) scale_load_down(tg->shares); 6497 } 6498 6499 #ifdef CONFIG_CFS_BANDWIDTH 6500 static DEFINE_MUTEX(cfs_constraints_mutex); 6501 6502 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 6503 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 6504 6505 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 6506 6507 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 6508 { 6509 int i, ret = 0, runtime_enabled, runtime_was_enabled; 6510 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 6511 6512 if (tg == &root_task_group) 6513 return -EINVAL; 6514 6515 /* 6516 * Ensure we have at some amount of bandwidth every period. This is 6517 * to prevent reaching a state of large arrears when throttled via 6518 * entity_tick() resulting in prolonged exit starvation. 6519 */ 6520 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 6521 return -EINVAL; 6522 6523 /* 6524 * Likewise, bound things on the otherside by preventing insane quota 6525 * periods. This also allows us to normalize in computing quota 6526 * feasibility. 6527 */ 6528 if (period > max_cfs_quota_period) 6529 return -EINVAL; 6530 6531 /* 6532 * Prevent race between setting of cfs_rq->runtime_enabled and 6533 * unthrottle_offline_cfs_rqs(). 
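 *
 * (Holding get_online_cpus() across the update keeps the
 * for_each_online_cpu() walk below stable, so a CPU cannot go offline -
 * and have its cfs_rqs unthrottled by the hotplug path - while
 * runtime_enabled is being rewritten.)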
6534 */ 6535 get_online_cpus(); 6536 mutex_lock(&cfs_constraints_mutex); 6537 ret = __cfs_schedulable(tg, period, quota); 6538 if (ret) 6539 goto out_unlock; 6540 6541 runtime_enabled = quota != RUNTIME_INF; 6542 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 6543 /* 6544 * If we need to toggle cfs_bandwidth_used, off->on must occur 6545 * before making related changes, and on->off must occur afterwards 6546 */ 6547 if (runtime_enabled && !runtime_was_enabled) 6548 cfs_bandwidth_usage_inc(); 6549 raw_spin_lock_irq(&cfs_b->lock); 6550 cfs_b->period = ns_to_ktime(period); 6551 cfs_b->quota = quota; 6552 6553 __refill_cfs_bandwidth_runtime(cfs_b); 6554 6555 /* Restart the period timer (if active) to handle new period expiry: */ 6556 if (runtime_enabled) 6557 start_cfs_bandwidth(cfs_b); 6558 6559 raw_spin_unlock_irq(&cfs_b->lock); 6560 6561 for_each_online_cpu(i) { 6562 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 6563 struct rq *rq = cfs_rq->rq; 6564 struct rq_flags rf; 6565 6566 rq_lock_irq(rq, &rf); 6567 cfs_rq->runtime_enabled = runtime_enabled; 6568 cfs_rq->runtime_remaining = 0; 6569 6570 if (cfs_rq->throttled) 6571 unthrottle_cfs_rq(cfs_rq); 6572 rq_unlock_irq(rq, &rf); 6573 } 6574 if (runtime_was_enabled && !runtime_enabled) 6575 cfs_bandwidth_usage_dec(); 6576 out_unlock: 6577 mutex_unlock(&cfs_constraints_mutex); 6578 put_online_cpus(); 6579 6580 return ret; 6581 } 6582 6583 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 6584 { 6585 u64 quota, period; 6586 6587 period = ktime_to_ns(tg->cfs_bandwidth.period); 6588 if (cfs_quota_us < 0) 6589 quota = RUNTIME_INF; 6590 else 6591 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 6592 6593 return tg_set_cfs_bandwidth(tg, period, quota); 6594 } 6595 6596 long tg_get_cfs_quota(struct task_group *tg) 6597 { 6598 u64 quota_us; 6599 6600 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 6601 return -1; 6602 6603 quota_us = tg->cfs_bandwidth.quota; 6604 do_div(quota_us, NSEC_PER_USEC); 6605 6606 return quota_us; 6607 } 6608 6609 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 6610 { 6611 u64 quota, period; 6612 6613 period = (u64)cfs_period_us * NSEC_PER_USEC; 6614 quota = tg->cfs_bandwidth.quota; 6615 6616 return tg_set_cfs_bandwidth(tg, period, quota); 6617 } 6618 6619 long tg_get_cfs_period(struct task_group *tg) 6620 { 6621 u64 cfs_period_us; 6622 6623 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 6624 do_div(cfs_period_us, NSEC_PER_USEC); 6625 6626 return cfs_period_us; 6627 } 6628 6629 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, 6630 struct cftype *cft) 6631 { 6632 return tg_get_cfs_quota(css_tg(css)); 6633 } 6634 6635 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, 6636 struct cftype *cftype, s64 cfs_quota_us) 6637 { 6638 return tg_set_cfs_quota(css_tg(css), cfs_quota_us); 6639 } 6640 6641 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, 6642 struct cftype *cft) 6643 { 6644 return tg_get_cfs_period(css_tg(css)); 6645 } 6646 6647 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, 6648 struct cftype *cftype, u64 cfs_period_us) 6649 { 6650 return tg_set_cfs_period(css_tg(css), cfs_period_us); 6651 } 6652 6653 struct cfs_schedulable_data { 6654 struct task_group *tg; 6655 u64 period, quota; 6656 }; 6657 6658 /* 6659 * normalize group quota/period to be quota/max_period 6660 * note: units are usecs 6661 */ 6662 static u64 normalize_cfs_quota(struct task_group *tg, 6663 struct cfs_schedulable_data *d) 6664 { 6665 u64 quota, period; 6666 
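	/*
	 * Worked example (numbers are illustrative, not from this file):
	 * with period = 100000us and quota = 50000us, to_ratio() below
	 * returns (50000 << BW_SHIFT) / 100000, i.e. half of BW_UNIT,
	 * which tg_cfs_schedulable_down() then compares against the
	 * parent's hierarchical_quota expressed in the same fixed-point
	 * unit.
	 */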
6667 if (tg == d->tg) { 6668 period = d->period; 6669 quota = d->quota; 6670 } else { 6671 period = tg_get_cfs_period(tg); 6672 quota = tg_get_cfs_quota(tg); 6673 } 6674 6675 /* note: these should typically be equivalent */ 6676 if (quota == RUNTIME_INF || quota == -1) 6677 return RUNTIME_INF; 6678 6679 return to_ratio(period, quota); 6680 } 6681 6682 static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 6683 { 6684 struct cfs_schedulable_data *d = data; 6685 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 6686 s64 quota = 0, parent_quota = -1; 6687 6688 if (!tg->parent) { 6689 quota = RUNTIME_INF; 6690 } else { 6691 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 6692 6693 quota = normalize_cfs_quota(tg, d); 6694 parent_quota = parent_b->hierarchical_quota; 6695 6696 /* 6697 * Ensure max(child_quota) <= parent_quota. On cgroup2, 6698 * always take the min. On cgroup1, only inherit when no 6699 * limit is set: 6700 */ 6701 if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { 6702 quota = min(quota, parent_quota); 6703 } else { 6704 if (quota == RUNTIME_INF) 6705 quota = parent_quota; 6706 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 6707 return -EINVAL; 6708 } 6709 } 6710 cfs_b->hierarchical_quota = quota; 6711 6712 return 0; 6713 } 6714 6715 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) 6716 { 6717 int ret; 6718 struct cfs_schedulable_data data = { 6719 .tg = tg, 6720 .period = period, 6721 .quota = quota, 6722 }; 6723 6724 if (quota != RUNTIME_INF) { 6725 do_div(data.period, NSEC_PER_USEC); 6726 do_div(data.quota, NSEC_PER_USEC); 6727 } 6728 6729 rcu_read_lock(); 6730 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); 6731 rcu_read_unlock(); 6732 6733 return ret; 6734 } 6735 6736 static int cpu_cfs_stat_show(struct seq_file *sf, void *v) 6737 { 6738 struct task_group *tg = css_tg(seq_css(sf)); 6739 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 6740 6741 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); 6742 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); 6743 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); 6744 6745 if (schedstat_enabled() && tg != &root_task_group) { 6746 u64 ws = 0; 6747 int i; 6748 6749 for_each_possible_cpu(i) 6750 ws += schedstat_val(tg->se[i]->statistics.wait_sum); 6751 6752 seq_printf(sf, "wait_sum %llu\n", ws); 6753 } 6754 6755 return 0; 6756 } 6757 #endif /* CONFIG_CFS_BANDWIDTH */ 6758 #endif /* CONFIG_FAIR_GROUP_SCHED */ 6759 6760 #ifdef CONFIG_RT_GROUP_SCHED 6761 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, 6762 struct cftype *cft, s64 val) 6763 { 6764 return sched_group_set_rt_runtime(css_tg(css), val); 6765 } 6766 6767 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, 6768 struct cftype *cft) 6769 { 6770 return sched_group_rt_runtime(css_tg(css)); 6771 } 6772 6773 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, 6774 struct cftype *cftype, u64 rt_period_us) 6775 { 6776 return sched_group_set_rt_period(css_tg(css), rt_period_us); 6777 } 6778 6779 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, 6780 struct cftype *cft) 6781 { 6782 return sched_group_rt_period(css_tg(css)); 6783 } 6784 #endif /* CONFIG_RT_GROUP_SCHED */ 6785 6786 static struct cftype cpu_legacy_files[] = { 6787 #ifdef CONFIG_FAIR_GROUP_SCHED 6788 { 6789 .name = "shares", 6790 .read_u64 = cpu_shares_read_u64, 6791 .write_u64 = cpu_shares_write_u64, 6792 }, 6793 #endif 6794 #ifdef CONFIG_CFS_BANDWIDTH 6795 { 6796 
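	/*
	 * Illustrative cgroup-v1 usage of the bandwidth knobs defined here
	 * (the mount point /sys/fs/cgroup/cpu and the group name "grp" are
	 * assumptions): cap a group at half a CPU by granting 50ms of
	 * runtime every 100ms period:
	 *
	 *	echo 100000 > /sys/fs/cgroup/cpu/grp/cpu.cfs_period_us
	 *	echo  50000 > /sys/fs/cgroup/cpu/grp/cpu.cfs_quota_us
	 *
	 * Writing -1 to cpu.cfs_quota_us removes the limit (RUNTIME_INF),
	 * and cpu.stat reports nr_periods/nr_throttled/throttled_time.
	 */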
.name = "cfs_quota_us", 6797 .read_s64 = cpu_cfs_quota_read_s64, 6798 .write_s64 = cpu_cfs_quota_write_s64, 6799 }, 6800 { 6801 .name = "cfs_period_us", 6802 .read_u64 = cpu_cfs_period_read_u64, 6803 .write_u64 = cpu_cfs_period_write_u64, 6804 }, 6805 { 6806 .name = "stat", 6807 .seq_show = cpu_cfs_stat_show, 6808 }, 6809 #endif 6810 #ifdef CONFIG_RT_GROUP_SCHED 6811 { 6812 .name = "rt_runtime_us", 6813 .read_s64 = cpu_rt_runtime_read, 6814 .write_s64 = cpu_rt_runtime_write, 6815 }, 6816 { 6817 .name = "rt_period_us", 6818 .read_u64 = cpu_rt_period_read_uint, 6819 .write_u64 = cpu_rt_period_write_uint, 6820 }, 6821 #endif 6822 { } /* Terminate */ 6823 }; 6824 6825 static int cpu_extra_stat_show(struct seq_file *sf, 6826 struct cgroup_subsys_state *css) 6827 { 6828 #ifdef CONFIG_CFS_BANDWIDTH 6829 { 6830 struct task_group *tg = css_tg(css); 6831 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 6832 u64 throttled_usec; 6833 6834 throttled_usec = cfs_b->throttled_time; 6835 do_div(throttled_usec, NSEC_PER_USEC); 6836 6837 seq_printf(sf, "nr_periods %d\n" 6838 "nr_throttled %d\n" 6839 "throttled_usec %llu\n", 6840 cfs_b->nr_periods, cfs_b->nr_throttled, 6841 throttled_usec); 6842 } 6843 #endif 6844 return 0; 6845 } 6846 6847 #ifdef CONFIG_FAIR_GROUP_SCHED 6848 static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, 6849 struct cftype *cft) 6850 { 6851 struct task_group *tg = css_tg(css); 6852 u64 weight = scale_load_down(tg->shares); 6853 6854 return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); 6855 } 6856 6857 static int cpu_weight_write_u64(struct cgroup_subsys_state *css, 6858 struct cftype *cft, u64 weight) 6859 { 6860 /* 6861 * cgroup weight knobs should use the common MIN, DFL and MAX 6862 * values which are 1, 100 and 10000 respectively. While it loses 6863 * a bit of range on both ends, it maps pretty well onto the shares 6864 * value used by scheduler and the round-trip conversions preserve 6865 * the original value over the entire range. 
6866 */ 6867 if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) 6868 return -ERANGE; 6869 6870 weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); 6871 6872 return sched_group_set_shares(css_tg(css), scale_load(weight)); 6873 } 6874 6875 static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, 6876 struct cftype *cft) 6877 { 6878 unsigned long weight = scale_load_down(css_tg(css)->shares); 6879 int last_delta = INT_MAX; 6880 int prio, delta; 6881 6882 /* find the closest nice value to the current weight */ 6883 for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) { 6884 delta = abs(sched_prio_to_weight[prio] - weight); 6885 if (delta >= last_delta) 6886 break; 6887 last_delta = delta; 6888 } 6889 6890 return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO); 6891 } 6892 6893 static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, 6894 struct cftype *cft, s64 nice) 6895 { 6896 unsigned long weight; 6897 int idx; 6898 6899 if (nice < MIN_NICE || nice > MAX_NICE) 6900 return -ERANGE; 6901 6902 idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO; 6903 idx = array_index_nospec(idx, 40); 6904 weight = sched_prio_to_weight[idx]; 6905 6906 return sched_group_set_shares(css_tg(css), scale_load(weight)); 6907 } 6908 #endif 6909 6910 static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, 6911 long period, long quota) 6912 { 6913 if (quota < 0) 6914 seq_puts(sf, "max"); 6915 else 6916 seq_printf(sf, "%ld", quota); 6917 6918 seq_printf(sf, " %ld\n", period); 6919 } 6920 6921 /* caller should put the current value in *@periodp before calling */ 6922 static int __maybe_unused cpu_period_quota_parse(char *buf, 6923 u64 *periodp, u64 *quotap) 6924 { 6925 char tok[21]; /* U64_MAX */ 6926 6927 if (!sscanf(buf, "%s %llu", tok, periodp)) 6928 return -EINVAL; 6929 6930 *periodp *= NSEC_PER_USEC; 6931 6932 if (sscanf(tok, "%llu", quotap)) 6933 *quotap *= NSEC_PER_USEC; 6934 else if (!strcmp(tok, "max")) 6935 *quotap = RUNTIME_INF; 6936 else 6937 return -EINVAL; 6938 6939 return 0; 6940 } 6941 6942 #ifdef CONFIG_CFS_BANDWIDTH 6943 static int cpu_max_show(struct seq_file *sf, void *v) 6944 { 6945 struct task_group *tg = css_tg(seq_css(sf)); 6946 6947 cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); 6948 return 0; 6949 } 6950 6951 static ssize_t cpu_max_write(struct kernfs_open_file *of, 6952 char *buf, size_t nbytes, loff_t off) 6953 { 6954 struct task_group *tg = css_tg(of_css(of)); 6955 u64 period = tg_get_cfs_period(tg); 6956 u64 quota; 6957 int ret; 6958 6959 ret = cpu_period_quota_parse(buf, &period, "a); 6960 if (!ret) 6961 ret = tg_set_cfs_bandwidth(tg, period, quota); 6962 return ret ?: nbytes; 6963 } 6964 #endif 6965 6966 static struct cftype cpu_files[] = { 6967 #ifdef CONFIG_FAIR_GROUP_SCHED 6968 { 6969 .name = "weight", 6970 .flags = CFTYPE_NOT_ON_ROOT, 6971 .read_u64 = cpu_weight_read_u64, 6972 .write_u64 = cpu_weight_write_u64, 6973 }, 6974 { 6975 .name = "weight.nice", 6976 .flags = CFTYPE_NOT_ON_ROOT, 6977 .read_s64 = cpu_weight_nice_read_s64, 6978 .write_s64 = cpu_weight_nice_write_s64, 6979 }, 6980 #endif 6981 #ifdef CONFIG_CFS_BANDWIDTH 6982 { 6983 .name = "max", 6984 .flags = CFTYPE_NOT_ON_ROOT, 6985 .seq_show = cpu_max_show, 6986 .write = cpu_max_write, 6987 }, 6988 #endif 6989 { } /* terminate */ 6990 }; 6991 6992 struct cgroup_subsys cpu_cgrp_subsys = { 6993 .css_alloc = cpu_cgroup_css_alloc, 6994 .css_online = cpu_cgroup_css_online, 6995 .css_released = cpu_cgroup_css_released, 6996 .css_free = cpu_cgroup_css_free, 
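	/*
	 * Illustrative cgroup-v2 usage of the cpu_files interface above,
	 * registered via .dfl_cftypes below (the mount point /sys/fs/cgroup
	 * and the group name "grp" are assumptions): cap a group at 2.5
	 * CPUs and double its relative weight:
	 *
	 *	echo "250000 100000" > /sys/fs/cgroup/grp/cpu.max
	 *	echo 200 > /sys/fs/cgroup/grp/cpu.weight
	 *
	 * Writing "max" as the first field of cpu.max removes the bandwidth
	 * limit; cpu_extra_stat_show() contributes the throttling counters
	 * to the group's cpu.stat.
	 */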
6997 .css_extra_stat_show = cpu_extra_stat_show, 6998 .fork = cpu_cgroup_fork, 6999 .can_attach = cpu_cgroup_can_attach, 7000 .attach = cpu_cgroup_attach, 7001 .legacy_cftypes = cpu_legacy_files, 7002 .dfl_cftypes = cpu_files, 7003 .early_init = true, 7004 .threaded = true, 7005 }; 7006 7007 #endif /* CONFIG_CGROUP_SCHED */ 7008 7009 void dump_cpu_task(int cpu) 7010 { 7011 pr_info("Task dump for CPU %d:\n", cpu); 7012 sched_show_task(cpu_curr(cpu)); 7013 } 7014 7015 /* 7016 * Nice levels are multiplicative, with a gentle 10% change for every 7017 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 7018 * nice 1, it will get ~10% less CPU time than another CPU-bound task 7019 * that remained on nice 0. 7020 * 7021 * The "10% effect" is relative and cumulative: from _any_ nice level, 7022 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 7023 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 7024 * If a task goes up by ~10% and another task goes down by ~10% then 7025 * the relative distance between them is ~25%.) 7026 */ 7027 const int sched_prio_to_weight[40] = { 7028 /* -20 */ 88761, 71755, 56483, 46273, 36291, 7029 /* -15 */ 29154, 23254, 18705, 14949, 11916, 7030 /* -10 */ 9548, 7620, 6100, 4904, 3906, 7031 /* -5 */ 3121, 2501, 1991, 1586, 1277, 7032 /* 0 */ 1024, 820, 655, 526, 423, 7033 /* 5 */ 335, 272, 215, 172, 137, 7034 /* 10 */ 110, 87, 70, 56, 45, 7035 /* 15 */ 36, 29, 23, 18, 15, 7036 }; 7037 7038 /* 7039 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated. 7040 * 7041 * In cases where the weight does not change often, we can use the 7042 * precalculated inverse to speed up arithmetics by turning divisions 7043 * into multiplications: 7044 */ 7045 const u32 sched_prio_to_wmult[40] = { 7046 /* -20 */ 48388, 59856, 76040, 92818, 118348, 7047 /* -15 */ 147320, 184698, 229616, 287308, 360437, 7048 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 7049 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 7050 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 7051 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 7052 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 7053 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 7054 }; 7055 7056 #undef CREATE_TRACE_POINTS 7057
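/*
 * Worked example for the two tables above (derived from the table values,
 * not an additional data source): a nice 0 task (weight 1024) competing
 * with a nice 5 task (weight 335) gets 1024 / (1024 + 335) ~= 75% of the
 * CPU, since five nice levels compound to roughly 1.25^5 ~= 3x the weight.
 * The inverse table keeps weight * wmult ~= 2^32; for nice 0,
 * 1024 * 4194304 == 2^32 exactly, which is what lets the fair class turn
 * a division by the weight into a multiply and shift.
 */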