1 /* 2 * kernel/sched/core.c 3 * 4 * Kernel scheduler and related syscalls 5 * 6 * Copyright (C) 1991-2002 Linus Torvalds 7 * 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 9 * make semaphores SMP safe 10 * 1998-11-19 Implemented schedule_timeout() and related stuff 11 * by Andrea Arcangeli 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 13 * hybrid priority-list and round-robin design with 14 * an array-switch method of distributing timeslices 15 * and per-CPU runqueues. Cleanups and useful suggestions 16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 17 * 2003-09-03 Interactivity tuning by Con Kolivas. 18 * 2004-04-02 Scheduler domains code by Nick Piggin 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 20 * fair scheduling design by Con Kolivas. 21 * 2007-05-05 Load balancing (smp-nice) and other improvements 22 * by Peter Williams 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 26 * Thomas Gleixner, Mike Kravetz 27 */ 28 29 #include <linux/mm.h> 30 #include <linux/module.h> 31 #include <linux/nmi.h> 32 #include <linux/init.h> 33 #include <linux/uaccess.h> 34 #include <linux/highmem.h> 35 #include <asm/mmu_context.h> 36 #include <linux/interrupt.h> 37 #include <linux/capability.h> 38 #include <linux/completion.h> 39 #include <linux/kernel_stat.h> 40 #include <linux/debug_locks.h> 41 #include <linux/perf_event.h> 42 #include <linux/security.h> 43 #include <linux/notifier.h> 44 #include <linux/profile.h> 45 #include <linux/freezer.h> 46 #include <linux/vmalloc.h> 47 #include <linux/blkdev.h> 48 #include <linux/delay.h> 49 #include <linux/pid_namespace.h> 50 #include <linux/smp.h> 51 #include <linux/threads.h> 52 #include <linux/timer.h> 53 #include <linux/rcupdate.h> 54 #include <linux/cpu.h> 55 #include <linux/cpuset.h> 56 #include <linux/percpu.h> 57 #include <linux/proc_fs.h> 58 #include <linux/seq_file.h> 59 #include <linux/sysctl.h> 60 #include <linux/syscalls.h> 61 #include <linux/times.h> 62 #include <linux/tsacct_kern.h> 63 #include <linux/kprobes.h> 64 #include <linux/delayacct.h> 65 #include <linux/unistd.h> 66 #include <linux/pagemap.h> 67 #include <linux/hrtimer.h> 68 #include <linux/tick.h> 69 #include <linux/debugfs.h> 70 #include <linux/ctype.h> 71 #include <linux/ftrace.h> 72 #include <linux/slab.h> 73 #include <linux/init_task.h> 74 #include <linux/binfmts.h> 75 #include <linux/context_tracking.h> 76 #include <linux/compiler.h> 77 78 #include <asm/switch_to.h> 79 #include <asm/tlb.h> 80 #include <asm/irq_regs.h> 81 #include <asm/mutex.h> 82 #ifdef CONFIG_PARAVIRT 83 #include <asm/paravirt.h> 84 #endif 85 86 #include "sched.h" 87 #include "../workqueue_internal.h" 88 #include "../smpboot.h" 89 90 #define CREATE_TRACE_POINTS 91 #include <trace/events/sched.h> 92 93 DEFINE_MUTEX(sched_domains_mutex); 94 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 95 96 static void update_rq_clock_task(struct rq *rq, s64 delta); 97 98 void update_rq_clock(struct rq *rq) 99 { 100 s64 delta; 101 102 lockdep_assert_held(&rq->lock); 103 104 if (rq->clock_skip_update & RQCF_ACT_SKIP) 105 return; 106 107 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 108 if (delta < 0) 109 return; 110 rq->clock += delta; 111 update_rq_clock_task(rq, delta); 112 } 113 114 /* 115 * Debugging: various feature bits 116 */ 117 118 #define SCHED_FEAT(name, enabled) \ 119 
(1UL << __SCHED_FEAT_##name) * enabled | 120 121 const_debug unsigned int sysctl_sched_features = 122 #include "features.h" 123 0; 124 125 #undef SCHED_FEAT 126 127 #ifdef CONFIG_SCHED_DEBUG 128 #define SCHED_FEAT(name, enabled) \ 129 #name , 130 131 static const char * const sched_feat_names[] = { 132 #include "features.h" 133 }; 134 135 #undef SCHED_FEAT 136 137 static int sched_feat_show(struct seq_file *m, void *v) 138 { 139 int i; 140 141 for (i = 0; i < __SCHED_FEAT_NR; i++) { 142 if (!(sysctl_sched_features & (1UL << i))) 143 seq_puts(m, "NO_"); 144 seq_printf(m, "%s ", sched_feat_names[i]); 145 } 146 seq_puts(m, "\n"); 147 148 return 0; 149 } 150 151 #ifdef HAVE_JUMP_LABEL 152 153 #define jump_label_key__true STATIC_KEY_INIT_TRUE 154 #define jump_label_key__false STATIC_KEY_INIT_FALSE 155 156 #define SCHED_FEAT(name, enabled) \ 157 jump_label_key__##enabled , 158 159 struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { 160 #include "features.h" 161 }; 162 163 #undef SCHED_FEAT 164 165 static void sched_feat_disable(int i) 166 { 167 if (static_key_enabled(&sched_feat_keys[i])) 168 static_key_slow_dec(&sched_feat_keys[i]); 169 } 170 171 static void sched_feat_enable(int i) 172 { 173 if (!static_key_enabled(&sched_feat_keys[i])) 174 static_key_slow_inc(&sched_feat_keys[i]); 175 } 176 #else 177 static void sched_feat_disable(int i) { }; 178 static void sched_feat_enable(int i) { }; 179 #endif /* HAVE_JUMP_LABEL */ 180 181 static int sched_feat_set(char *cmp) 182 { 183 int i; 184 int neg = 0; 185 186 if (strncmp(cmp, "NO_", 3) == 0) { 187 neg = 1; 188 cmp += 3; 189 } 190 191 for (i = 0; i < __SCHED_FEAT_NR; i++) { 192 if (strcmp(cmp, sched_feat_names[i]) == 0) { 193 if (neg) { 194 sysctl_sched_features &= ~(1UL << i); 195 sched_feat_disable(i); 196 } else { 197 sysctl_sched_features |= (1UL << i); 198 sched_feat_enable(i); 199 } 200 break; 201 } 202 } 203 204 return i; 205 } 206 207 static ssize_t 208 sched_feat_write(struct file *filp, const char __user *ubuf, 209 size_t cnt, loff_t *ppos) 210 { 211 char buf[64]; 212 char *cmp; 213 int i; 214 struct inode *inode; 215 216 if (cnt > 63) 217 cnt = 63; 218 219 if (copy_from_user(&buf, ubuf, cnt)) 220 return -EFAULT; 221 222 buf[cnt] = 0; 223 cmp = strstrip(buf); 224 225 /* Ensure the static_key remains in a consistent state */ 226 inode = file_inode(filp); 227 mutex_lock(&inode->i_mutex); 228 i = sched_feat_set(cmp); 229 mutex_unlock(&inode->i_mutex); 230 if (i == __SCHED_FEAT_NR) 231 return -EINVAL; 232 233 *ppos += cnt; 234 235 return cnt; 236 } 237 238 static int sched_feat_open(struct inode *inode, struct file *filp) 239 { 240 return single_open(filp, sched_feat_show, NULL); 241 } 242 243 static const struct file_operations sched_feat_fops = { 244 .open = sched_feat_open, 245 .write = sched_feat_write, 246 .read = seq_read, 247 .llseek = seq_lseek, 248 .release = single_release, 249 }; 250 251 static __init int sched_init_debug(void) 252 { 253 debugfs_create_file("sched_features", 0644, NULL, NULL, 254 &sched_feat_fops); 255 256 return 0; 257 } 258 late_initcall(sched_init_debug); 259 #endif /* CONFIG_SCHED_DEBUG */ 260 261 /* 262 * Number of tasks to iterate in a single balance run. 263 * Limited because this is done with IRQs disabled. 264 */ 265 const_debug unsigned int sysctl_sched_nr_migrate = 32; 266 267 /* 268 * period over which we average the RT time consumption, measured 269 * in ms. 
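 * (Consumed via sched_avg_period(); sched_avg_update() further below decays
 * rq->rt_avg once per such period.)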
270 * 271 * default: 1s 272 */ 273 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; 274 275 /* 276 * period over which we measure -rt task cpu usage in us. 277 * default: 1s 278 */ 279 unsigned int sysctl_sched_rt_period = 1000000; 280 281 __read_mostly int scheduler_running; 282 283 /* 284 * part of the period that we allow rt tasks to run in us. 285 * default: 0.95s 286 */ 287 int sysctl_sched_rt_runtime = 950000; 288 289 /* cpus with isolated domains */ 290 cpumask_var_t cpu_isolated_map; 291 292 /* 293 * this_rq_lock - lock this runqueue and disable interrupts. 294 */ 295 static struct rq *this_rq_lock(void) 296 __acquires(rq->lock) 297 { 298 struct rq *rq; 299 300 local_irq_disable(); 301 rq = this_rq(); 302 raw_spin_lock(&rq->lock); 303 304 return rq; 305 } 306 307 #ifdef CONFIG_SCHED_HRTICK 308 /* 309 * Use HR-timers to deliver accurate preemption points. 310 */ 311 312 static void hrtick_clear(struct rq *rq) 313 { 314 if (hrtimer_active(&rq->hrtick_timer)) 315 hrtimer_cancel(&rq->hrtick_timer); 316 } 317 318 /* 319 * High-resolution timer tick. 320 * Runs from hardirq context with interrupts disabled. 321 */ 322 static enum hrtimer_restart hrtick(struct hrtimer *timer) 323 { 324 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 325 326 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 327 328 raw_spin_lock(&rq->lock); 329 update_rq_clock(rq); 330 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 331 raw_spin_unlock(&rq->lock); 332 333 return HRTIMER_NORESTART; 334 } 335 336 #ifdef CONFIG_SMP 337 338 static void __hrtick_restart(struct rq *rq) 339 { 340 struct hrtimer *timer = &rq->hrtick_timer; 341 342 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); 343 } 344 345 /* 346 * called from hardirq (IPI) context 347 */ 348 static void __hrtick_start(void *arg) 349 { 350 struct rq *rq = arg; 351 352 raw_spin_lock(&rq->lock); 353 __hrtick_restart(rq); 354 rq->hrtick_csd_pending = 0; 355 raw_spin_unlock(&rq->lock); 356 } 357 358 /* 359 * Called to set the hrtick timer state. 360 * 361 * called with rq->lock held and irqs disabled 362 */ 363 void hrtick_start(struct rq *rq, u64 delay) 364 { 365 struct hrtimer *timer = &rq->hrtick_timer; 366 ktime_t time; 367 s64 delta; 368 369 /* 370 * Don't schedule slices shorter than 10000ns, that just 371 * doesn't make sense and can cause timer DoS. 372 */ 373 delta = max_t(s64, delay, 10000LL); 374 time = ktime_add_ns(timer->base->get_time(), delta); 375 376 hrtimer_set_expires(timer, time); 377 378 if (rq == this_rq()) { 379 __hrtick_restart(rq); 380 } else if (!rq->hrtick_csd_pending) { 381 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); 382 rq->hrtick_csd_pending = 1; 383 } 384 } 385 386 static int 387 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 388 { 389 int cpu = (int)(long)hcpu; 390 391 switch (action) { 392 case CPU_UP_CANCELED: 393 case CPU_UP_CANCELED_FROZEN: 394 case CPU_DOWN_PREPARE: 395 case CPU_DOWN_PREPARE_FROZEN: 396 case CPU_DEAD: 397 case CPU_DEAD_FROZEN: 398 hrtick_clear(cpu_rq(cpu)); 399 return NOTIFY_OK; 400 } 401 402 return NOTIFY_DONE; 403 } 404 405 static __init void init_hrtick(void) 406 { 407 hotcpu_notifier(hotplug_hrtick, 0); 408 } 409 #else 410 /* 411 * Called to set the hrtick timer state. 412 * 413 * called with rq->lock held and irqs disabled 414 */ 415 void hrtick_start(struct rq *rq, u64 delay) 416 { 417 /* 418 * Don't schedule slices shorter than 10000ns, that just 419 * doesn't make sense. Rely on vruntime for fairness. 
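 * (10000ns is 10us, tiny next to even a 1000Hz tick of 1ms, so this only
 * filters out degenerate requests.)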
420 */ 421 delay = max_t(u64, delay, 10000LL); 422 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), 423 HRTIMER_MODE_REL_PINNED); 424 } 425 426 static inline void init_hrtick(void) 427 { 428 } 429 #endif /* CONFIG_SMP */ 430 431 static void init_rq_hrtick(struct rq *rq) 432 { 433 #ifdef CONFIG_SMP 434 rq->hrtick_csd_pending = 0; 435 436 rq->hrtick_csd.flags = 0; 437 rq->hrtick_csd.func = __hrtick_start; 438 rq->hrtick_csd.info = rq; 439 #endif 440 441 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 442 rq->hrtick_timer.function = hrtick; 443 } 444 #else /* CONFIG_SCHED_HRTICK */ 445 static inline void hrtick_clear(struct rq *rq) 446 { 447 } 448 449 static inline void init_rq_hrtick(struct rq *rq) 450 { 451 } 452 453 static inline void init_hrtick(void) 454 { 455 } 456 #endif /* CONFIG_SCHED_HRTICK */ 457 458 /* 459 * cmpxchg based fetch_or, macro so it works for different integer types 460 */ 461 #define fetch_or(ptr, val) \ 462 ({ typeof(*(ptr)) __old, __val = *(ptr); \ 463 for (;;) { \ 464 __old = cmpxchg((ptr), __val, __val | (val)); \ 465 if (__old == __val) \ 466 break; \ 467 __val = __old; \ 468 } \ 469 __old; \ 470 }) 471 472 #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) 473 /* 474 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, 475 * this avoids any races wrt polling state changes and thereby avoids 476 * spurious IPIs. 477 */ 478 static bool set_nr_and_not_polling(struct task_struct *p) 479 { 480 struct thread_info *ti = task_thread_info(p); 481 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); 482 } 483 484 /* 485 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. 486 * 487 * If this returns true, then the idle task promises to call 488 * sched_ttwu_pending() and reschedule soon. 489 */ 490 static bool set_nr_if_polling(struct task_struct *p) 491 { 492 struct thread_info *ti = task_thread_info(p); 493 typeof(ti->flags) old, val = READ_ONCE(ti->flags); 494 495 for (;;) { 496 if (!(val & _TIF_POLLING_NRFLAG)) 497 return false; 498 if (val & _TIF_NEED_RESCHED) 499 return true; 500 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); 501 if (old == val) 502 break; 503 val = old; 504 } 505 return true; 506 } 507 508 #else 509 static bool set_nr_and_not_polling(struct task_struct *p) 510 { 511 set_tsk_need_resched(p); 512 return true; 513 } 514 515 #ifdef CONFIG_SMP 516 static bool set_nr_if_polling(struct task_struct *p) 517 { 518 return false; 519 } 520 #endif 521 #endif 522 523 void wake_q_add(struct wake_q_head *head, struct task_struct *task) 524 { 525 struct wake_q_node *node = &task->wake_q; 526 527 /* 528 * Atomically grab the task, if ->wake_q is !nil already it means 529 * its already queued (either by us or someone else) and will get the 530 * wakeup due to that. 531 * 532 * This cmpxchg() implies a full barrier, which pairs with the write 533 * barrier implied by the wakeup in wake_up_list(). 534 */ 535 if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) 536 return; 537 538 get_task_struct(task); 539 540 /* 541 * The head is context local, there can be no concurrency. 
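	 *
	 * Illustrative usage sketch (WAKE_Q() is assumed to be the head
	 * initializer from <linux/sched.h>; some_lock stands in for the
	 * caller's own lock):
	 *
	 *	WAKE_Q(head);
	 *
	 *	raw_spin_lock(&some_lock);
	 *	wake_q_add(&head, task);
	 *	raw_spin_unlock(&some_lock);
	 *	wake_up_q(&head);
	 *
	 * i.e. wakeups are queued under the lock and issued after it is
	 * dropped.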
542 */ 543 *head->lastp = node; 544 head->lastp = &node->next; 545 } 546 547 void wake_up_q(struct wake_q_head *head) 548 { 549 struct wake_q_node *node = head->first; 550 551 while (node != WAKE_Q_TAIL) { 552 struct task_struct *task; 553 554 task = container_of(node, struct task_struct, wake_q); 555 BUG_ON(!task); 556 /* task can safely be re-inserted now */ 557 node = node->next; 558 task->wake_q.next = NULL; 559 560 /* 561 * wake_up_process() implies a wmb() to pair with the queueing 562 * in wake_q_add() so as not to miss wakeups. 563 */ 564 wake_up_process(task); 565 put_task_struct(task); 566 } 567 } 568 569 /* 570 * resched_curr - mark rq's current task 'to be rescheduled now'. 571 * 572 * On UP this means the setting of the need_resched flag, on SMP it 573 * might also involve a cross-CPU call to trigger the scheduler on 574 * the target CPU. 575 */ 576 void resched_curr(struct rq *rq) 577 { 578 struct task_struct *curr = rq->curr; 579 int cpu; 580 581 lockdep_assert_held(&rq->lock); 582 583 if (test_tsk_need_resched(curr)) 584 return; 585 586 cpu = cpu_of(rq); 587 588 if (cpu == smp_processor_id()) { 589 set_tsk_need_resched(curr); 590 set_preempt_need_resched(); 591 return; 592 } 593 594 if (set_nr_and_not_polling(curr)) 595 smp_send_reschedule(cpu); 596 else 597 trace_sched_wake_idle_without_ipi(cpu); 598 } 599 600 void resched_cpu(int cpu) 601 { 602 struct rq *rq = cpu_rq(cpu); 603 unsigned long flags; 604 605 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 606 return; 607 resched_curr(rq); 608 raw_spin_unlock_irqrestore(&rq->lock, flags); 609 } 610 611 #ifdef CONFIG_SMP 612 #ifdef CONFIG_NO_HZ_COMMON 613 /* 614 * In the semi idle case, use the nearest busy cpu for migrating timers 615 * from an idle cpu. This is good for power-savings. 616 * 617 * We don't do similar optimization for completely idle system, as 618 * selecting an idle cpu will add more delays to the timers than intended 619 * (as that cpu's timer base may not be uptodate wrt jiffies etc). 620 */ 621 int get_nohz_timer_target(void) 622 { 623 int i, cpu = smp_processor_id(); 624 struct sched_domain *sd; 625 626 if (!idle_cpu(cpu)) 627 return cpu; 628 629 rcu_read_lock(); 630 for_each_domain(cpu, sd) { 631 for_each_cpu(i, sched_domain_span(sd)) { 632 if (!idle_cpu(i)) { 633 cpu = i; 634 goto unlock; 635 } 636 } 637 } 638 unlock: 639 rcu_read_unlock(); 640 return cpu; 641 } 642 /* 643 * When add_timer_on() enqueues a timer into the timer wheel of an 644 * idle CPU then this timer might expire before the next timer event 645 * which is scheduled to wake up that CPU. In case of a completely 646 * idle system the next event might even be infinite time into the 647 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 648 * leaves the inner idle loop so the newly added timer is taken into 649 * account when the CPU goes back to idle and evaluates the timer 650 * wheel for the next timer event. 651 */ 652 static void wake_up_idle_cpu(int cpu) 653 { 654 struct rq *rq = cpu_rq(cpu); 655 656 if (cpu == smp_processor_id()) 657 return; 658 659 if (set_nr_and_not_polling(rq->idle)) 660 smp_send_reschedule(cpu); 661 else 662 trace_sched_wake_idle_without_ipi(cpu); 663 } 664 665 static bool wake_up_full_nohz_cpu(int cpu) 666 { 667 /* 668 * We just need the target to call irq_exit() and re-evaluate 669 * the next tick. The nohz full kick at least implies that. 670 * If needed we can still optimize that later with an 671 * empty IRQ. 
	 */
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			tick_nohz_full_kick_cpu(cpu);
		return true;
	}

	return false;
}

void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

static inline bool got_nohz_idle_kick(void)
{
	int cpu = smp_processor_id();

	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
		return false;

	if (idle_cpu(cpu) && !need_resched())
		return true;

	/*
	 * We can't run the idle load balance on this CPU this time, so
	 * cancel it and clear NOHZ_BALANCE_KICK.
	 */
	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
	return false;
}

#else /* CONFIG_NO_HZ_COMMON */

static inline bool got_nohz_idle_kick(void)
{
	return false;
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(void)
{
	/*
	 * FIFO realtime policy runs the highest priority task. Other runnable
	 * tasks are of a lower priority. The scheduler tick does nothing.
	 */
	if (current->policy == SCHED_FIFO)
		return true;

	/*
	 * Round-robin realtime tasks time slice with other tasks at the same
	 * realtime priority. Is this task the only one at this priority?
	 */
	if (current->policy == SCHED_RR) {
		struct sched_rt_entity *rt_se = &current->rt;

		return rt_se->run_list.prev == rt_se->run_list.next;
	}

	/*
	 * More than one running task needs preemption.
	 * The nr_running update is assumed to be visible
	 * after the IPI is sent from the wakers.
	 */
	if (this_rq()->nr_running > 1)
		return false;

	return true;
}
#endif /* CONFIG_NO_HZ_FULL */

void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
		 */
		asm("" : "+rm" (rq->age_stamp));
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}

#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
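 *
 * Illustrative call (down is a caller-supplied tg_visitor; root_task_group is
 * the root of the group hierarchy):
 *
 *	walk_tg_tree_from(&root_task_group, down, tg_nop, data);
 *
 * visits every task_group top-down, using tg_nop() below as a no-op visitor
 * for the upward pass.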
 */
int walk_tg_tree_from(struct task_group *from,
		      tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

static void set_load_weight(struct task_struct *p)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (p->policy == SCHED_IDLE) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	load->weight = scale_load(prio_to_weight[prio]);
	load->inv_weight = prio_to_wmult[prio];
}

static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_queued(rq, p);
	p->sched_class->enqueue_task(rq, p, flags);
}

static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_dequeued(rq, p);
	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight mis-attribution of {soft,}irq
	 * time; a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task; it's something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_dl_policy(p))
		prio = MAX_DL_PRIO-1;
	else if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/*
 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
 * use the balance_callback list if you want balancing.
 *
 * This means any call to check_class_changed() must be followed by a call to
 * balance_callback().
1008 */ 1009 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1010 const struct sched_class *prev_class, 1011 int oldprio) 1012 { 1013 if (prev_class != p->sched_class) { 1014 if (prev_class->switched_from) 1015 prev_class->switched_from(rq, p); 1016 1017 p->sched_class->switched_to(rq, p); 1018 } else if (oldprio != p->prio || dl_task(p)) 1019 p->sched_class->prio_changed(rq, p, oldprio); 1020 } 1021 1022 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 1023 { 1024 const struct sched_class *class; 1025 1026 if (p->sched_class == rq->curr->sched_class) { 1027 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 1028 } else { 1029 for_each_class(class) { 1030 if (class == rq->curr->sched_class) 1031 break; 1032 if (class == p->sched_class) { 1033 resched_curr(rq); 1034 break; 1035 } 1036 } 1037 } 1038 1039 /* 1040 * A queue event has occurred, and we're going to schedule. In 1041 * this case, we can save a useless back to back clock update. 1042 */ 1043 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) 1044 rq_clock_skip_update(rq, true); 1045 } 1046 1047 #ifdef CONFIG_SMP 1048 /* 1049 * This is how migration works: 1050 * 1051 * 1) we invoke migration_cpu_stop() on the target CPU using 1052 * stop_one_cpu(). 1053 * 2) stopper starts to run (implicitly forcing the migrated thread 1054 * off the CPU) 1055 * 3) it checks whether the migrated task is still in the wrong runqueue. 1056 * 4) if it's in the wrong runqueue then the migration thread removes 1057 * it and puts it into the right queue. 1058 * 5) stopper completes and stop_one_cpu() returns and the migration 1059 * is done. 1060 */ 1061 1062 /* 1063 * move_queued_task - move a queued task to new rq. 1064 * 1065 * Returns (locked) new rq. Old rq's lock is released. 1066 */ 1067 static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu) 1068 { 1069 lockdep_assert_held(&rq->lock); 1070 1071 dequeue_task(rq, p, 0); 1072 p->on_rq = TASK_ON_RQ_MIGRATING; 1073 set_task_cpu(p, new_cpu); 1074 raw_spin_unlock(&rq->lock); 1075 1076 rq = cpu_rq(new_cpu); 1077 1078 raw_spin_lock(&rq->lock); 1079 BUG_ON(task_cpu(p) != new_cpu); 1080 p->on_rq = TASK_ON_RQ_QUEUED; 1081 enqueue_task(rq, p, 0); 1082 check_preempt_curr(rq, p, 0); 1083 1084 return rq; 1085 } 1086 1087 struct migration_arg { 1088 struct task_struct *task; 1089 int dest_cpu; 1090 }; 1091 1092 /* 1093 * Move (not current) task off this cpu, onto dest cpu. We're doing 1094 * this because either it can't run here any more (set_cpus_allowed() 1095 * away from this CPU, or CPU going down), or because we're 1096 * attempting to rebalance this task on exec (sched_exec). 1097 * 1098 * So we race with normal scheduler movements, but that's OK, as long 1099 * as the task is no longer on this CPU. 1100 */ 1101 static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) 1102 { 1103 if (unlikely(!cpu_active(dest_cpu))) 1104 return rq; 1105 1106 /* Affinity changed (again). */ 1107 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1108 return rq; 1109 1110 rq = move_queued_task(rq, p, dest_cpu); 1111 1112 return rq; 1113 } 1114 1115 /* 1116 * migration_cpu_stop - this will be executed by a highprio stopper thread 1117 * and performs thread migration by bumping thread off CPU then 1118 * 'pushing' onto another runqueue. 
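 *
 * (set_cpus_allowed_ptr() below is a typical caller: it fills in a
 * struct migration_arg and hands it to stop_one_cpu(), which runs this
 * function on the stopper thread of the task's CPU.)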
1119 */ 1120 static int migration_cpu_stop(void *data) 1121 { 1122 struct migration_arg *arg = data; 1123 struct task_struct *p = arg->task; 1124 struct rq *rq = this_rq(); 1125 1126 /* 1127 * The original target cpu might have gone down and we might 1128 * be on another cpu but it doesn't matter. 1129 */ 1130 local_irq_disable(); 1131 /* 1132 * We need to explicitly wake pending tasks before running 1133 * __migrate_task() such that we will not miss enforcing cpus_allowed 1134 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. 1135 */ 1136 sched_ttwu_pending(); 1137 1138 raw_spin_lock(&p->pi_lock); 1139 raw_spin_lock(&rq->lock); 1140 /* 1141 * If task_rq(p) != rq, it cannot be migrated here, because we're 1142 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because 1143 * we're holding p->pi_lock. 1144 */ 1145 if (task_rq(p) == rq && task_on_rq_queued(p)) 1146 rq = __migrate_task(rq, p, arg->dest_cpu); 1147 raw_spin_unlock(&rq->lock); 1148 raw_spin_unlock(&p->pi_lock); 1149 1150 local_irq_enable(); 1151 return 0; 1152 } 1153 1154 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 1155 { 1156 if (p->sched_class->set_cpus_allowed) 1157 p->sched_class->set_cpus_allowed(p, new_mask); 1158 1159 cpumask_copy(&p->cpus_allowed, new_mask); 1160 p->nr_cpus_allowed = cpumask_weight(new_mask); 1161 } 1162 1163 /* 1164 * Change a given task's CPU affinity. Migrate the thread to a 1165 * proper CPU and schedule it away if the CPU it's executing on 1166 * is removed from the allowed bitmask. 1167 * 1168 * NOTE: the caller must have a valid reference to the task, the 1169 * task must not exit() & deallocate itself prematurely. The 1170 * call is not atomic; no spinlocks may be held. 1171 */ 1172 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 1173 { 1174 unsigned long flags; 1175 struct rq *rq; 1176 unsigned int dest_cpu; 1177 int ret = 0; 1178 1179 rq = task_rq_lock(p, &flags); 1180 1181 if (cpumask_equal(&p->cpus_allowed, new_mask)) 1182 goto out; 1183 1184 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 1185 ret = -EINVAL; 1186 goto out; 1187 } 1188 1189 do_set_cpus_allowed(p, new_mask); 1190 1191 /* Can the task run on the task's current CPU? If so, we're done */ 1192 if (cpumask_test_cpu(task_cpu(p), new_mask)) 1193 goto out; 1194 1195 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 1196 if (task_running(rq, p) || p->state == TASK_WAKING) { 1197 struct migration_arg arg = { p, dest_cpu }; 1198 /* Need help from migration thread: drop lock and wait. */ 1199 task_rq_unlock(rq, p, &flags); 1200 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 1201 tlb_migrate_finish(p->mm); 1202 return 0; 1203 } else if (task_on_rq_queued(p)) { 1204 /* 1205 * OK, since we're going to drop the lock immediately 1206 * afterwards anyway. 1207 */ 1208 lockdep_unpin_lock(&rq->lock); 1209 rq = move_queued_task(rq, p, dest_cpu); 1210 lockdep_pin_lock(&rq->lock); 1211 } 1212 out: 1213 task_rq_unlock(rq, p, &flags); 1214 1215 return ret; 1216 } 1217 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 1218 1219 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1220 { 1221 #ifdef CONFIG_SCHED_DEBUG 1222 /* 1223 * We should never call set_task_cpu() on a blocked task, 1224 * ttwu() will sort out the placement. 
1225 */ 1226 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 1227 !p->on_rq); 1228 1229 #ifdef CONFIG_LOCKDEP 1230 /* 1231 * The caller should hold either p->pi_lock or rq->lock, when changing 1232 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 1233 * 1234 * sched_move_task() holds both and thus holding either pins the cgroup, 1235 * see task_group(). 1236 * 1237 * Furthermore, all task_rq users should acquire both locks, see 1238 * task_rq_lock(). 1239 */ 1240 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 1241 lockdep_is_held(&task_rq(p)->lock))); 1242 #endif 1243 #endif 1244 1245 trace_sched_migrate_task(p, new_cpu); 1246 1247 if (task_cpu(p) != new_cpu) { 1248 if (p->sched_class->migrate_task_rq) 1249 p->sched_class->migrate_task_rq(p, new_cpu); 1250 p->se.nr_migrations++; 1251 perf_event_task_migrate(p); 1252 } 1253 1254 __set_task_cpu(p, new_cpu); 1255 } 1256 1257 static void __migrate_swap_task(struct task_struct *p, int cpu) 1258 { 1259 if (task_on_rq_queued(p)) { 1260 struct rq *src_rq, *dst_rq; 1261 1262 src_rq = task_rq(p); 1263 dst_rq = cpu_rq(cpu); 1264 1265 deactivate_task(src_rq, p, 0); 1266 set_task_cpu(p, cpu); 1267 activate_task(dst_rq, p, 0); 1268 check_preempt_curr(dst_rq, p, 0); 1269 } else { 1270 /* 1271 * Task isn't running anymore; make it appear like we migrated 1272 * it before it went to sleep. This means on wakeup we make the 1273 * previous cpu our targer instead of where it really is. 1274 */ 1275 p->wake_cpu = cpu; 1276 } 1277 } 1278 1279 struct migration_swap_arg { 1280 struct task_struct *src_task, *dst_task; 1281 int src_cpu, dst_cpu; 1282 }; 1283 1284 static int migrate_swap_stop(void *data) 1285 { 1286 struct migration_swap_arg *arg = data; 1287 struct rq *src_rq, *dst_rq; 1288 int ret = -EAGAIN; 1289 1290 src_rq = cpu_rq(arg->src_cpu); 1291 dst_rq = cpu_rq(arg->dst_cpu); 1292 1293 double_raw_lock(&arg->src_task->pi_lock, 1294 &arg->dst_task->pi_lock); 1295 double_rq_lock(src_rq, dst_rq); 1296 if (task_cpu(arg->dst_task) != arg->dst_cpu) 1297 goto unlock; 1298 1299 if (task_cpu(arg->src_task) != arg->src_cpu) 1300 goto unlock; 1301 1302 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) 1303 goto unlock; 1304 1305 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) 1306 goto unlock; 1307 1308 __migrate_swap_task(arg->src_task, arg->dst_cpu); 1309 __migrate_swap_task(arg->dst_task, arg->src_cpu); 1310 1311 ret = 0; 1312 1313 unlock: 1314 double_rq_unlock(src_rq, dst_rq); 1315 raw_spin_unlock(&arg->dst_task->pi_lock); 1316 raw_spin_unlock(&arg->src_task->pi_lock); 1317 1318 return ret; 1319 } 1320 1321 /* 1322 * Cross migrate two tasks 1323 */ 1324 int migrate_swap(struct task_struct *cur, struct task_struct *p) 1325 { 1326 struct migration_swap_arg arg; 1327 int ret = -EINVAL; 1328 1329 arg = (struct migration_swap_arg){ 1330 .src_task = cur, 1331 .src_cpu = task_cpu(cur), 1332 .dst_task = p, 1333 .dst_cpu = task_cpu(p), 1334 }; 1335 1336 if (arg.src_cpu == arg.dst_cpu) 1337 goto out; 1338 1339 /* 1340 * These three tests are all lockless; this is OK since all of them 1341 * will be re-checked with proper locks held further down the line. 
1342 */ 1343 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) 1344 goto out; 1345 1346 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) 1347 goto out; 1348 1349 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) 1350 goto out; 1351 1352 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); 1353 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); 1354 1355 out: 1356 return ret; 1357 } 1358 1359 /* 1360 * wait_task_inactive - wait for a thread to unschedule. 1361 * 1362 * If @match_state is nonzero, it's the @p->state value just checked and 1363 * not expected to change. If it changes, i.e. @p might have woken up, 1364 * then return zero. When we succeed in waiting for @p to be off its CPU, 1365 * we return a positive number (its total switch count). If a second call 1366 * a short while later returns the same number, the caller can be sure that 1367 * @p has remained unscheduled the whole time. 1368 * 1369 * The caller must ensure that the task *will* unschedule sometime soon, 1370 * else this function might spin for a *long* time. This function can't 1371 * be called with interrupts off, or it may introduce deadlock with 1372 * smp_call_function() if an IPI is sent by the same process we are 1373 * waiting to become inactive. 1374 */ 1375 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1376 { 1377 unsigned long flags; 1378 int running, queued; 1379 unsigned long ncsw; 1380 struct rq *rq; 1381 1382 for (;;) { 1383 /* 1384 * We do the initial early heuristics without holding 1385 * any task-queue locks at all. We'll only try to get 1386 * the runqueue lock when things look like they will 1387 * work out! 1388 */ 1389 rq = task_rq(p); 1390 1391 /* 1392 * If the task is actively running on another CPU 1393 * still, just relax and busy-wait without holding 1394 * any locks. 1395 * 1396 * NOTE! Since we don't hold any locks, it's not 1397 * even sure that "rq" stays as the right runqueue! 1398 * But we don't care, since "task_running()" will 1399 * return false if the runqueue has changed and p 1400 * is actually now running somewhere else! 1401 */ 1402 while (task_running(rq, p)) { 1403 if (match_state && unlikely(p->state != match_state)) 1404 return 0; 1405 cpu_relax(); 1406 } 1407 1408 /* 1409 * Ok, time to look more closely! We need the rq 1410 * lock now, to be *sure*. If we're wrong, we'll 1411 * just go back and repeat. 1412 */ 1413 rq = task_rq_lock(p, &flags); 1414 trace_sched_wait_task(p); 1415 running = task_running(rq, p); 1416 queued = task_on_rq_queued(p); 1417 ncsw = 0; 1418 if (!match_state || p->state == match_state) 1419 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1420 task_rq_unlock(rq, p, &flags); 1421 1422 /* 1423 * If it changed from the expected state, bail out now. 1424 */ 1425 if (unlikely(!ncsw)) 1426 break; 1427 1428 /* 1429 * Was it really running after all now that we 1430 * checked with the proper locks actually held? 1431 * 1432 * Oops. Go back and try again.. 1433 */ 1434 if (unlikely(running)) { 1435 cpu_relax(); 1436 continue; 1437 } 1438 1439 /* 1440 * It's not enough that it's not actively running, 1441 * it must be off the runqueue _entirely_, and not 1442 * preempted! 1443 * 1444 * So if it was still runnable (but just not actively 1445 * running right now), it's preempted, and we should 1446 * yield - it could be a while. 
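 *
 * (The schedule_hrtimeout() below sleeps for about one tick,
 * NSEC_PER_SEC/HZ, before the whole check is retried.)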
1447 */ 1448 if (unlikely(queued)) { 1449 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1450 1451 set_current_state(TASK_UNINTERRUPTIBLE); 1452 schedule_hrtimeout(&to, HRTIMER_MODE_REL); 1453 continue; 1454 } 1455 1456 /* 1457 * Ahh, all good. It wasn't running, and it wasn't 1458 * runnable, which means that it will never become 1459 * running in the future either. We're all done! 1460 */ 1461 break; 1462 } 1463 1464 return ncsw; 1465 } 1466 1467 /*** 1468 * kick_process - kick a running thread to enter/exit the kernel 1469 * @p: the to-be-kicked thread 1470 * 1471 * Cause a process which is running on another CPU to enter 1472 * kernel-mode, without any delay. (to get signals handled.) 1473 * 1474 * NOTE: this function doesn't have to take the runqueue lock, 1475 * because all it wants to ensure is that the remote task enters 1476 * the kernel. If the IPI races and the task has been migrated 1477 * to another CPU then no harm is done and the purpose has been 1478 * achieved as well. 1479 */ 1480 void kick_process(struct task_struct *p) 1481 { 1482 int cpu; 1483 1484 preempt_disable(); 1485 cpu = task_cpu(p); 1486 if ((cpu != smp_processor_id()) && task_curr(p)) 1487 smp_send_reschedule(cpu); 1488 preempt_enable(); 1489 } 1490 EXPORT_SYMBOL_GPL(kick_process); 1491 1492 /* 1493 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1494 */ 1495 static int select_fallback_rq(int cpu, struct task_struct *p) 1496 { 1497 int nid = cpu_to_node(cpu); 1498 const struct cpumask *nodemask = NULL; 1499 enum { cpuset, possible, fail } state = cpuset; 1500 int dest_cpu; 1501 1502 /* 1503 * If the node that the cpu is on has been offlined, cpu_to_node() 1504 * will return -1. There is no cpu on the node, and we should 1505 * select the cpu on the other node. 1506 */ 1507 if (nid != -1) { 1508 nodemask = cpumask_of_node(nid); 1509 1510 /* Look for allowed, online CPU in same node. */ 1511 for_each_cpu(dest_cpu, nodemask) { 1512 if (!cpu_online(dest_cpu)) 1513 continue; 1514 if (!cpu_active(dest_cpu)) 1515 continue; 1516 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1517 return dest_cpu; 1518 } 1519 } 1520 1521 for (;;) { 1522 /* Any allowed, online CPU? */ 1523 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { 1524 if (!cpu_online(dest_cpu)) 1525 continue; 1526 if (!cpu_active(dest_cpu)) 1527 continue; 1528 goto out; 1529 } 1530 1531 switch (state) { 1532 case cpuset: 1533 /* No more Mr. Nice Guy. */ 1534 cpuset_cpus_allowed_fallback(p); 1535 state = possible; 1536 break; 1537 1538 case possible: 1539 do_set_cpus_allowed(p, cpu_possible_mask); 1540 state = fail; 1541 break; 1542 1543 case fail: 1544 BUG(); 1545 break; 1546 } 1547 } 1548 1549 out: 1550 if (state != cpuset) { 1551 /* 1552 * Don't tell them about moving exiting tasks or 1553 * kernel threads (both mm NULL), since they never 1554 * leave kernel. 1555 */ 1556 if (p->mm && printk_ratelimit()) { 1557 printk_deferred("process %d (%s) no longer affine to cpu%d\n", 1558 task_pid_nr(p), p->comm, cpu); 1559 } 1560 } 1561 1562 return dest_cpu; 1563 } 1564 1565 /* 1566 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 
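 *
 * (Both try_to_wake_up() and wake_up_new_task() call this with p->pi_lock
 * held; the lockdep_assert_held() below enforces that.)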
1567 */ 1568 static inline 1569 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1570 { 1571 lockdep_assert_held(&p->pi_lock); 1572 1573 if (p->nr_cpus_allowed > 1) 1574 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1575 1576 /* 1577 * In order not to call set_task_cpu() on a blocking task we need 1578 * to rely on ttwu() to place the task on a valid ->cpus_allowed 1579 * cpu. 1580 * 1581 * Since this is common to all placement strategies, this lives here. 1582 * 1583 * [ this allows ->select_task() to simply return task_cpu(p) and 1584 * not worry about this generic constraint ] 1585 */ 1586 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || 1587 !cpu_online(cpu))) 1588 cpu = select_fallback_rq(task_cpu(p), p); 1589 1590 return cpu; 1591 } 1592 1593 static void update_avg(u64 *avg, u64 sample) 1594 { 1595 s64 diff = sample - *avg; 1596 *avg += diff >> 3; 1597 } 1598 #endif /* CONFIG_SMP */ 1599 1600 static void 1601 ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 1602 { 1603 #ifdef CONFIG_SCHEDSTATS 1604 struct rq *rq = this_rq(); 1605 1606 #ifdef CONFIG_SMP 1607 int this_cpu = smp_processor_id(); 1608 1609 if (cpu == this_cpu) { 1610 schedstat_inc(rq, ttwu_local); 1611 schedstat_inc(p, se.statistics.nr_wakeups_local); 1612 } else { 1613 struct sched_domain *sd; 1614 1615 schedstat_inc(p, se.statistics.nr_wakeups_remote); 1616 rcu_read_lock(); 1617 for_each_domain(this_cpu, sd) { 1618 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 1619 schedstat_inc(sd, ttwu_wake_remote); 1620 break; 1621 } 1622 } 1623 rcu_read_unlock(); 1624 } 1625 1626 if (wake_flags & WF_MIGRATED) 1627 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 1628 1629 #endif /* CONFIG_SMP */ 1630 1631 schedstat_inc(rq, ttwu_count); 1632 schedstat_inc(p, se.statistics.nr_wakeups); 1633 1634 if (wake_flags & WF_SYNC) 1635 schedstat_inc(p, se.statistics.nr_wakeups_sync); 1636 1637 #endif /* CONFIG_SCHEDSTATS */ 1638 } 1639 1640 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1641 { 1642 activate_task(rq, p, en_flags); 1643 p->on_rq = TASK_ON_RQ_QUEUED; 1644 1645 /* if a worker is waking up, notify workqueue */ 1646 if (p->flags & PF_WQ_WORKER) 1647 wq_worker_waking_up(p, cpu_of(rq)); 1648 } 1649 1650 /* 1651 * Mark the task runnable and perform wakeup-preemption. 1652 */ 1653 static void 1654 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1655 { 1656 check_preempt_curr(rq, p, wake_flags); 1657 trace_sched_wakeup(p, true); 1658 1659 p->state = TASK_RUNNING; 1660 #ifdef CONFIG_SMP 1661 if (p->sched_class->task_woken) { 1662 /* 1663 * Our task @p is fully woken up and running; so its safe to 1664 * drop the rq->lock, hereafter rq is only used for statistics. 
1665 */ 1666 lockdep_unpin_lock(&rq->lock); 1667 p->sched_class->task_woken(rq, p); 1668 lockdep_pin_lock(&rq->lock); 1669 } 1670 1671 if (rq->idle_stamp) { 1672 u64 delta = rq_clock(rq) - rq->idle_stamp; 1673 u64 max = 2*rq->max_idle_balance_cost; 1674 1675 update_avg(&rq->avg_idle, delta); 1676 1677 if (rq->avg_idle > max) 1678 rq->avg_idle = max; 1679 1680 rq->idle_stamp = 0; 1681 } 1682 #endif 1683 } 1684 1685 static void 1686 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) 1687 { 1688 lockdep_assert_held(&rq->lock); 1689 1690 #ifdef CONFIG_SMP 1691 if (p->sched_contributes_to_load) 1692 rq->nr_uninterruptible--; 1693 #endif 1694 1695 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); 1696 ttwu_do_wakeup(rq, p, wake_flags); 1697 } 1698 1699 /* 1700 * Called in case the task @p isn't fully descheduled from its runqueue, 1701 * in this case we must do a remote wakeup. Its a 'light' wakeup though, 1702 * since all we need to do is flip p->state to TASK_RUNNING, since 1703 * the task is still ->on_rq. 1704 */ 1705 static int ttwu_remote(struct task_struct *p, int wake_flags) 1706 { 1707 struct rq *rq; 1708 int ret = 0; 1709 1710 rq = __task_rq_lock(p); 1711 if (task_on_rq_queued(p)) { 1712 /* check_preempt_curr() may use rq clock */ 1713 update_rq_clock(rq); 1714 ttwu_do_wakeup(rq, p, wake_flags); 1715 ret = 1; 1716 } 1717 __task_rq_unlock(rq); 1718 1719 return ret; 1720 } 1721 1722 #ifdef CONFIG_SMP 1723 void sched_ttwu_pending(void) 1724 { 1725 struct rq *rq = this_rq(); 1726 struct llist_node *llist = llist_del_all(&rq->wake_list); 1727 struct task_struct *p; 1728 unsigned long flags; 1729 1730 if (!llist) 1731 return; 1732 1733 raw_spin_lock_irqsave(&rq->lock, flags); 1734 lockdep_pin_lock(&rq->lock); 1735 1736 while (llist) { 1737 p = llist_entry(llist, struct task_struct, wake_entry); 1738 llist = llist_next(llist); 1739 ttwu_do_activate(rq, p, 0); 1740 } 1741 1742 lockdep_unpin_lock(&rq->lock); 1743 raw_spin_unlock_irqrestore(&rq->lock, flags); 1744 } 1745 1746 void scheduler_ipi(void) 1747 { 1748 /* 1749 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting 1750 * TIF_NEED_RESCHED remotely (for the first time) will also send 1751 * this IPI. 1752 */ 1753 preempt_fold_need_resched(); 1754 1755 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 1756 return; 1757 1758 /* 1759 * Not all reschedule IPI handlers call irq_enter/irq_exit, since 1760 * traditionally all their work was done from the interrupt return 1761 * path. Now that we actually do some work, we need to make sure 1762 * we do call them. 1763 * 1764 * Some archs already do call them, luckily irq_enter/exit nest 1765 * properly. 1766 * 1767 * Arguably we should visit all archs and update all handlers, 1768 * however a fair share of IPIs are still resched only so this would 1769 * somewhat pessimize the simple resched case. 1770 */ 1771 irq_enter(); 1772 sched_ttwu_pending(); 1773 1774 /* 1775 * Check if someone kicked us for doing the nohz idle load balance. 
1776 */ 1777 if (unlikely(got_nohz_idle_kick())) { 1778 this_rq()->idle_balance = 1; 1779 raise_softirq_irqoff(SCHED_SOFTIRQ); 1780 } 1781 irq_exit(); 1782 } 1783 1784 static void ttwu_queue_remote(struct task_struct *p, int cpu) 1785 { 1786 struct rq *rq = cpu_rq(cpu); 1787 1788 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { 1789 if (!set_nr_if_polling(rq->idle)) 1790 smp_send_reschedule(cpu); 1791 else 1792 trace_sched_wake_idle_without_ipi(cpu); 1793 } 1794 } 1795 1796 void wake_up_if_idle(int cpu) 1797 { 1798 struct rq *rq = cpu_rq(cpu); 1799 unsigned long flags; 1800 1801 rcu_read_lock(); 1802 1803 if (!is_idle_task(rcu_dereference(rq->curr))) 1804 goto out; 1805 1806 if (set_nr_if_polling(rq->idle)) { 1807 trace_sched_wake_idle_without_ipi(cpu); 1808 } else { 1809 raw_spin_lock_irqsave(&rq->lock, flags); 1810 if (is_idle_task(rq->curr)) 1811 smp_send_reschedule(cpu); 1812 /* Else cpu is not in idle, do nothing here */ 1813 raw_spin_unlock_irqrestore(&rq->lock, flags); 1814 } 1815 1816 out: 1817 rcu_read_unlock(); 1818 } 1819 1820 bool cpus_share_cache(int this_cpu, int that_cpu) 1821 { 1822 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1823 } 1824 #endif /* CONFIG_SMP */ 1825 1826 static void ttwu_queue(struct task_struct *p, int cpu) 1827 { 1828 struct rq *rq = cpu_rq(cpu); 1829 1830 #if defined(CONFIG_SMP) 1831 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 1832 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1833 ttwu_queue_remote(p, cpu); 1834 return; 1835 } 1836 #endif 1837 1838 raw_spin_lock(&rq->lock); 1839 lockdep_pin_lock(&rq->lock); 1840 ttwu_do_activate(rq, p, 0); 1841 lockdep_unpin_lock(&rq->lock); 1842 raw_spin_unlock(&rq->lock); 1843 } 1844 1845 /** 1846 * try_to_wake_up - wake up a thread 1847 * @p: the thread to be awakened 1848 * @state: the mask of task states that can be woken 1849 * @wake_flags: wake modifier flags (WF_*) 1850 * 1851 * Put it on the run-queue if it's not already there. The "current" 1852 * thread is always on the run-queue (except when the actual 1853 * re-schedule is in progress), and as such you're allowed to do 1854 * the simpler "current->state = TASK_RUNNING" to mark yourself 1855 * runnable without the overhead of this. 1856 * 1857 * Return: %true if @p was woken up, %false if it was already running. 1858 * or @state didn't match @p's state. 1859 */ 1860 static int 1861 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 1862 { 1863 unsigned long flags; 1864 int cpu, success = 0; 1865 1866 /* 1867 * If we are going to wake up a thread waiting for CONDITION we 1868 * need to ensure that CONDITION=1 done by the caller can not be 1869 * reordered with p->state check below. This pairs with mb() in 1870 * set_current_state() the waiting thread does. 1871 */ 1872 smp_mb__before_spinlock(); 1873 raw_spin_lock_irqsave(&p->pi_lock, flags); 1874 if (!(p->state & state)) 1875 goto out; 1876 1877 success = 1; /* we're going to change ->state */ 1878 cpu = task_cpu(p); 1879 1880 if (p->on_rq && ttwu_remote(p, wake_flags)) 1881 goto stat; 1882 1883 #ifdef CONFIG_SMP 1884 /* 1885 * If the owning (remote) cpu is still in the middle of schedule() with 1886 * this task as prev, wait until its done referencing the task. 1887 */ 1888 while (p->on_cpu) 1889 cpu_relax(); 1890 /* 1891 * Pairs with the smp_wmb() in finish_lock_switch(). 
1892 */ 1893 smp_rmb(); 1894 1895 p->sched_contributes_to_load = !!task_contributes_to_load(p); 1896 p->state = TASK_WAKING; 1897 1898 if (p->sched_class->task_waking) 1899 p->sched_class->task_waking(p); 1900 1901 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); 1902 if (task_cpu(p) != cpu) { 1903 wake_flags |= WF_MIGRATED; 1904 set_task_cpu(p, cpu); 1905 } 1906 #endif /* CONFIG_SMP */ 1907 1908 ttwu_queue(p, cpu); 1909 stat: 1910 ttwu_stat(p, cpu, wake_flags); 1911 out: 1912 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1913 1914 return success; 1915 } 1916 1917 /** 1918 * try_to_wake_up_local - try to wake up a local task with rq lock held 1919 * @p: the thread to be awakened 1920 * 1921 * Put @p on the run-queue if it's not already there. The caller must 1922 * ensure that this_rq() is locked, @p is bound to this_rq() and not 1923 * the current task. 1924 */ 1925 static void try_to_wake_up_local(struct task_struct *p) 1926 { 1927 struct rq *rq = task_rq(p); 1928 1929 if (WARN_ON_ONCE(rq != this_rq()) || 1930 WARN_ON_ONCE(p == current)) 1931 return; 1932 1933 lockdep_assert_held(&rq->lock); 1934 1935 if (!raw_spin_trylock(&p->pi_lock)) { 1936 /* 1937 * This is OK, because current is on_cpu, which avoids it being 1938 * picked for load-balance and preemption/IRQs are still 1939 * disabled avoiding further scheduler activity on it and we've 1940 * not yet picked a replacement task. 1941 */ 1942 lockdep_unpin_lock(&rq->lock); 1943 raw_spin_unlock(&rq->lock); 1944 raw_spin_lock(&p->pi_lock); 1945 raw_spin_lock(&rq->lock); 1946 lockdep_pin_lock(&rq->lock); 1947 } 1948 1949 if (!(p->state & TASK_NORMAL)) 1950 goto out; 1951 1952 if (!task_on_rq_queued(p)) 1953 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1954 1955 ttwu_do_wakeup(rq, p, 0); 1956 ttwu_stat(p, smp_processor_id(), 0); 1957 out: 1958 raw_spin_unlock(&p->pi_lock); 1959 } 1960 1961 /** 1962 * wake_up_process - Wake up a specific process 1963 * @p: The process to be woken up. 1964 * 1965 * Attempt to wake up the nominated process and move it to the set of runnable 1966 * processes. 1967 * 1968 * Return: 1 if the process was woken up, 0 if it was already running. 1969 * 1970 * It may be assumed that this function implies a write memory barrier before 1971 * changing the task state if and only if any tasks are woken up. 1972 */ 1973 int wake_up_process(struct task_struct *p) 1974 { 1975 WARN_ON(task_is_stopped_or_traced(p)); 1976 return try_to_wake_up(p, TASK_NORMAL, 0); 1977 } 1978 EXPORT_SYMBOL(wake_up_process); 1979 1980 int wake_up_state(struct task_struct *p, unsigned int state) 1981 { 1982 return try_to_wake_up(p, state, 0); 1983 } 1984 1985 /* 1986 * This function clears the sched_dl_entity static params. 1987 */ 1988 void __dl_clear_params(struct task_struct *p) 1989 { 1990 struct sched_dl_entity *dl_se = &p->dl; 1991 1992 dl_se->dl_runtime = 0; 1993 dl_se->dl_deadline = 0; 1994 dl_se->dl_period = 0; 1995 dl_se->flags = 0; 1996 dl_se->dl_bw = 0; 1997 1998 dl_se->dl_throttled = 0; 1999 dl_se->dl_new = 1; 2000 dl_se->dl_yielded = 0; 2001 } 2002 2003 /* 2004 * Perform scheduler related setup for a newly forked process p. 2005 * p is forked by current. 
2006 * 2007 * __sched_fork() is basic setup used by init_idle() too: 2008 */ 2009 static void __sched_fork(unsigned long clone_flags, struct task_struct *p) 2010 { 2011 p->on_rq = 0; 2012 2013 p->se.on_rq = 0; 2014 p->se.exec_start = 0; 2015 p->se.sum_exec_runtime = 0; 2016 p->se.prev_sum_exec_runtime = 0; 2017 p->se.nr_migrations = 0; 2018 p->se.vruntime = 0; 2019 #ifdef CONFIG_SMP 2020 p->se.avg.decay_count = 0; 2021 #endif 2022 INIT_LIST_HEAD(&p->se.group_node); 2023 2024 #ifdef CONFIG_SCHEDSTATS 2025 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2026 #endif 2027 2028 RB_CLEAR_NODE(&p->dl.rb_node); 2029 init_dl_task_timer(&p->dl); 2030 __dl_clear_params(p); 2031 2032 INIT_LIST_HEAD(&p->rt.run_list); 2033 2034 #ifdef CONFIG_PREEMPT_NOTIFIERS 2035 INIT_HLIST_HEAD(&p->preempt_notifiers); 2036 #endif 2037 2038 #ifdef CONFIG_NUMA_BALANCING 2039 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 2040 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 2041 p->mm->numa_scan_seq = 0; 2042 } 2043 2044 if (clone_flags & CLONE_VM) 2045 p->numa_preferred_nid = current->numa_preferred_nid; 2046 else 2047 p->numa_preferred_nid = -1; 2048 2049 p->node_stamp = 0ULL; 2050 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 2051 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 2052 p->numa_work.next = &p->numa_work; 2053 p->numa_faults = NULL; 2054 p->last_task_numa_placement = 0; 2055 p->last_sum_exec_runtime = 0; 2056 2057 p->numa_group = NULL; 2058 #endif /* CONFIG_NUMA_BALANCING */ 2059 } 2060 2061 #ifdef CONFIG_NUMA_BALANCING 2062 #ifdef CONFIG_SCHED_DEBUG 2063 void set_numabalancing_state(bool enabled) 2064 { 2065 if (enabled) 2066 sched_feat_set("NUMA"); 2067 else 2068 sched_feat_set("NO_NUMA"); 2069 } 2070 #else 2071 __read_mostly bool numabalancing_enabled; 2072 2073 void set_numabalancing_state(bool enabled) 2074 { 2075 numabalancing_enabled = enabled; 2076 } 2077 #endif /* CONFIG_SCHED_DEBUG */ 2078 2079 #ifdef CONFIG_PROC_SYSCTL 2080 int sysctl_numa_balancing(struct ctl_table *table, int write, 2081 void __user *buffer, size_t *lenp, loff_t *ppos) 2082 { 2083 struct ctl_table t; 2084 int err; 2085 int state = numabalancing_enabled; 2086 2087 if (write && !capable(CAP_SYS_ADMIN)) 2088 return -EPERM; 2089 2090 t = *table; 2091 t.data = &state; 2092 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 2093 if (err < 0) 2094 return err; 2095 if (write) 2096 set_numabalancing_state(state); 2097 return err; 2098 } 2099 #endif 2100 #endif 2101 2102 /* 2103 * fork()/clone()-time setup: 2104 */ 2105 int sched_fork(unsigned long clone_flags, struct task_struct *p) 2106 { 2107 unsigned long flags; 2108 int cpu = get_cpu(); 2109 2110 __sched_fork(clone_flags, p); 2111 /* 2112 * We mark the process as running here. This guarantees that 2113 * nobody will actually run it, and a signal or other external 2114 * event cannot wake it up and insert it on the runqueue either. 2115 */ 2116 p->state = TASK_RUNNING; 2117 2118 /* 2119 * Make sure we do not leak PI boosting priority to the child. 2120 */ 2121 p->prio = current->normal_prio; 2122 2123 /* 2124 * Revert to default priority/policy on fork if requested. 
2125 */ 2126 if (unlikely(p->sched_reset_on_fork)) { 2127 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 2128 p->policy = SCHED_NORMAL; 2129 p->static_prio = NICE_TO_PRIO(0); 2130 p->rt_priority = 0; 2131 } else if (PRIO_TO_NICE(p->static_prio) < 0) 2132 p->static_prio = NICE_TO_PRIO(0); 2133 2134 p->prio = p->normal_prio = __normal_prio(p); 2135 set_load_weight(p); 2136 2137 /* 2138 * We don't need the reset flag anymore after the fork. It has 2139 * fulfilled its duty: 2140 */ 2141 p->sched_reset_on_fork = 0; 2142 } 2143 2144 if (dl_prio(p->prio)) { 2145 put_cpu(); 2146 return -EAGAIN; 2147 } else if (rt_prio(p->prio)) { 2148 p->sched_class = &rt_sched_class; 2149 } else { 2150 p->sched_class = &fair_sched_class; 2151 } 2152 2153 if (p->sched_class->task_fork) 2154 p->sched_class->task_fork(p); 2155 2156 /* 2157 * The child is not yet in the pid-hash so no cgroup attach races, 2158 * and the cgroup is pinned to this child due to cgroup_fork() 2159 * is ran before sched_fork(). 2160 * 2161 * Silence PROVE_RCU. 2162 */ 2163 raw_spin_lock_irqsave(&p->pi_lock, flags); 2164 set_task_cpu(p, cpu); 2165 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2166 2167 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2168 if (likely(sched_info_on())) 2169 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2170 #endif 2171 #if defined(CONFIG_SMP) 2172 p->on_cpu = 0; 2173 #endif 2174 init_task_preempt_count(p); 2175 #ifdef CONFIG_SMP 2176 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2177 RB_CLEAR_NODE(&p->pushable_dl_tasks); 2178 #endif 2179 2180 put_cpu(); 2181 return 0; 2182 } 2183 2184 unsigned long to_ratio(u64 period, u64 runtime) 2185 { 2186 if (runtime == RUNTIME_INF) 2187 return 1ULL << 20; 2188 2189 /* 2190 * Doing this here saves a lot of checks in all 2191 * the calling paths, and returning zero seems 2192 * safe for them anyway. 2193 */ 2194 if (period == 0) 2195 return 0; 2196 2197 return div64_u64(runtime << 20, period); 2198 } 2199 2200 #ifdef CONFIG_SMP 2201 inline struct dl_bw *dl_bw_of(int i) 2202 { 2203 rcu_lockdep_assert(rcu_read_lock_sched_held(), 2204 "sched RCU must be held"); 2205 return &cpu_rq(i)->rd->dl_bw; 2206 } 2207 2208 static inline int dl_bw_cpus(int i) 2209 { 2210 struct root_domain *rd = cpu_rq(i)->rd; 2211 int cpus = 0; 2212 2213 rcu_lockdep_assert(rcu_read_lock_sched_held(), 2214 "sched RCU must be held"); 2215 for_each_cpu_and(i, rd->span, cpu_active_mask) 2216 cpus++; 2217 2218 return cpus; 2219 } 2220 #else 2221 inline struct dl_bw *dl_bw_of(int i) 2222 { 2223 return &cpu_rq(i)->dl.dl_bw; 2224 } 2225 2226 static inline int dl_bw_cpus(int i) 2227 { 2228 return 1; 2229 } 2230 #endif 2231 2232 /* 2233 * We must be sure that accepting a new task (or allowing changing the 2234 * parameters of an existing one) is consistent with the bandwidth 2235 * constraints. If yes, this function also accordingly updates the currently 2236 * allocated bandwidth to reflect the new situation. 2237 * 2238 * This function is called while holding p's rq->lock. 2239 * 2240 * XXX we should delay bw change until the task's 0-lag point, see 2241 * __setparam_dl(). 2242 */ 2243 static int dl_overflow(struct task_struct *p, int policy, 2244 const struct sched_attr *attr) 2245 { 2246 2247 struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); 2248 u64 period = attr->sched_period ?: attr->sched_deadline; 2249 u64 runtime = attr->sched_runtime; 2250 u64 new_bw = dl_policy(policy) ? 
			to_ratio(period, runtime) : 0;
	int cpus, err = -1;

	if (new_bw == p->dl.dl_bw)
		return 0;

	/*
	 * Whether a task enters, leaves, or stays -deadline but changes
	 * its parameters, we may need to update the total allocated
	 * bandwidth of the container accordingly.
	 */
	raw_spin_lock(&dl_b->lock);
	cpus = dl_bw_cpus(task_cpu(p));
	if (dl_policy(policy) && !task_has_dl_policy(p) &&
	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
		__dl_add(dl_b, new_bw);
		err = 0;
	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
		__dl_clear(dl_b, p->dl.dl_bw);
		__dl_add(dl_b, new_bw);
		err = 0;
	} else if (!dl_policy(policy) && task_has_dl_policy(p)) {
		__dl_clear(dl_b, p->dl.dl_bw);
		err = 0;
	}
	raw_spin_unlock(&dl_b->lock);

	return err;
}

extern void init_dl_bw(struct dl_bw *dl_b);

/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function does some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void wake_up_new_task(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
	/*
	 * Fork balancing, do it here and not earlier because:
	 *  - cpus_allowed can change in the fork path
	 *  - any previously selected cpu might disappear through hotplug
	 */
	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif

	/* Initialize new task's runnable average */
	init_task_runnable_average(p);
	rq = __task_rq_lock(p);
	activate_task(rq, p, 0);
	p->on_rq = TASK_ON_RQ_QUEUED;
	trace_sched_wakeup_new(p, true);
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);
#endif
	task_rq_unlock(rq, p, &flags);
}

#ifdef CONFIG_PREEMPT_NOTIFIERS

static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;

/**
 * preempt_notifier_register - tell me when current is being preempted & rescheduled
 * @notifier: notifier struct to register
 */
void preempt_notifier_register(struct preempt_notifier *notifier)
{
	static_key_slow_inc(&preempt_notifier_key);
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);

/**
 * preempt_notifier_unregister - no longer interested in preemption notifications
 * @notifier: notifier struct to unregister
 *
 * This is *not* safe to call from within a preemption notifier.
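 *
 * A minimal usage sketch (illustrative only; it assumes preempt_notifier_init()
 * and struct preempt_ops from <linux/preempt.h>, and must be run by the task
 * being watched, i.e. current):
 *
 *	static void my_sched_in(struct preempt_notifier *pn, int cpu) { ... }
 *	static void my_sched_out(struct preempt_notifier *pn,
 *				 struct task_struct *next) { ... }
 *
 *	static struct preempt_ops my_ops = {
 *		.sched_in	= my_sched_in,
 *		.sched_out	= my_sched_out,
 *	};
 *	static struct preempt_notifier my_notifier;
 *
 *	preempt_notifier_init(&my_notifier, &my_ops);
 *	preempt_notifier_register(&my_notifier);
 *	...
 *	preempt_notifier_unregister(&my_notifier);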
 */
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
	hlist_del(&notifier->link);
	static_key_slow_dec(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);

static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_in(notifier, raw_smp_processor_id());
}

static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	if (static_key_false(&preempt_notifier_key))
		__fire_sched_in_preempt_notifiers(curr);
}

static void
__fire_sched_out_preempt_notifiers(struct task_struct *curr,
				   struct task_struct *next)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_out(notifier, next);
}

static __always_inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	if (static_key_false(&preempt_notifier_key))
		__fire_sched_out_preempt_notifiers(curr, next);
}

#else /* !CONFIG_PREEMPT_NOTIFIERS */

static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}

static inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}

#endif /* CONFIG_PREEMPT_NOTIFIERS */

/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch() after the context
 * switch.
 *
 * prepare_task_switch() sets up locking and calls architecture-specific
 * hooks.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	trace_sched_switch(prev, next);
	sched_info_switch(rq, prev, next);
	perf_event_task_sched_out(prev, next);
	fire_sched_out_preempt_notifiers(prev, next);
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}

/**
 * finish_task_switch - clean up after a task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch() must be called after the context switch, paired
 * with a prepare_task_switch() call before the context switch.
 * finish_task_switch() will reconcile locking set up by prepare_task_switch(),
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 *
 * The context switch has flipped the stack from under us and restored the
 * local variables which were saved when this task called schedule() in the
 * past. prev == current is still correct but we need to recalculate this_rq
 * because prev may have moved to another CPU.
 */
static struct rq *finish_task_switch(struct task_struct *prev)
	__releases(rq->lock)
{
	struct rq *rq = this_rq();
	struct mm_struct *mm = rq->prev_mm;
	long prev_state;

	rq->prev_mm = NULL;

	/*
	 * A task struct has one reference for the use as "current".
	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
	 * schedule one last time. The schedule call will never return, and
	 * the scheduled task must drop that reference.
	 * The test for TASK_DEAD must occur while the runqueue locks are
	 * still held, otherwise prev could be scheduled on another cpu, die
	 * there before we look at prev->state, and then the reference would
	 * be dropped twice.
	 *		Manfred Spraul <manfred@colorfullife.com>
	 */
	prev_state = prev->state;
	vtime_task_switch(prev);
	finish_arch_switch(prev);
	perf_event_task_sched_in(prev, current);
	finish_lock_switch(rq, prev);
	finish_arch_post_lock_switch();

	fire_sched_in_preempt_notifiers(current);
	if (mm)
		mmdrop(mm);
	if (unlikely(prev_state == TASK_DEAD)) {
		if (prev->sched_class->task_dead)
			prev->sched_class->task_dead(prev);

		/*
		 * Remove function-return probe instances associated with this
		 * task and put them back on the free list.
		 */
		kprobe_flush_task(prev);
		put_task_struct(prev);
	}

	tick_nohz_task_switch(current);
	return rq;
}

#ifdef CONFIG_SMP

/* rq->lock is NOT held, but preemption is disabled */
static void __balance_callback(struct rq *rq)
{
	struct callback_head *head, *next;
	void (*func)(struct rq *rq);
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	head = rq->balance_callback;
	rq->balance_callback = NULL;
	while (head) {
		func = (void (*)(struct rq *))head->func;
		next = head->next;
		head->next = NULL;
		head = next;

		func(rq);
	}
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

static inline void balance_callback(struct rq *rq)
{
	if (unlikely(rq->balance_callback))
		__balance_callback(rq);
}

#else

static inline void balance_callback(struct rq *rq)
{
}

#endif

/**
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 */
asmlinkage __visible void schedule_tail(struct task_struct *prev)
	__releases(rq->lock)
{
	struct rq *rq;

	/* finish_task_switch() drops rq->lock and enables preemption */
	preempt_disable();
	rq = finish_task_switch(prev);
	balance_callback(rq);
	preempt_enable();

	if (current->set_child_tid)
		put_user(task_pid_vnr(current), current->set_child_tid);
}

/*
 * context_switch - switch to the new MM and the new thread's register state.
 */
static inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);

	mm = next->mm;
	oldmm = prev->active_mm;
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
2556 */ 2557 arch_start_context_switch(prev); 2558 2559 if (!mm) { 2560 next->active_mm = oldmm; 2561 atomic_inc(&oldmm->mm_count); 2562 enter_lazy_tlb(oldmm, next); 2563 } else 2564 switch_mm(oldmm, mm, next); 2565 2566 if (!prev->mm) { 2567 prev->active_mm = NULL; 2568 rq->prev_mm = oldmm; 2569 } 2570 /* 2571 * Since the runqueue lock will be released by the next 2572 * task (which is an invalid locking op but in the case 2573 * of the scheduler it's an obvious special-case), so we 2574 * do an early lockdep release here: 2575 */ 2576 lockdep_unpin_lock(&rq->lock); 2577 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2578 2579 /* Here we just switch the register state and the stack. */ 2580 switch_to(prev, next, prev); 2581 barrier(); 2582 2583 return finish_task_switch(prev); 2584 } 2585 2586 /* 2587 * nr_running and nr_context_switches: 2588 * 2589 * externally visible scheduler statistics: current number of runnable 2590 * threads, total number of context switches performed since bootup. 2591 */ 2592 unsigned long nr_running(void) 2593 { 2594 unsigned long i, sum = 0; 2595 2596 for_each_online_cpu(i) 2597 sum += cpu_rq(i)->nr_running; 2598 2599 return sum; 2600 } 2601 2602 /* 2603 * Check if only the current task is running on the cpu. 2604 */ 2605 bool single_task_running(void) 2606 { 2607 if (cpu_rq(smp_processor_id())->nr_running == 1) 2608 return true; 2609 else 2610 return false; 2611 } 2612 EXPORT_SYMBOL(single_task_running); 2613 2614 unsigned long long nr_context_switches(void) 2615 { 2616 int i; 2617 unsigned long long sum = 0; 2618 2619 for_each_possible_cpu(i) 2620 sum += cpu_rq(i)->nr_switches; 2621 2622 return sum; 2623 } 2624 2625 unsigned long nr_iowait(void) 2626 { 2627 unsigned long i, sum = 0; 2628 2629 for_each_possible_cpu(i) 2630 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2631 2632 return sum; 2633 } 2634 2635 unsigned long nr_iowait_cpu(int cpu) 2636 { 2637 struct rq *this = cpu_rq(cpu); 2638 return atomic_read(&this->nr_iowait); 2639 } 2640 2641 void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) 2642 { 2643 struct rq *rq = this_rq(); 2644 *nr_waiters = atomic_read(&rq->nr_iowait); 2645 *load = rq->load.weight; 2646 } 2647 2648 #ifdef CONFIG_SMP 2649 2650 /* 2651 * sched_exec - execve() is a valuable balancing opportunity, because at 2652 * this point the task has the smallest effective memory and cache footprint. 2653 */ 2654 void sched_exec(void) 2655 { 2656 struct task_struct *p = current; 2657 unsigned long flags; 2658 int dest_cpu; 2659 2660 raw_spin_lock_irqsave(&p->pi_lock, flags); 2661 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 2662 if (dest_cpu == smp_processor_id()) 2663 goto unlock; 2664 2665 if (likely(cpu_active(dest_cpu))) { 2666 struct migration_arg arg = { p, dest_cpu }; 2667 2668 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2669 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 2670 return; 2671 } 2672 unlock: 2673 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2674 } 2675 2676 #endif 2677 2678 DEFINE_PER_CPU(struct kernel_stat, kstat); 2679 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); 2680 2681 EXPORT_PER_CPU_SYMBOL(kstat); 2682 EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 2683 2684 /* 2685 * Return accounted runtime for the task. 2686 * In case the task is currently running, return the runtime plus current's 2687 * pending runtime that have not been accounted yet. 
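 *
 * This is what typically backs the per-thread CPU-time clock: a userspace
 * reader such as the following illustrative sketch ends up here via the
 * posix-cpu-timers code:
 *
 *	struct timespec ts;
 *	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);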
2688 */ 2689 unsigned long long task_sched_runtime(struct task_struct *p) 2690 { 2691 unsigned long flags; 2692 struct rq *rq; 2693 u64 ns; 2694 2695 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 2696 /* 2697 * 64-bit doesn't need locks to atomically read a 64bit value. 2698 * So we have a optimization chance when the task's delta_exec is 0. 2699 * Reading ->on_cpu is racy, but this is ok. 2700 * 2701 * If we race with it leaving cpu, we'll take a lock. So we're correct. 2702 * If we race with it entering cpu, unaccounted time is 0. This is 2703 * indistinguishable from the read occurring a few cycles earlier. 2704 * If we see ->on_cpu without ->on_rq, the task is leaving, and has 2705 * been accounted, so we're correct here as well. 2706 */ 2707 if (!p->on_cpu || !task_on_rq_queued(p)) 2708 return p->se.sum_exec_runtime; 2709 #endif 2710 2711 rq = task_rq_lock(p, &flags); 2712 /* 2713 * Must be ->curr _and_ ->on_rq. If dequeued, we would 2714 * project cycles that may never be accounted to this 2715 * thread, breaking clock_gettime(). 2716 */ 2717 if (task_current(rq, p) && task_on_rq_queued(p)) { 2718 update_rq_clock(rq); 2719 p->sched_class->update_curr(rq); 2720 } 2721 ns = p->se.sum_exec_runtime; 2722 task_rq_unlock(rq, p, &flags); 2723 2724 return ns; 2725 } 2726 2727 /* 2728 * This function gets called by the timer code, with HZ frequency. 2729 * We call it with interrupts disabled. 2730 */ 2731 void scheduler_tick(void) 2732 { 2733 int cpu = smp_processor_id(); 2734 struct rq *rq = cpu_rq(cpu); 2735 struct task_struct *curr = rq->curr; 2736 2737 sched_clock_tick(); 2738 2739 raw_spin_lock(&rq->lock); 2740 update_rq_clock(rq); 2741 curr->sched_class->task_tick(rq, curr, 0); 2742 update_cpu_load_active(rq); 2743 calc_global_load_tick(rq); 2744 raw_spin_unlock(&rq->lock); 2745 2746 perf_event_task_tick(); 2747 2748 #ifdef CONFIG_SMP 2749 rq->idle_balance = idle_cpu(cpu); 2750 trigger_load_balance(rq); 2751 #endif 2752 rq_last_tick_reset(rq); 2753 } 2754 2755 #ifdef CONFIG_NO_HZ_FULL 2756 /** 2757 * scheduler_tick_max_deferment 2758 * 2759 * Keep at least one tick per second when a single 2760 * active task is running because the scheduler doesn't 2761 * yet completely support full dynticks environment. 2762 * 2763 * This makes sure that uptime, CFS vruntime, load 2764 * balancing, etc... continue to move forward, even 2765 * with a very low granularity. 2766 * 2767 * Return: Maximum deferment in nanoseconds. 2768 */ 2769 u64 scheduler_tick_max_deferment(void) 2770 { 2771 struct rq *rq = this_rq(); 2772 unsigned long next, now = READ_ONCE(jiffies); 2773 2774 next = rq->last_sched_tick + HZ; 2775 2776 if (time_before_eq(next, now)) 2777 return 0; 2778 2779 return jiffies_to_nsecs(next - now); 2780 } 2781 #endif 2782 2783 notrace unsigned long get_parent_ip(unsigned long addr) 2784 { 2785 if (in_lock_functions(addr)) { 2786 addr = CALLER_ADDR2; 2787 if (in_lock_functions(addr)) 2788 addr = CALLER_ADDR3; 2789 } 2790 return addr; 2791 } 2792 2793 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2794 defined(CONFIG_PREEMPT_TRACER)) 2795 2796 void preempt_count_add(int val) 2797 { 2798 #ifdef CONFIG_DEBUG_PREEMPT 2799 /* 2800 * Underflow? 2801 */ 2802 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2803 return; 2804 #endif 2805 __preempt_count_add(val); 2806 #ifdef CONFIG_DEBUG_PREEMPT 2807 /* 2808 * Spinlock count overflowing soon? 
2809 */ 2810 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2811 PREEMPT_MASK - 10); 2812 #endif 2813 if (preempt_count() == val) { 2814 unsigned long ip = get_parent_ip(CALLER_ADDR1); 2815 #ifdef CONFIG_DEBUG_PREEMPT 2816 current->preempt_disable_ip = ip; 2817 #endif 2818 trace_preempt_off(CALLER_ADDR0, ip); 2819 } 2820 } 2821 EXPORT_SYMBOL(preempt_count_add); 2822 NOKPROBE_SYMBOL(preempt_count_add); 2823 2824 void preempt_count_sub(int val) 2825 { 2826 #ifdef CONFIG_DEBUG_PREEMPT 2827 /* 2828 * Underflow? 2829 */ 2830 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 2831 return; 2832 /* 2833 * Is the spinlock portion underflowing? 2834 */ 2835 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 2836 !(preempt_count() & PREEMPT_MASK))) 2837 return; 2838 #endif 2839 2840 if (preempt_count() == val) 2841 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2842 __preempt_count_sub(val); 2843 } 2844 EXPORT_SYMBOL(preempt_count_sub); 2845 NOKPROBE_SYMBOL(preempt_count_sub); 2846 2847 #endif 2848 2849 /* 2850 * Print scheduling while atomic bug: 2851 */ 2852 static noinline void __schedule_bug(struct task_struct *prev) 2853 { 2854 if (oops_in_progress) 2855 return; 2856 2857 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 2858 prev->comm, prev->pid, preempt_count()); 2859 2860 debug_show_held_locks(prev); 2861 print_modules(); 2862 if (irqs_disabled()) 2863 print_irqtrace_events(prev); 2864 #ifdef CONFIG_DEBUG_PREEMPT 2865 if (in_atomic_preempt_off()) { 2866 pr_err("Preemption disabled at:"); 2867 print_ip_sym(current->preempt_disable_ip); 2868 pr_cont("\n"); 2869 } 2870 #endif 2871 dump_stack(); 2872 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 2873 } 2874 2875 /* 2876 * Various schedule()-time debugging checks and statistics: 2877 */ 2878 static inline void schedule_debug(struct task_struct *prev) 2879 { 2880 #ifdef CONFIG_SCHED_STACK_END_CHECK 2881 BUG_ON(unlikely(task_stack_end_corrupted(prev))); 2882 #endif 2883 /* 2884 * Test if we are atomic. Since do_exit() needs to call into 2885 * schedule() atomically, we ignore that path. Otherwise whine 2886 * if we are scheduling when we should not. 2887 */ 2888 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) 2889 __schedule_bug(prev); 2890 rcu_sleep_check(); 2891 2892 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2893 2894 schedstat_inc(this_rq(), sched_count); 2895 } 2896 2897 /* 2898 * Pick up the highest-prio task: 2899 */ 2900 static inline struct task_struct * 2901 pick_next_task(struct rq *rq, struct task_struct *prev) 2902 { 2903 const struct sched_class *class = &fair_sched_class; 2904 struct task_struct *p; 2905 2906 /* 2907 * Optimization: we know that if all tasks are in 2908 * the fair class we can call that function directly: 2909 */ 2910 if (likely(prev->sched_class == class && 2911 rq->nr_running == rq->cfs.h_nr_running)) { 2912 p = fair_sched_class.pick_next_task(rq, prev); 2913 if (unlikely(p == RETRY_TASK)) 2914 goto again; 2915 2916 /* assumes fair_sched_class->next == idle_sched_class */ 2917 if (unlikely(!p)) 2918 p = idle_sched_class.pick_next_task(rq, prev); 2919 2920 return p; 2921 } 2922 2923 again: 2924 for_each_class(class) { 2925 p = class->pick_next_task(rq, prev); 2926 if (p) { 2927 if (unlikely(p == RETRY_TASK)) 2928 goto again; 2929 return p; 2930 } 2931 } 2932 2933 BUG(); /* the idle class will always have a runnable task */ 2934 } 2935 2936 /* 2937 * __schedule() is the main scheduler function. 
2938 * 2939 * The main means of driving the scheduler and thus entering this function are: 2940 * 2941 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. 2942 * 2943 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return 2944 * paths. For example, see arch/x86/entry_64.S. 2945 * 2946 * To drive preemption between tasks, the scheduler sets the flag in timer 2947 * interrupt handler scheduler_tick(). 2948 * 2949 * 3. Wakeups don't really cause entry into schedule(). They add a 2950 * task to the run-queue and that's it. 2951 * 2952 * Now, if the new task added to the run-queue preempts the current 2953 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 2954 * called on the nearest possible occasion: 2955 * 2956 * - If the kernel is preemptible (CONFIG_PREEMPT=y): 2957 * 2958 * - in syscall or exception context, at the next outmost 2959 * preempt_enable(). (this might be as soon as the wake_up()'s 2960 * spin_unlock()!) 2961 * 2962 * - in IRQ context, return from interrupt-handler to 2963 * preemptible context 2964 * 2965 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) 2966 * then at the next: 2967 * 2968 * - cond_resched() call 2969 * - explicit schedule() call 2970 * - return from syscall or exception to user-space 2971 * - return from interrupt-handler to user-space 2972 * 2973 * WARNING: must be called with preemption disabled! 2974 */ 2975 static void __sched __schedule(void) 2976 { 2977 struct task_struct *prev, *next; 2978 unsigned long *switch_count; 2979 struct rq *rq; 2980 int cpu; 2981 2982 cpu = smp_processor_id(); 2983 rq = cpu_rq(cpu); 2984 rcu_note_context_switch(); 2985 prev = rq->curr; 2986 2987 schedule_debug(prev); 2988 2989 if (sched_feat(HRTICK)) 2990 hrtick_clear(rq); 2991 2992 /* 2993 * Make sure that signal_pending_state()->signal_pending() below 2994 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) 2995 * done by the caller to avoid the race with signal_wake_up(). 2996 */ 2997 smp_mb__before_spinlock(); 2998 raw_spin_lock_irq(&rq->lock); 2999 lockdep_pin_lock(&rq->lock); 3000 3001 rq->clock_skip_update <<= 1; /* promote REQ to ACT */ 3002 3003 switch_count = &prev->nivcsw; 3004 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3005 if (unlikely(signal_pending_state(prev->state, prev))) { 3006 prev->state = TASK_RUNNING; 3007 } else { 3008 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3009 prev->on_rq = 0; 3010 3011 /* 3012 * If a worker went to sleep, notify and ask workqueue 3013 * whether it wants to wake up a task to maintain 3014 * concurrency. 
3015 */ 3016 if (prev->flags & PF_WQ_WORKER) { 3017 struct task_struct *to_wakeup; 3018 3019 to_wakeup = wq_worker_sleeping(prev, cpu); 3020 if (to_wakeup) 3021 try_to_wake_up_local(to_wakeup); 3022 } 3023 } 3024 switch_count = &prev->nvcsw; 3025 } 3026 3027 if (task_on_rq_queued(prev)) 3028 update_rq_clock(rq); 3029 3030 next = pick_next_task(rq, prev); 3031 clear_tsk_need_resched(prev); 3032 clear_preempt_need_resched(); 3033 rq->clock_skip_update = 0; 3034 3035 if (likely(prev != next)) { 3036 rq->nr_switches++; 3037 rq->curr = next; 3038 ++*switch_count; 3039 3040 rq = context_switch(rq, prev, next); /* unlocks the rq */ 3041 cpu = cpu_of(rq); 3042 } else { 3043 lockdep_unpin_lock(&rq->lock); 3044 raw_spin_unlock_irq(&rq->lock); 3045 } 3046 3047 balance_callback(rq); 3048 } 3049 3050 static inline void sched_submit_work(struct task_struct *tsk) 3051 { 3052 if (!tsk->state || tsk_is_pi_blocked(tsk)) 3053 return; 3054 /* 3055 * If we are going to sleep and we have plugged IO queued, 3056 * make sure to submit it to avoid deadlocks. 3057 */ 3058 if (blk_needs_flush_plug(tsk)) 3059 blk_schedule_flush_plug(tsk); 3060 } 3061 3062 asmlinkage __visible void __sched schedule(void) 3063 { 3064 struct task_struct *tsk = current; 3065 3066 sched_submit_work(tsk); 3067 do { 3068 preempt_disable(); 3069 __schedule(); 3070 sched_preempt_enable_no_resched(); 3071 } while (need_resched()); 3072 } 3073 EXPORT_SYMBOL(schedule); 3074 3075 #ifdef CONFIG_CONTEXT_TRACKING 3076 asmlinkage __visible void __sched schedule_user(void) 3077 { 3078 /* 3079 * If we come here after a random call to set_need_resched(), 3080 * or we have been woken up remotely but the IPI has not yet arrived, 3081 * we haven't yet exited the RCU idle mode. Do it here manually until 3082 * we find a better solution. 3083 * 3084 * NB: There are buggy callers of this function. Ideally we 3085 * should warn if prev_state != CONTEXT_USER, but that will trigger 3086 * too frequently to make sense yet. 3087 */ 3088 enum ctx_state prev_state = exception_enter(); 3089 schedule(); 3090 exception_exit(prev_state); 3091 } 3092 #endif 3093 3094 /** 3095 * schedule_preempt_disabled - called with preemption disabled 3096 * 3097 * Returns with preemption disabled. Note: preempt_count must be 1 3098 */ 3099 void __sched schedule_preempt_disabled(void) 3100 { 3101 sched_preempt_enable_no_resched(); 3102 schedule(); 3103 preempt_disable(); 3104 } 3105 3106 static void __sched notrace preempt_schedule_common(void) 3107 { 3108 do { 3109 preempt_active_enter(); 3110 __schedule(); 3111 preempt_active_exit(); 3112 3113 /* 3114 * Check again in case we missed a preemption opportunity 3115 * between schedule and now. 3116 */ 3117 } while (need_resched()); 3118 } 3119 3120 #ifdef CONFIG_PREEMPT 3121 /* 3122 * this is the entry point to schedule() from in-kernel preemption 3123 * off of preempt_enable. Kernel preemptions off return from interrupt 3124 * occur there and call schedule directly. 3125 */ 3126 asmlinkage __visible void __sched notrace preempt_schedule(void) 3127 { 3128 /* 3129 * If there is a non-zero preempt_count or interrupts are disabled, 3130 * we do not want to preempt the current task. Just return.. 
3131 */ 3132 if (likely(!preemptible())) 3133 return; 3134 3135 preempt_schedule_common(); 3136 } 3137 NOKPROBE_SYMBOL(preempt_schedule); 3138 EXPORT_SYMBOL(preempt_schedule); 3139 3140 /** 3141 * preempt_schedule_notrace - preempt_schedule called by tracing 3142 * 3143 * The tracing infrastructure uses preempt_enable_notrace to prevent 3144 * recursion and tracing preempt enabling caused by the tracing 3145 * infrastructure itself. But as tracing can happen in areas coming 3146 * from userspace or just about to enter userspace, a preempt enable 3147 * can occur before user_exit() is called. This will cause the scheduler 3148 * to be called when the system is still in usermode. 3149 * 3150 * To prevent this, the preempt_enable_notrace will use this function 3151 * instead of preempt_schedule() to exit user context if needed before 3152 * calling the scheduler. 3153 */ 3154 asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) 3155 { 3156 enum ctx_state prev_ctx; 3157 3158 if (likely(!preemptible())) 3159 return; 3160 3161 do { 3162 /* 3163 * Use raw __prempt_count() ops that don't call function. 3164 * We can't call functions before disabling preemption which 3165 * disarm preemption tracing recursions. 3166 */ 3167 __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); 3168 barrier(); 3169 /* 3170 * Needs preempt disabled in case user_exit() is traced 3171 * and the tracer calls preempt_enable_notrace() causing 3172 * an infinite recursion. 3173 */ 3174 prev_ctx = exception_enter(); 3175 __schedule(); 3176 exception_exit(prev_ctx); 3177 3178 barrier(); 3179 __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); 3180 } while (need_resched()); 3181 } 3182 EXPORT_SYMBOL_GPL(preempt_schedule_notrace); 3183 3184 #endif /* CONFIG_PREEMPT */ 3185 3186 /* 3187 * this is the entry point to schedule() from kernel preemption 3188 * off of irq context. 3189 * Note, that this is called and return with irqs disabled. This will 3190 * protect us against recursive calling from irq. 3191 */ 3192 asmlinkage __visible void __sched preempt_schedule_irq(void) 3193 { 3194 enum ctx_state prev_state; 3195 3196 /* Catch callers which need to be fixed */ 3197 BUG_ON(preempt_count() || !irqs_disabled()); 3198 3199 prev_state = exception_enter(); 3200 3201 do { 3202 preempt_active_enter(); 3203 local_irq_enable(); 3204 __schedule(); 3205 local_irq_disable(); 3206 preempt_active_exit(); 3207 } while (need_resched()); 3208 3209 exception_exit(prev_state); 3210 } 3211 3212 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 3213 void *key) 3214 { 3215 return try_to_wake_up(curr->private, mode, wake_flags); 3216 } 3217 EXPORT_SYMBOL(default_wake_function); 3218 3219 #ifdef CONFIG_RT_MUTEXES 3220 3221 /* 3222 * rt_mutex_setprio - set the current priority of a task 3223 * @p: task 3224 * @prio: prio value (kernel-internal form) 3225 * 3226 * This function changes the 'effective' priority of a task. It does 3227 * not touch ->normal_prio like __setscheduler(). 3228 * 3229 * Used by the rt_mutex code to implement priority inheritance 3230 * logic. Call site only calls if the priority of the task changed. 3231 */ 3232 void rt_mutex_setprio(struct task_struct *p, int prio) 3233 { 3234 int oldprio, queued, running, enqueue_flag = 0; 3235 struct rq *rq; 3236 const struct sched_class *prev_class; 3237 3238 BUG_ON(prio > MAX_PRIO); 3239 3240 rq = __task_rq_lock(p); 3241 3242 /* 3243 * Idle task boosting is a nono in general. 
There is one 3244 * exception, when PREEMPT_RT and NOHZ is active: 3245 * 3246 * The idle task calls get_next_timer_interrupt() and holds 3247 * the timer wheel base->lock on the CPU and another CPU wants 3248 * to access the timer (probably to cancel it). We can safely 3249 * ignore the boosting request, as the idle CPU runs this code 3250 * with interrupts disabled and will complete the lock 3251 * protected section without being interrupted. So there is no 3252 * real need to boost. 3253 */ 3254 if (unlikely(p == rq->idle)) { 3255 WARN_ON(p != rq->curr); 3256 WARN_ON(p->pi_blocked_on); 3257 goto out_unlock; 3258 } 3259 3260 trace_sched_pi_setprio(p, prio); 3261 oldprio = p->prio; 3262 prev_class = p->sched_class; 3263 queued = task_on_rq_queued(p); 3264 running = task_current(rq, p); 3265 if (queued) 3266 dequeue_task(rq, p, 0); 3267 if (running) 3268 put_prev_task(rq, p); 3269 3270 /* 3271 * Boosting condition are: 3272 * 1. -rt task is running and holds mutex A 3273 * --> -dl task blocks on mutex A 3274 * 3275 * 2. -dl task is running and holds mutex A 3276 * --> -dl task blocks on mutex A and could preempt the 3277 * running task 3278 */ 3279 if (dl_prio(prio)) { 3280 struct task_struct *pi_task = rt_mutex_get_top_task(p); 3281 if (!dl_prio(p->normal_prio) || 3282 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { 3283 p->dl.dl_boosted = 1; 3284 enqueue_flag = ENQUEUE_REPLENISH; 3285 } else 3286 p->dl.dl_boosted = 0; 3287 p->sched_class = &dl_sched_class; 3288 } else if (rt_prio(prio)) { 3289 if (dl_prio(oldprio)) 3290 p->dl.dl_boosted = 0; 3291 if (oldprio < prio) 3292 enqueue_flag = ENQUEUE_HEAD; 3293 p->sched_class = &rt_sched_class; 3294 } else { 3295 if (dl_prio(oldprio)) 3296 p->dl.dl_boosted = 0; 3297 if (rt_prio(oldprio)) 3298 p->rt.timeout = 0; 3299 p->sched_class = &fair_sched_class; 3300 } 3301 3302 p->prio = prio; 3303 3304 if (running) 3305 p->sched_class->set_curr_task(rq); 3306 if (queued) 3307 enqueue_task(rq, p, enqueue_flag); 3308 3309 check_class_changed(rq, p, prev_class, oldprio); 3310 out_unlock: 3311 preempt_disable(); /* avoid rq from going away on us */ 3312 __task_rq_unlock(rq); 3313 3314 balance_callback(rq); 3315 preempt_enable(); 3316 } 3317 #endif 3318 3319 void set_user_nice(struct task_struct *p, long nice) 3320 { 3321 int old_prio, delta, queued; 3322 unsigned long flags; 3323 struct rq *rq; 3324 3325 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) 3326 return; 3327 /* 3328 * We have to be careful, if called from sys_setpriority(), 3329 * the task might be in the middle of scheduling on another CPU. 
3330 */ 3331 rq = task_rq_lock(p, &flags); 3332 /* 3333 * The RT priorities are set via sched_setscheduler(), but we still 3334 * allow the 'normal' nice value to be set - but as expected 3335 * it wont have any effect on scheduling until the task is 3336 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: 3337 */ 3338 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 3339 p->static_prio = NICE_TO_PRIO(nice); 3340 goto out_unlock; 3341 } 3342 queued = task_on_rq_queued(p); 3343 if (queued) 3344 dequeue_task(rq, p, 0); 3345 3346 p->static_prio = NICE_TO_PRIO(nice); 3347 set_load_weight(p); 3348 old_prio = p->prio; 3349 p->prio = effective_prio(p); 3350 delta = p->prio - old_prio; 3351 3352 if (queued) { 3353 enqueue_task(rq, p, 0); 3354 /* 3355 * If the task increased its priority or is running and 3356 * lowered its priority, then reschedule its CPU: 3357 */ 3358 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3359 resched_curr(rq); 3360 } 3361 out_unlock: 3362 task_rq_unlock(rq, p, &flags); 3363 } 3364 EXPORT_SYMBOL(set_user_nice); 3365 3366 /* 3367 * can_nice - check if a task can reduce its nice value 3368 * @p: task 3369 * @nice: nice value 3370 */ 3371 int can_nice(const struct task_struct *p, const int nice) 3372 { 3373 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3374 int nice_rlim = nice_to_rlimit(nice); 3375 3376 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3377 capable(CAP_SYS_NICE)); 3378 } 3379 3380 #ifdef __ARCH_WANT_SYS_NICE 3381 3382 /* 3383 * sys_nice - change the priority of the current process. 3384 * @increment: priority increment 3385 * 3386 * sys_setpriority is a more generic, but much slower function that 3387 * does similar things. 3388 */ 3389 SYSCALL_DEFINE1(nice, int, increment) 3390 { 3391 long nice, retval; 3392 3393 /* 3394 * Setpriority might change our priority at the same moment. 3395 * We don't have to worry. Conceptually one call occurs first 3396 * and we have a single winner. 3397 */ 3398 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); 3399 nice = task_nice(current) + increment; 3400 3401 nice = clamp_val(nice, MIN_NICE, MAX_NICE); 3402 if (increment < 0 && !can_nice(current, nice)) 3403 return -EPERM; 3404 3405 retval = security_task_setnice(current, nice); 3406 if (retval) 3407 return retval; 3408 3409 set_user_nice(current, nice); 3410 return 0; 3411 } 3412 3413 #endif 3414 3415 /** 3416 * task_prio - return the priority value of a given task. 3417 * @p: the task in question. 3418 * 3419 * Return: The priority value as seen by users in /proc. 3420 * RT tasks are offset by -200. Normal tasks are centered 3421 * around 0, value goes from -16 to +15. 3422 */ 3423 int task_prio(const struct task_struct *p) 3424 { 3425 return p->prio - MAX_RT_PRIO; 3426 } 3427 3428 /** 3429 * idle_cpu - is a given cpu idle currently? 3430 * @cpu: the processor in question. 3431 * 3432 * Return: 1 if the CPU is currently idle. 0 otherwise. 3433 */ 3434 int idle_cpu(int cpu) 3435 { 3436 struct rq *rq = cpu_rq(cpu); 3437 3438 if (rq->curr != rq->idle) 3439 return 0; 3440 3441 if (rq->nr_running) 3442 return 0; 3443 3444 #ifdef CONFIG_SMP 3445 if (!llist_empty(&rq->wake_list)) 3446 return 0; 3447 #endif 3448 3449 return 1; 3450 } 3451 3452 /** 3453 * idle_task - return the idle task for a given cpu. 3454 * @cpu: the processor in question. 3455 * 3456 * Return: The idle task for the cpu @cpu. 
3457 */ 3458 struct task_struct *idle_task(int cpu) 3459 { 3460 return cpu_rq(cpu)->idle; 3461 } 3462 3463 /** 3464 * find_process_by_pid - find a process with a matching PID value. 3465 * @pid: the pid in question. 3466 * 3467 * The task of @pid, if found. %NULL otherwise. 3468 */ 3469 static struct task_struct *find_process_by_pid(pid_t pid) 3470 { 3471 return pid ? find_task_by_vpid(pid) : current; 3472 } 3473 3474 /* 3475 * This function initializes the sched_dl_entity of a newly becoming 3476 * SCHED_DEADLINE task. 3477 * 3478 * Only the static values are considered here, the actual runtime and the 3479 * absolute deadline will be properly calculated when the task is enqueued 3480 * for the first time with its new policy. 3481 */ 3482 static void 3483 __setparam_dl(struct task_struct *p, const struct sched_attr *attr) 3484 { 3485 struct sched_dl_entity *dl_se = &p->dl; 3486 3487 dl_se->dl_runtime = attr->sched_runtime; 3488 dl_se->dl_deadline = attr->sched_deadline; 3489 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; 3490 dl_se->flags = attr->sched_flags; 3491 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3492 3493 /* 3494 * Changing the parameters of a task is 'tricky' and we're not doing 3495 * the correct thing -- also see task_dead_dl() and switched_from_dl(). 3496 * 3497 * What we SHOULD do is delay the bandwidth release until the 0-lag 3498 * point. This would include retaining the task_struct until that time 3499 * and change dl_overflow() to not immediately decrement the current 3500 * amount. 3501 * 3502 * Instead we retain the current runtime/deadline and let the new 3503 * parameters take effect after the current reservation period lapses. 3504 * This is safe (albeit pessimistic) because the 0-lag point is always 3505 * before the current scheduling deadline. 3506 * 3507 * We can still have temporary overloads because we do not delay the 3508 * change in bandwidth until that time; so admission control is 3509 * not on the safe side. It does however guarantee tasks will never 3510 * consume more than promised. 3511 */ 3512 } 3513 3514 /* 3515 * sched_setparam() passes in -1 for its policy, to let the functions 3516 * it calls know not to change it. 3517 */ 3518 #define SETPARAM_POLICY -1 3519 3520 static void __setscheduler_params(struct task_struct *p, 3521 const struct sched_attr *attr) 3522 { 3523 int policy = attr->sched_policy; 3524 3525 if (policy == SETPARAM_POLICY) 3526 policy = p->policy; 3527 3528 p->policy = policy; 3529 3530 if (dl_policy(policy)) 3531 __setparam_dl(p, attr); 3532 else if (fair_policy(policy)) 3533 p->static_prio = NICE_TO_PRIO(attr->sched_nice); 3534 3535 /* 3536 * __sched_setscheduler() ensures attr->sched_priority == 0 when 3537 * !rt_policy. Always setting this ensures that things like 3538 * getparam()/getattr() don't report silly values for !rt tasks. 3539 */ 3540 p->rt_priority = attr->sched_priority; 3541 p->normal_prio = normal_prio(p); 3542 set_load_weight(p); 3543 } 3544 3545 /* Actually do priority change: must hold pi & rq lock. */ 3546 static void __setscheduler(struct rq *rq, struct task_struct *p, 3547 const struct sched_attr *attr, bool keep_boost) 3548 { 3549 __setscheduler_params(p, attr); 3550 3551 /* 3552 * Keep a potential priority boosting if called from 3553 * sched_setscheduler(). 
3554 */ 3555 if (keep_boost) 3556 p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); 3557 else 3558 p->prio = normal_prio(p); 3559 3560 if (dl_prio(p->prio)) 3561 p->sched_class = &dl_sched_class; 3562 else if (rt_prio(p->prio)) 3563 p->sched_class = &rt_sched_class; 3564 else 3565 p->sched_class = &fair_sched_class; 3566 } 3567 3568 static void 3569 __getparam_dl(struct task_struct *p, struct sched_attr *attr) 3570 { 3571 struct sched_dl_entity *dl_se = &p->dl; 3572 3573 attr->sched_priority = p->rt_priority; 3574 attr->sched_runtime = dl_se->dl_runtime; 3575 attr->sched_deadline = dl_se->dl_deadline; 3576 attr->sched_period = dl_se->dl_period; 3577 attr->sched_flags = dl_se->flags; 3578 } 3579 3580 /* 3581 * This function validates the new parameters of a -deadline task. 3582 * We ask for the deadline not being zero, and greater or equal 3583 * than the runtime, as well as the period of being zero or 3584 * greater than deadline. Furthermore, we have to be sure that 3585 * user parameters are above the internal resolution of 1us (we 3586 * check sched_runtime only since it is always the smaller one) and 3587 * below 2^63 ns (we have to check both sched_deadline and 3588 * sched_period, as the latter can be zero). 3589 */ 3590 static bool 3591 __checkparam_dl(const struct sched_attr *attr) 3592 { 3593 /* deadline != 0 */ 3594 if (attr->sched_deadline == 0) 3595 return false; 3596 3597 /* 3598 * Since we truncate DL_SCALE bits, make sure we're at least 3599 * that big. 3600 */ 3601 if (attr->sched_runtime < (1ULL << DL_SCALE)) 3602 return false; 3603 3604 /* 3605 * Since we use the MSB for wrap-around and sign issues, make 3606 * sure it's not set (mind that period can be equal to zero). 3607 */ 3608 if (attr->sched_deadline & (1ULL << 63) || 3609 attr->sched_period & (1ULL << 63)) 3610 return false; 3611 3612 /* runtime <= deadline <= period (if period != 0) */ 3613 if ((attr->sched_period != 0 && 3614 attr->sched_period < attr->sched_deadline) || 3615 attr->sched_deadline < attr->sched_runtime) 3616 return false; 3617 3618 return true; 3619 } 3620 3621 /* 3622 * check the target process has a UID that matches the current process's 3623 */ 3624 static bool check_same_owner(struct task_struct *p) 3625 { 3626 const struct cred *cred = current_cred(), *pcred; 3627 bool match; 3628 3629 rcu_read_lock(); 3630 pcred = __task_cred(p); 3631 match = (uid_eq(cred->euid, pcred->euid) || 3632 uid_eq(cred->euid, pcred->uid)); 3633 rcu_read_unlock(); 3634 return match; 3635 } 3636 3637 static bool dl_param_changed(struct task_struct *p, 3638 const struct sched_attr *attr) 3639 { 3640 struct sched_dl_entity *dl_se = &p->dl; 3641 3642 if (dl_se->dl_runtime != attr->sched_runtime || 3643 dl_se->dl_deadline != attr->sched_deadline || 3644 dl_se->dl_period != attr->sched_period || 3645 dl_se->flags != attr->sched_flags) 3646 return true; 3647 3648 return false; 3649 } 3650 3651 static int __sched_setscheduler(struct task_struct *p, 3652 const struct sched_attr *attr, 3653 bool user, bool pi) 3654 { 3655 int newprio = dl_policy(attr->sched_policy) ? 
MAX_DL_PRIO - 1 : 3656 MAX_RT_PRIO - 1 - attr->sched_priority; 3657 int retval, oldprio, oldpolicy = -1, queued, running; 3658 int new_effective_prio, policy = attr->sched_policy; 3659 unsigned long flags; 3660 const struct sched_class *prev_class; 3661 struct rq *rq; 3662 int reset_on_fork; 3663 3664 /* may grab non-irq protected spin_locks */ 3665 BUG_ON(in_interrupt()); 3666 recheck: 3667 /* double check policy once rq lock held */ 3668 if (policy < 0) { 3669 reset_on_fork = p->sched_reset_on_fork; 3670 policy = oldpolicy = p->policy; 3671 } else { 3672 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); 3673 3674 if (policy != SCHED_DEADLINE && 3675 policy != SCHED_FIFO && policy != SCHED_RR && 3676 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3677 policy != SCHED_IDLE) 3678 return -EINVAL; 3679 } 3680 3681 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) 3682 return -EINVAL; 3683 3684 /* 3685 * Valid priorities for SCHED_FIFO and SCHED_RR are 3686 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3687 * SCHED_BATCH and SCHED_IDLE is 0. 3688 */ 3689 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || 3690 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) 3691 return -EINVAL; 3692 if ((dl_policy(policy) && !__checkparam_dl(attr)) || 3693 (rt_policy(policy) != (attr->sched_priority != 0))) 3694 return -EINVAL; 3695 3696 /* 3697 * Allow unprivileged RT tasks to decrease priority: 3698 */ 3699 if (user && !capable(CAP_SYS_NICE)) { 3700 if (fair_policy(policy)) { 3701 if (attr->sched_nice < task_nice(p) && 3702 !can_nice(p, attr->sched_nice)) 3703 return -EPERM; 3704 } 3705 3706 if (rt_policy(policy)) { 3707 unsigned long rlim_rtprio = 3708 task_rlimit(p, RLIMIT_RTPRIO); 3709 3710 /* can't set/change the rt policy */ 3711 if (policy != p->policy && !rlim_rtprio) 3712 return -EPERM; 3713 3714 /* can't increase priority */ 3715 if (attr->sched_priority > p->rt_priority && 3716 attr->sched_priority > rlim_rtprio) 3717 return -EPERM; 3718 } 3719 3720 /* 3721 * Can't set/change SCHED_DEADLINE policy at all for now 3722 * (safest behavior); in the future we would like to allow 3723 * unprivileged DL tasks to increase their relative deadline 3724 * or reduce their runtime (both ways reducing utilization) 3725 */ 3726 if (dl_policy(policy)) 3727 return -EPERM; 3728 3729 /* 3730 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3731 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3732 */ 3733 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3734 if (!can_nice(p, task_nice(p))) 3735 return -EPERM; 3736 } 3737 3738 /* can't change other user's priorities */ 3739 if (!check_same_owner(p)) 3740 return -EPERM; 3741 3742 /* Normal users shall not reset the sched_reset_on_fork flag */ 3743 if (p->sched_reset_on_fork && !reset_on_fork) 3744 return -EPERM; 3745 } 3746 3747 if (user) { 3748 retval = security_task_setscheduler(p); 3749 if (retval) 3750 return retval; 3751 } 3752 3753 /* 3754 * make sure no PI-waiters arrive (or leave) while we are 3755 * changing the priority of the task: 3756 * 3757 * To be able to change p->policy safely, the appropriate 3758 * runqueue lock must be held. 3759 */ 3760 rq = task_rq_lock(p, &flags); 3761 3762 /* 3763 * Changing the policy of the stop threads its a very bad idea 3764 */ 3765 if (p == rq->stop) { 3766 task_rq_unlock(rq, p, &flags); 3767 return -EINVAL; 3768 } 3769 3770 /* 3771 * If not changing anything there's no need to proceed further, 3772 * but store a possible modification of reset_on_fork. 
3773 */ 3774 if (unlikely(policy == p->policy)) { 3775 if (fair_policy(policy) && attr->sched_nice != task_nice(p)) 3776 goto change; 3777 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3778 goto change; 3779 if (dl_policy(policy) && dl_param_changed(p, attr)) 3780 goto change; 3781 3782 p->sched_reset_on_fork = reset_on_fork; 3783 task_rq_unlock(rq, p, &flags); 3784 return 0; 3785 } 3786 change: 3787 3788 if (user) { 3789 #ifdef CONFIG_RT_GROUP_SCHED 3790 /* 3791 * Do not allow realtime tasks into groups that have no runtime 3792 * assigned. 3793 */ 3794 if (rt_bandwidth_enabled() && rt_policy(policy) && 3795 task_group(p)->rt_bandwidth.rt_runtime == 0 && 3796 !task_group_is_autogroup(task_group(p))) { 3797 task_rq_unlock(rq, p, &flags); 3798 return -EPERM; 3799 } 3800 #endif 3801 #ifdef CONFIG_SMP 3802 if (dl_bandwidth_enabled() && dl_policy(policy)) { 3803 cpumask_t *span = rq->rd->span; 3804 3805 /* 3806 * Don't allow tasks with an affinity mask smaller than 3807 * the entire root_domain to become SCHED_DEADLINE. We 3808 * will also fail if there's no bandwidth available. 3809 */ 3810 if (!cpumask_subset(span, &p->cpus_allowed) || 3811 rq->rd->dl_bw.bw == 0) { 3812 task_rq_unlock(rq, p, &flags); 3813 return -EPERM; 3814 } 3815 } 3816 #endif 3817 } 3818 3819 /* recheck policy now with rq lock held */ 3820 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3821 policy = oldpolicy = -1; 3822 task_rq_unlock(rq, p, &flags); 3823 goto recheck; 3824 } 3825 3826 /* 3827 * If setscheduling to SCHED_DEADLINE (or changing the parameters 3828 * of a SCHED_DEADLINE task) we need to check if enough bandwidth 3829 * is available. 3830 */ 3831 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { 3832 task_rq_unlock(rq, p, &flags); 3833 return -EBUSY; 3834 } 3835 3836 p->sched_reset_on_fork = reset_on_fork; 3837 oldprio = p->prio; 3838 3839 if (pi) { 3840 /* 3841 * Take priority boosted tasks into account. If the new 3842 * effective priority is unchanged, we just store the new 3843 * normal parameters and do not touch the scheduler class and 3844 * the runqueue. This will be done when the task deboost 3845 * itself. 3846 */ 3847 new_effective_prio = rt_mutex_get_effective_prio(p, newprio); 3848 if (new_effective_prio == oldprio) { 3849 __setscheduler_params(p, attr); 3850 task_rq_unlock(rq, p, &flags); 3851 return 0; 3852 } 3853 } 3854 3855 queued = task_on_rq_queued(p); 3856 running = task_current(rq, p); 3857 if (queued) 3858 dequeue_task(rq, p, 0); 3859 if (running) 3860 put_prev_task(rq, p); 3861 3862 prev_class = p->sched_class; 3863 __setscheduler(rq, p, attr, pi); 3864 3865 if (running) 3866 p->sched_class->set_curr_task(rq); 3867 if (queued) { 3868 /* 3869 * We enqueue to tail when the priority of a task is 3870 * increased (user space view). 3871 */ 3872 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); 3873 } 3874 3875 check_class_changed(rq, p, prev_class, oldprio); 3876 preempt_disable(); /* avoid rq from going away on us */ 3877 task_rq_unlock(rq, p, &flags); 3878 3879 if (pi) 3880 rt_mutex_adjust_pi(p); 3881 3882 /* 3883 * Run balance callbacks after we've adjusted the PI chain. 
3884 */ 3885 balance_callback(rq); 3886 preempt_enable(); 3887 3888 return 0; 3889 } 3890 3891 static int _sched_setscheduler(struct task_struct *p, int policy, 3892 const struct sched_param *param, bool check) 3893 { 3894 struct sched_attr attr = { 3895 .sched_policy = policy, 3896 .sched_priority = param->sched_priority, 3897 .sched_nice = PRIO_TO_NICE(p->static_prio), 3898 }; 3899 3900 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ 3901 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { 3902 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3903 policy &= ~SCHED_RESET_ON_FORK; 3904 attr.sched_policy = policy; 3905 } 3906 3907 return __sched_setscheduler(p, &attr, check, true); 3908 } 3909 /** 3910 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3911 * @p: the task in question. 3912 * @policy: new policy. 3913 * @param: structure containing the new RT priority. 3914 * 3915 * Return: 0 on success. An error code otherwise. 3916 * 3917 * NOTE that the task may be already dead. 3918 */ 3919 int sched_setscheduler(struct task_struct *p, int policy, 3920 const struct sched_param *param) 3921 { 3922 return _sched_setscheduler(p, policy, param, true); 3923 } 3924 EXPORT_SYMBOL_GPL(sched_setscheduler); 3925 3926 int sched_setattr(struct task_struct *p, const struct sched_attr *attr) 3927 { 3928 return __sched_setscheduler(p, attr, true, true); 3929 } 3930 EXPORT_SYMBOL_GPL(sched_setattr); 3931 3932 /** 3933 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3934 * @p: the task in question. 3935 * @policy: new policy. 3936 * @param: structure containing the new RT priority. 3937 * 3938 * Just like sched_setscheduler, only don't bother checking if the 3939 * current context has permission. For example, this is needed in 3940 * stop_machine(): we create temporary high priority worker threads, 3941 * but our caller might not have that capability. 3942 * 3943 * Return: 0 on success. An error code otherwise. 3944 */ 3945 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3946 const struct sched_param *param) 3947 { 3948 return _sched_setscheduler(p, policy, param, false); 3949 } 3950 3951 static int 3952 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3953 { 3954 struct sched_param lparam; 3955 struct task_struct *p; 3956 int retval; 3957 3958 if (!param || pid < 0) 3959 return -EINVAL; 3960 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 3961 return -EFAULT; 3962 3963 rcu_read_lock(); 3964 retval = -ESRCH; 3965 p = find_process_by_pid(pid); 3966 if (p != NULL) 3967 retval = sched_setscheduler(p, policy, &lparam); 3968 rcu_read_unlock(); 3969 3970 return retval; 3971 } 3972 3973 /* 3974 * Mimics kernel/events/core.c perf_copy_attr(). 3975 */ 3976 static int sched_copy_attr(struct sched_attr __user *uattr, 3977 struct sched_attr *attr) 3978 { 3979 u32 size; 3980 int ret; 3981 3982 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) 3983 return -EFAULT; 3984 3985 /* 3986 * zero the full structure, so that a short copy will be nice. 
3987 */ 3988 memset(attr, 0, sizeof(*attr)); 3989 3990 ret = get_user(size, &uattr->size); 3991 if (ret) 3992 return ret; 3993 3994 if (size > PAGE_SIZE) /* silly large */ 3995 goto err_size; 3996 3997 if (!size) /* abi compat */ 3998 size = SCHED_ATTR_SIZE_VER0; 3999 4000 if (size < SCHED_ATTR_SIZE_VER0) 4001 goto err_size; 4002 4003 /* 4004 * If we're handed a bigger struct than we know of, 4005 * ensure all the unknown bits are 0 - i.e. new 4006 * user-space does not rely on any kernel feature 4007 * extensions we dont know about yet. 4008 */ 4009 if (size > sizeof(*attr)) { 4010 unsigned char __user *addr; 4011 unsigned char __user *end; 4012 unsigned char val; 4013 4014 addr = (void __user *)uattr + sizeof(*attr); 4015 end = (void __user *)uattr + size; 4016 4017 for (; addr < end; addr++) { 4018 ret = get_user(val, addr); 4019 if (ret) 4020 return ret; 4021 if (val) 4022 goto err_size; 4023 } 4024 size = sizeof(*attr); 4025 } 4026 4027 ret = copy_from_user(attr, uattr, size); 4028 if (ret) 4029 return -EFAULT; 4030 4031 /* 4032 * XXX: do we want to be lenient like existing syscalls; or do we want 4033 * to be strict and return an error on out-of-bounds values? 4034 */ 4035 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 4036 4037 return 0; 4038 4039 err_size: 4040 put_user(sizeof(*attr), &uattr->size); 4041 return -E2BIG; 4042 } 4043 4044 /** 4045 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4046 * @pid: the pid in question. 4047 * @policy: new policy. 4048 * @param: structure containing the new RT priority. 4049 * 4050 * Return: 0 on success. An error code otherwise. 4051 */ 4052 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 4053 struct sched_param __user *, param) 4054 { 4055 /* negative values for policy are not valid */ 4056 if (policy < 0) 4057 return -EINVAL; 4058 4059 return do_sched_setscheduler(pid, policy, param); 4060 } 4061 4062 /** 4063 * sys_sched_setparam - set/change the RT priority of a thread 4064 * @pid: the pid in question. 4065 * @param: structure containing the new RT priority. 4066 * 4067 * Return: 0 on success. An error code otherwise. 4068 */ 4069 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 4070 { 4071 return do_sched_setscheduler(pid, SETPARAM_POLICY, param); 4072 } 4073 4074 /** 4075 * sys_sched_setattr - same as above, but with extended sched_attr 4076 * @pid: the pid in question. 4077 * @uattr: structure containing the extended parameters. 4078 * @flags: for future extension. 4079 */ 4080 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, 4081 unsigned int, flags) 4082 { 4083 struct sched_attr attr; 4084 struct task_struct *p; 4085 int retval; 4086 4087 if (!uattr || pid < 0 || flags) 4088 return -EINVAL; 4089 4090 retval = sched_copy_attr(uattr, &attr); 4091 if (retval) 4092 return retval; 4093 4094 if ((int)attr.sched_policy < 0) 4095 return -EINVAL; 4096 4097 rcu_read_lock(); 4098 retval = -ESRCH; 4099 p = find_process_by_pid(pid); 4100 if (p != NULL) 4101 retval = sched_setattr(p, &attr); 4102 rcu_read_unlock(); 4103 4104 return retval; 4105 } 4106 4107 /** 4108 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4109 * @pid: the pid in question. 4110 * 4111 * Return: On success, the policy of the thread. Otherwise, a negative error 4112 * code. 
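 *
 * Note that SCHED_RESET_ON_FORK is ORed into the returned value when the
 * flag is set on the thread, so callers interested only in the bare policy
 * are expected to mask it off; an illustrative userspace sketch:
 *
 *	int ret = sched_getscheduler(pid);
 *	int policy = (ret < 0) ? ret : (ret & ~SCHED_RESET_ON_FORK);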
4113 */ 4114 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 4115 { 4116 struct task_struct *p; 4117 int retval; 4118 4119 if (pid < 0) 4120 return -EINVAL; 4121 4122 retval = -ESRCH; 4123 rcu_read_lock(); 4124 p = find_process_by_pid(pid); 4125 if (p) { 4126 retval = security_task_getscheduler(p); 4127 if (!retval) 4128 retval = p->policy 4129 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4130 } 4131 rcu_read_unlock(); 4132 return retval; 4133 } 4134 4135 /** 4136 * sys_sched_getparam - get the RT priority of a thread 4137 * @pid: the pid in question. 4138 * @param: structure containing the RT priority. 4139 * 4140 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error 4141 * code. 4142 */ 4143 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 4144 { 4145 struct sched_param lp = { .sched_priority = 0 }; 4146 struct task_struct *p; 4147 int retval; 4148 4149 if (!param || pid < 0) 4150 return -EINVAL; 4151 4152 rcu_read_lock(); 4153 p = find_process_by_pid(pid); 4154 retval = -ESRCH; 4155 if (!p) 4156 goto out_unlock; 4157 4158 retval = security_task_getscheduler(p); 4159 if (retval) 4160 goto out_unlock; 4161 4162 if (task_has_rt_policy(p)) 4163 lp.sched_priority = p->rt_priority; 4164 rcu_read_unlock(); 4165 4166 /* 4167 * This one might sleep, we cannot do it with a spinlock held ... 4168 */ 4169 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4170 4171 return retval; 4172 4173 out_unlock: 4174 rcu_read_unlock(); 4175 return retval; 4176 } 4177 4178 static int sched_read_attr(struct sched_attr __user *uattr, 4179 struct sched_attr *attr, 4180 unsigned int usize) 4181 { 4182 int ret; 4183 4184 if (!access_ok(VERIFY_WRITE, uattr, usize)) 4185 return -EFAULT; 4186 4187 /* 4188 * If we're handed a smaller struct than we know of, 4189 * ensure all the unknown bits are 0 - i.e. old 4190 * user-space does not get uncomplete information. 4191 */ 4192 if (usize < sizeof(*attr)) { 4193 unsigned char *addr; 4194 unsigned char *end; 4195 4196 addr = (void *)attr + usize; 4197 end = (void *)attr + sizeof(*attr); 4198 4199 for (; addr < end; addr++) { 4200 if (*addr) 4201 return -EFBIG; 4202 } 4203 4204 attr->size = usize; 4205 } 4206 4207 ret = copy_to_user(uattr, attr, attr->size); 4208 if (ret) 4209 return -EFAULT; 4210 4211 return 0; 4212 } 4213 4214 /** 4215 * sys_sched_getattr - similar to sched_getparam, but with sched_attr 4216 * @pid: the pid in question. 4217 * @uattr: structure containing the extended parameters. 4218 * @size: sizeof(attr) for fwd/bwd comp. 4219 * @flags: for future extension. 
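 *
 * Return: 0 on success. An error code otherwise.
 *
 * If the C library provides no wrapper, user-space typically goes through
 * syscall(2); an illustrative sketch (assuming struct sched_attr and
 * __NR_sched_getattr come from the kernel uapi headers, and noting that
 * @flags must be 0):
 *
 *	struct sched_attr attr;
 *
 *	if (syscall(__NR_sched_getattr, pid, &attr, sizeof(attr), 0) == -1)
 *		perror("sched_getattr");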
4220 */ 4221 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 4222 unsigned int, size, unsigned int, flags) 4223 { 4224 struct sched_attr attr = { 4225 .size = sizeof(struct sched_attr), 4226 }; 4227 struct task_struct *p; 4228 int retval; 4229 4230 if (!uattr || pid < 0 || size > PAGE_SIZE || 4231 size < SCHED_ATTR_SIZE_VER0 || flags) 4232 return -EINVAL; 4233 4234 rcu_read_lock(); 4235 p = find_process_by_pid(pid); 4236 retval = -ESRCH; 4237 if (!p) 4238 goto out_unlock; 4239 4240 retval = security_task_getscheduler(p); 4241 if (retval) 4242 goto out_unlock; 4243 4244 attr.sched_policy = p->policy; 4245 if (p->sched_reset_on_fork) 4246 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 4247 if (task_has_dl_policy(p)) 4248 __getparam_dl(p, &attr); 4249 else if (task_has_rt_policy(p)) 4250 attr.sched_priority = p->rt_priority; 4251 else 4252 attr.sched_nice = task_nice(p); 4253 4254 rcu_read_unlock(); 4255 4256 retval = sched_read_attr(uattr, &attr, size); 4257 return retval; 4258 4259 out_unlock: 4260 rcu_read_unlock(); 4261 return retval; 4262 } 4263 4264 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 4265 { 4266 cpumask_var_t cpus_allowed, new_mask; 4267 struct task_struct *p; 4268 int retval; 4269 4270 rcu_read_lock(); 4271 4272 p = find_process_by_pid(pid); 4273 if (!p) { 4274 rcu_read_unlock(); 4275 return -ESRCH; 4276 } 4277 4278 /* Prevent p going away */ 4279 get_task_struct(p); 4280 rcu_read_unlock(); 4281 4282 if (p->flags & PF_NO_SETAFFINITY) { 4283 retval = -EINVAL; 4284 goto out_put_task; 4285 } 4286 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4287 retval = -ENOMEM; 4288 goto out_put_task; 4289 } 4290 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 4291 retval = -ENOMEM; 4292 goto out_free_cpus_allowed; 4293 } 4294 retval = -EPERM; 4295 if (!check_same_owner(p)) { 4296 rcu_read_lock(); 4297 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 4298 rcu_read_unlock(); 4299 goto out_free_new_mask; 4300 } 4301 rcu_read_unlock(); 4302 } 4303 4304 retval = security_task_setscheduler(p); 4305 if (retval) 4306 goto out_free_new_mask; 4307 4308 4309 cpuset_cpus_allowed(p, cpus_allowed); 4310 cpumask_and(new_mask, in_mask, cpus_allowed); 4311 4312 /* 4313 * Since bandwidth control happens on root_domain basis, 4314 * if admission test is enabled, we only admit -deadline 4315 * tasks allowed to run on all the CPUs in the task's 4316 * root_domain. 4317 */ 4318 #ifdef CONFIG_SMP 4319 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { 4320 rcu_read_lock(); 4321 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { 4322 retval = -EBUSY; 4323 rcu_read_unlock(); 4324 goto out_free_new_mask; 4325 } 4326 rcu_read_unlock(); 4327 } 4328 #endif 4329 again: 4330 retval = set_cpus_allowed_ptr(p, new_mask); 4331 4332 if (!retval) { 4333 cpuset_cpus_allowed(p, cpus_allowed); 4334 if (!cpumask_subset(new_mask, cpus_allowed)) { 4335 /* 4336 * We must have raced with a concurrent cpuset 4337 * update. 
Just reset the cpus_allowed to the 4338 * cpuset's cpus_allowed 4339 */ 4340 cpumask_copy(new_mask, cpus_allowed); 4341 goto again; 4342 } 4343 } 4344 out_free_new_mask: 4345 free_cpumask_var(new_mask); 4346 out_free_cpus_allowed: 4347 free_cpumask_var(cpus_allowed); 4348 out_put_task: 4349 put_task_struct(p); 4350 return retval; 4351 } 4352 4353 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4354 struct cpumask *new_mask) 4355 { 4356 if (len < cpumask_size()) 4357 cpumask_clear(new_mask); 4358 else if (len > cpumask_size()) 4359 len = cpumask_size(); 4360 4361 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4362 } 4363 4364 /** 4365 * sys_sched_setaffinity - set the cpu affinity of a process 4366 * @pid: pid of the process 4367 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4368 * @user_mask_ptr: user-space pointer to the new cpu mask 4369 * 4370 * Return: 0 on success. An error code otherwise. 4371 */ 4372 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 4373 unsigned long __user *, user_mask_ptr) 4374 { 4375 cpumask_var_t new_mask; 4376 int retval; 4377 4378 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4379 return -ENOMEM; 4380 4381 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 4382 if (retval == 0) 4383 retval = sched_setaffinity(pid, new_mask); 4384 free_cpumask_var(new_mask); 4385 return retval; 4386 } 4387 4388 long sched_getaffinity(pid_t pid, struct cpumask *mask) 4389 { 4390 struct task_struct *p; 4391 unsigned long flags; 4392 int retval; 4393 4394 rcu_read_lock(); 4395 4396 retval = -ESRCH; 4397 p = find_process_by_pid(pid); 4398 if (!p) 4399 goto out_unlock; 4400 4401 retval = security_task_getscheduler(p); 4402 if (retval) 4403 goto out_unlock; 4404 4405 raw_spin_lock_irqsave(&p->pi_lock, flags); 4406 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); 4407 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4408 4409 out_unlock: 4410 rcu_read_unlock(); 4411 4412 return retval; 4413 } 4414 4415 /** 4416 * sys_sched_getaffinity - get the cpu affinity of a process 4417 * @pid: pid of the process 4418 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4419 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4420 * 4421 * Return: 0 on success. An error code otherwise. 4422 */ 4423 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4424 unsigned long __user *, user_mask_ptr) 4425 { 4426 int ret; 4427 cpumask_var_t mask; 4428 4429 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 4430 return -EINVAL; 4431 if (len & (sizeof(unsigned long)-1)) 4432 return -EINVAL; 4433 4434 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4435 return -ENOMEM; 4436 4437 ret = sched_getaffinity(pid, mask); 4438 if (ret == 0) { 4439 size_t retlen = min_t(size_t, len, cpumask_size()); 4440 4441 if (copy_to_user(user_mask_ptr, mask, retlen)) 4442 ret = -EFAULT; 4443 else 4444 ret = retlen; 4445 } 4446 free_cpumask_var(mask); 4447 4448 return ret; 4449 } 4450 4451 /** 4452 * sys_sched_yield - yield the current processor to other threads. 4453 * 4454 * This function yields the current CPU to other tasks. If there are no 4455 * other threads running on this CPU then this function will return. 4456 * 4457 * Return: 0. 
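 *
 * For SCHED_FIFO/SCHED_RR tasks this moves the caller to the end of the
 * run queue for its priority; for SCHED_NORMAL tasks it is merely a hint
 * to the fair scheduler and is rarely what you want (see the comment at
 * yield() below).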
4458 */ 4459 SYSCALL_DEFINE0(sched_yield) 4460 { 4461 struct rq *rq = this_rq_lock(); 4462 4463 schedstat_inc(rq, yld_count); 4464 current->sched_class->yield_task(rq); 4465 4466 /* 4467 * Since we are going to call schedule() anyway, there's 4468 * no need to preempt or enable interrupts: 4469 */ 4470 __release(rq->lock); 4471 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4472 do_raw_spin_unlock(&rq->lock); 4473 sched_preempt_enable_no_resched(); 4474 4475 schedule(); 4476 4477 return 0; 4478 } 4479 4480 int __sched _cond_resched(void) 4481 { 4482 if (should_resched()) { 4483 preempt_schedule_common(); 4484 return 1; 4485 } 4486 return 0; 4487 } 4488 EXPORT_SYMBOL(_cond_resched); 4489 4490 /* 4491 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4492 * call schedule, and on return reacquire the lock. 4493 * 4494 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4495 * operations here to prevent schedule() from being called twice (once via 4496 * spin_unlock(), once by hand). 4497 */ 4498 int __cond_resched_lock(spinlock_t *lock) 4499 { 4500 int resched = should_resched(); 4501 int ret = 0; 4502 4503 lockdep_assert_held(lock); 4504 4505 if (spin_needbreak(lock) || resched) { 4506 spin_unlock(lock); 4507 if (resched) 4508 preempt_schedule_common(); 4509 else 4510 cpu_relax(); 4511 ret = 1; 4512 spin_lock(lock); 4513 } 4514 return ret; 4515 } 4516 EXPORT_SYMBOL(__cond_resched_lock); 4517 4518 int __sched __cond_resched_softirq(void) 4519 { 4520 BUG_ON(!in_softirq()); 4521 4522 if (should_resched()) { 4523 local_bh_enable(); 4524 preempt_schedule_common(); 4525 local_bh_disable(); 4526 return 1; 4527 } 4528 return 0; 4529 } 4530 EXPORT_SYMBOL(__cond_resched_softirq); 4531 4532 /** 4533 * yield - yield the current processor to other threads. 4534 * 4535 * Do not ever use this function, there's a 99% chance you're doing it wrong. 4536 * 4537 * The scheduler is at all times free to pick the calling task as the most 4538 * eligible task to run, if removing the yield() call from your code breaks 4539 * it, its already broken. 4540 * 4541 * Typical broken usage is: 4542 * 4543 * while (!event) 4544 * yield(); 4545 * 4546 * where one assumes that yield() will let 'the other' process run that will 4547 * make event true. If the current task is a SCHED_FIFO task that will never 4548 * happen. Never use yield() as a progress guarantee!! 4549 * 4550 * If you want to use yield() to wait for something, use wait_event(). 4551 * If you want to use yield() to be 'nice' for others, use cond_resched(). 4552 * If you still want to use yield(), do not! 4553 */ 4554 void __sched yield(void) 4555 { 4556 set_current_state(TASK_RUNNING); 4557 sys_sched_yield(); 4558 } 4559 EXPORT_SYMBOL(yield); 4560 4561 /** 4562 * yield_to - yield the current processor to another thread in 4563 * your thread group, or accelerate that thread toward the 4564 * processor it's on. 4565 * @p: target task 4566 * @preempt: whether task preemption is allowed or not 4567 * 4568 * It's the caller's job to ensure that the target task struct 4569 * can't go away on us before we can do any checks. 4570 * 4571 * Return: 4572 * true (>0) if we indeed boosted the target task. 4573 * false (0) if we failed to boost the target. 4574 * -ESRCH if there's no task to yield to. 
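 *
 * A rough usage sketch (illustrative only, not lifted from a real caller),
 * where pid is a struct pid reference the caller already holds:
 *
 *	struct task_struct *task;
 *
 *	rcu_read_lock();
 *	task = pid_task(pid, PIDTYPE_PID);
 *	if (task)
 *		get_task_struct(task);
 *	rcu_read_unlock();
 *
 *	if (task) {
 *		yield_to(task, true);
 *		put_task_struct(task);
 *	}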
4575 */ 4576 int __sched yield_to(struct task_struct *p, bool preempt) 4577 { 4578 struct task_struct *curr = current; 4579 struct rq *rq, *p_rq; 4580 unsigned long flags; 4581 int yielded = 0; 4582 4583 local_irq_save(flags); 4584 rq = this_rq(); 4585 4586 again: 4587 p_rq = task_rq(p); 4588 /* 4589 * If we're the only runnable task on the rq and target rq also 4590 * has only one task, there's absolutely no point in yielding. 4591 */ 4592 if (rq->nr_running == 1 && p_rq->nr_running == 1) { 4593 yielded = -ESRCH; 4594 goto out_irq; 4595 } 4596 4597 double_rq_lock(rq, p_rq); 4598 if (task_rq(p) != p_rq) { 4599 double_rq_unlock(rq, p_rq); 4600 goto again; 4601 } 4602 4603 if (!curr->sched_class->yield_to_task) 4604 goto out_unlock; 4605 4606 if (curr->sched_class != p->sched_class) 4607 goto out_unlock; 4608 4609 if (task_running(p_rq, p) || p->state) 4610 goto out_unlock; 4611 4612 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4613 if (yielded) { 4614 schedstat_inc(rq, yld_count); 4615 /* 4616 * Make p's CPU reschedule; pick_next_entity takes care of 4617 * fairness. 4618 */ 4619 if (preempt && rq != p_rq) 4620 resched_curr(p_rq); 4621 } 4622 4623 out_unlock: 4624 double_rq_unlock(rq, p_rq); 4625 out_irq: 4626 local_irq_restore(flags); 4627 4628 if (yielded > 0) 4629 schedule(); 4630 4631 return yielded; 4632 } 4633 EXPORT_SYMBOL_GPL(yield_to); 4634 4635 /* 4636 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4637 * that process accounting knows that this is a task in IO wait state. 4638 */ 4639 long __sched io_schedule_timeout(long timeout) 4640 { 4641 int old_iowait = current->in_iowait; 4642 struct rq *rq; 4643 long ret; 4644 4645 current->in_iowait = 1; 4646 blk_schedule_flush_plug(current); 4647 4648 delayacct_blkio_start(); 4649 rq = raw_rq(); 4650 atomic_inc(&rq->nr_iowait); 4651 ret = schedule_timeout(timeout); 4652 current->in_iowait = old_iowait; 4653 atomic_dec(&rq->nr_iowait); 4654 delayacct_blkio_end(); 4655 4656 return ret; 4657 } 4658 EXPORT_SYMBOL(io_schedule_timeout); 4659 4660 /** 4661 * sys_sched_get_priority_max - return maximum RT priority. 4662 * @policy: scheduling class. 4663 * 4664 * Return: On success, this syscall returns the maximum 4665 * rt_priority that can be used by a given scheduling class. 4666 * On failure, a negative error code is returned. 4667 */ 4668 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4669 { 4670 int ret = -EINVAL; 4671 4672 switch (policy) { 4673 case SCHED_FIFO: 4674 case SCHED_RR: 4675 ret = MAX_USER_RT_PRIO-1; 4676 break; 4677 case SCHED_DEADLINE: 4678 case SCHED_NORMAL: 4679 case SCHED_BATCH: 4680 case SCHED_IDLE: 4681 ret = 0; 4682 break; 4683 } 4684 return ret; 4685 } 4686 4687 /** 4688 * sys_sched_get_priority_min - return minimum RT priority. 4689 * @policy: scheduling class. 4690 * 4691 * Return: On success, this syscall returns the minimum 4692 * rt_priority that can be used by a given scheduling class. 4693 * On failure, a negative error code is returned. 4694 */ 4695 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4696 { 4697 int ret = -EINVAL; 4698 4699 switch (policy) { 4700 case SCHED_FIFO: 4701 case SCHED_RR: 4702 ret = 1; 4703 break; 4704 case SCHED_DEADLINE: 4705 case SCHED_NORMAL: 4706 case SCHED_BATCH: 4707 case SCHED_IDLE: 4708 ret = 0; 4709 } 4710 return ret; 4711 } 4712 4713 /** 4714 * sys_sched_rr_get_interval - return the default timeslice of a process. 4715 * @pid: pid of the process. 4716 * @interval: userspace pointer to the timeslice value. 
4717 * 4718 * this syscall writes the default timeslice value of a given process 4719 * into the user-space timespec buffer. A value of '0' means infinity. 4720 * 4721 * Return: On success, 0 and the timeslice is in @interval. Otherwise, 4722 * an error code. 4723 */ 4724 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4725 struct timespec __user *, interval) 4726 { 4727 struct task_struct *p; 4728 unsigned int time_slice; 4729 unsigned long flags; 4730 struct rq *rq; 4731 int retval; 4732 struct timespec t; 4733 4734 if (pid < 0) 4735 return -EINVAL; 4736 4737 retval = -ESRCH; 4738 rcu_read_lock(); 4739 p = find_process_by_pid(pid); 4740 if (!p) 4741 goto out_unlock; 4742 4743 retval = security_task_getscheduler(p); 4744 if (retval) 4745 goto out_unlock; 4746 4747 rq = task_rq_lock(p, &flags); 4748 time_slice = 0; 4749 if (p->sched_class->get_rr_interval) 4750 time_slice = p->sched_class->get_rr_interval(rq, p); 4751 task_rq_unlock(rq, p, &flags); 4752 4753 rcu_read_unlock(); 4754 jiffies_to_timespec(time_slice, &t); 4755 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4756 return retval; 4757 4758 out_unlock: 4759 rcu_read_unlock(); 4760 return retval; 4761 } 4762 4763 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 4764 4765 void sched_show_task(struct task_struct *p) 4766 { 4767 unsigned long free = 0; 4768 int ppid; 4769 unsigned long state = p->state; 4770 4771 if (state) 4772 state = __ffs(state) + 1; 4773 printk(KERN_INFO "%-15.15s %c", p->comm, 4774 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4775 #if BITS_PER_LONG == 32 4776 if (state == TASK_RUNNING) 4777 printk(KERN_CONT " running "); 4778 else 4779 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 4780 #else 4781 if (state == TASK_RUNNING) 4782 printk(KERN_CONT " running task "); 4783 else 4784 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 4785 #endif 4786 #ifdef CONFIG_DEBUG_STACK_USAGE 4787 free = stack_not_used(p); 4788 #endif 4789 ppid = 0; 4790 rcu_read_lock(); 4791 if (pid_alive(p)) 4792 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 4793 rcu_read_unlock(); 4794 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4795 task_pid_nr(p), ppid, 4796 (unsigned long)task_thread_info(p)->flags); 4797 4798 print_worker_info(KERN_INFO, p); 4799 show_stack(p, NULL); 4800 } 4801 4802 void show_state_filter(unsigned long state_filter) 4803 { 4804 struct task_struct *g, *p; 4805 4806 #if BITS_PER_LONG == 32 4807 printk(KERN_INFO 4808 " task PC stack pid father\n"); 4809 #else 4810 printk(KERN_INFO 4811 " task PC stack pid father\n"); 4812 #endif 4813 rcu_read_lock(); 4814 for_each_process_thread(g, p) { 4815 /* 4816 * reset the NMI-timeout, listing all files on a slow 4817 * console might take a lot of time: 4818 */ 4819 touch_nmi_watchdog(); 4820 if (!state_filter || (p->state & state_filter)) 4821 sched_show_task(p); 4822 } 4823 4824 touch_all_softlockup_watchdogs(); 4825 4826 #ifdef CONFIG_SCHED_DEBUG 4827 sysrq_sched_debug_show(); 4828 #endif 4829 rcu_read_unlock(); 4830 /* 4831 * Only show locks if all tasks are dumped: 4832 */ 4833 if (!state_filter) 4834 debug_show_all_locks(); 4835 } 4836 4837 void init_idle_bootup_task(struct task_struct *idle) 4838 { 4839 idle->sched_class = &idle_sched_class; 4840 } 4841 4842 /** 4843 * init_idle - set up an idle thread for a given CPU 4844 * @idle: task in question 4845 * @cpu: cpu the idle task belongs to 4846 * 4847 * NOTE: this function does not set the idle thread's NEED_RESCHED 4848 * flag, to make booting more robust. 
4849 */ 4850 void init_idle(struct task_struct *idle, int cpu) 4851 { 4852 struct rq *rq = cpu_rq(cpu); 4853 unsigned long flags; 4854 4855 raw_spin_lock_irqsave(&rq->lock, flags); 4856 4857 __sched_fork(0, idle); 4858 idle->state = TASK_RUNNING; 4859 idle->se.exec_start = sched_clock(); 4860 4861 do_set_cpus_allowed(idle, cpumask_of(cpu)); 4862 /* 4863 * We're having a chicken and egg problem, even though we are 4864 * holding rq->lock, the cpu isn't yet set to this cpu so the 4865 * lockdep check in task_group() will fail. 4866 * 4867 * Similar case to sched_fork(). / Alternatively we could 4868 * use task_rq_lock() here and obtain the other rq->lock. 4869 * 4870 * Silence PROVE_RCU 4871 */ 4872 rcu_read_lock(); 4873 __set_task_cpu(idle, cpu); 4874 rcu_read_unlock(); 4875 4876 rq->curr = rq->idle = idle; 4877 idle->on_rq = TASK_ON_RQ_QUEUED; 4878 #if defined(CONFIG_SMP) 4879 idle->on_cpu = 1; 4880 #endif 4881 raw_spin_unlock_irqrestore(&rq->lock, flags); 4882 4883 /* Set the preempt count _outside_ the spinlocks! */ 4884 init_idle_preempt_count(idle, cpu); 4885 4886 /* 4887 * The idle tasks have their own, simple scheduling class: 4888 */ 4889 idle->sched_class = &idle_sched_class; 4890 ftrace_graph_init_idle_task(idle, cpu); 4891 vtime_init_idle(idle, cpu); 4892 #if defined(CONFIG_SMP) 4893 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4894 #endif 4895 } 4896 4897 int cpuset_cpumask_can_shrink(const struct cpumask *cur, 4898 const struct cpumask *trial) 4899 { 4900 int ret = 1, trial_cpus; 4901 struct dl_bw *cur_dl_b; 4902 unsigned long flags; 4903 4904 if (!cpumask_weight(cur)) 4905 return ret; 4906 4907 rcu_read_lock_sched(); 4908 cur_dl_b = dl_bw_of(cpumask_any(cur)); 4909 trial_cpus = cpumask_weight(trial); 4910 4911 raw_spin_lock_irqsave(&cur_dl_b->lock, flags); 4912 if (cur_dl_b->bw != -1 && 4913 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) 4914 ret = 0; 4915 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); 4916 rcu_read_unlock_sched(); 4917 4918 return ret; 4919 } 4920 4921 int task_can_attach(struct task_struct *p, 4922 const struct cpumask *cs_cpus_allowed) 4923 { 4924 int ret = 0; 4925 4926 /* 4927 * Kthreads which disallow setaffinity shouldn't be moved 4928 * to a new cpuset; we don't want to change their cpu 4929 * affinity and isolating such threads by their set of 4930 * allowed nodes is unnecessary. Thus, cpusets are not 4931 * applicable for such threads. This prevents checking for 4932 * success of set_cpus_allowed_ptr() on all attached tasks 4933 * before cpus_allowed may be changed. 4934 */ 4935 if (p->flags & PF_NO_SETAFFINITY) { 4936 ret = -EINVAL; 4937 goto out; 4938 } 4939 4940 #ifdef CONFIG_SMP 4941 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, 4942 cs_cpus_allowed)) { 4943 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, 4944 cs_cpus_allowed); 4945 struct dl_bw *dl_b; 4946 bool overflow; 4947 int cpus; 4948 unsigned long flags; 4949 4950 rcu_read_lock_sched(); 4951 dl_b = dl_bw_of(dest_cpu); 4952 raw_spin_lock_irqsave(&dl_b->lock, flags); 4953 cpus = dl_bw_cpus(dest_cpu); 4954 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); 4955 if (overflow) 4956 ret = -EBUSY; 4957 else { 4958 /* 4959 * We reserve space for this task in the destination 4960 * root_domain, as we can't fail after this point. 4961 * We will free resources in the source root_domain 4962 * later on (see set_cpus_allowed_dl()). 
4963 */ 4964 __dl_add(dl_b, p->dl.dl_bw); 4965 } 4966 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 4967 rcu_read_unlock_sched(); 4968 4969 } 4970 #endif 4971 out: 4972 return ret; 4973 } 4974 4975 #ifdef CONFIG_SMP 4976 4977 #ifdef CONFIG_NUMA_BALANCING 4978 /* Migrate current task p to target_cpu */ 4979 int migrate_task_to(struct task_struct *p, int target_cpu) 4980 { 4981 struct migration_arg arg = { p, target_cpu }; 4982 int curr_cpu = task_cpu(p); 4983 4984 if (curr_cpu == target_cpu) 4985 return 0; 4986 4987 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) 4988 return -EINVAL; 4989 4990 /* TODO: This is not properly updating schedstats */ 4991 4992 trace_sched_move_numa(p, curr_cpu, target_cpu); 4993 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 4994 } 4995 4996 /* 4997 * Requeue a task on a given node and accurately track the number of NUMA 4998 * tasks on the runqueues 4999 */ 5000 void sched_setnuma(struct task_struct *p, int nid) 5001 { 5002 struct rq *rq; 5003 unsigned long flags; 5004 bool queued, running; 5005 5006 rq = task_rq_lock(p, &flags); 5007 queued = task_on_rq_queued(p); 5008 running = task_current(rq, p); 5009 5010 if (queued) 5011 dequeue_task(rq, p, 0); 5012 if (running) 5013 put_prev_task(rq, p); 5014 5015 p->numa_preferred_nid = nid; 5016 5017 if (running) 5018 p->sched_class->set_curr_task(rq); 5019 if (queued) 5020 enqueue_task(rq, p, 0); 5021 task_rq_unlock(rq, p, &flags); 5022 } 5023 #endif /* CONFIG_NUMA_BALANCING */ 5024 5025 #ifdef CONFIG_HOTPLUG_CPU 5026 /* 5027 * Ensures that the idle task is using init_mm right before its cpu goes 5028 * offline. 5029 */ 5030 void idle_task_exit(void) 5031 { 5032 struct mm_struct *mm = current->active_mm; 5033 5034 BUG_ON(cpu_online(smp_processor_id())); 5035 5036 if (mm != &init_mm) { 5037 switch_mm(mm, &init_mm, current); 5038 finish_arch_post_lock_switch(); 5039 } 5040 mmdrop(mm); 5041 } 5042 5043 /* 5044 * Since this CPU is going 'away' for a while, fold any nr_active delta 5045 * we might have. Assumes we're called after migrate_tasks() so that the 5046 * nr_active count is stable. 5047 * 5048 * Also see the comment "Global load-average calculations". 5049 */ 5050 static void calc_load_migrate(struct rq *rq) 5051 { 5052 long delta = calc_load_fold_active(rq); 5053 if (delta) 5054 atomic_long_add(delta, &calc_load_tasks); 5055 } 5056 5057 static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) 5058 { 5059 } 5060 5061 static const struct sched_class fake_sched_class = { 5062 .put_prev_task = put_prev_task_fake, 5063 }; 5064 5065 static struct task_struct fake_task = { 5066 /* 5067 * Avoid pull_{rt,dl}_task() 5068 */ 5069 .prio = MAX_PRIO + 1, 5070 .sched_class = &fake_sched_class, 5071 }; 5072 5073 /* 5074 * Migrate all tasks from the rq, sleeping tasks will be migrated by 5075 * try_to_wake_up()->select_task_rq(). 5076 * 5077 * Called with rq->lock held even though we'er in stop_machine() and 5078 * there's no concurrency possible, we hold the required locks anyway 5079 * because of lock validation efforts. 5080 */ 5081 static void migrate_tasks(struct rq *dead_rq) 5082 { 5083 struct rq *rq = dead_rq; 5084 struct task_struct *next, *stop = rq->stop; 5085 int dest_cpu; 5086 5087 /* 5088 * Fudge the rq selection such that the below task selection loop 5089 * doesn't get stuck on the currently eligible stop task. 
5090 * 5091 * We're currently inside stop_machine() and the rq is either stuck 5092 * in the stop_machine_cpu_stop() loop, or we're executing this code, 5093 * either way we should never end up calling schedule() until we're 5094 * done here. 5095 */ 5096 rq->stop = NULL; 5097 5098 /* 5099 * put_prev_task() and pick_next_task() sched 5100 * class method both need to have an up-to-date 5101 * value of rq->clock[_task] 5102 */ 5103 update_rq_clock(rq); 5104 5105 for (;;) { 5106 /* 5107 * There's this thread running, bail when that's the only 5108 * remaining thread. 5109 */ 5110 if (rq->nr_running == 1) 5111 break; 5112 5113 /* 5114 * Ensure rq->lock covers the entire task selection 5115 * until the migration. 5116 */ 5117 lockdep_pin_lock(&rq->lock); 5118 next = pick_next_task(rq, &fake_task); 5119 BUG_ON(!next); 5120 next->sched_class->put_prev_task(rq, next); 5121 5122 /* Find suitable destination for @next, with force if needed. */ 5123 dest_cpu = select_fallback_rq(dead_rq->cpu, next); 5124 5125 lockdep_unpin_lock(&rq->lock); 5126 rq = __migrate_task(rq, next, dest_cpu); 5127 if (rq != dead_rq) { 5128 raw_spin_unlock(&rq->lock); 5129 rq = dead_rq; 5130 raw_spin_lock(&rq->lock); 5131 } 5132 } 5133 5134 rq->stop = stop; 5135 } 5136 #endif /* CONFIG_HOTPLUG_CPU */ 5137 5138 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5139 5140 static struct ctl_table sd_ctl_dir[] = { 5141 { 5142 .procname = "sched_domain", 5143 .mode = 0555, 5144 }, 5145 {} 5146 }; 5147 5148 static struct ctl_table sd_ctl_root[] = { 5149 { 5150 .procname = "kernel", 5151 .mode = 0555, 5152 .child = sd_ctl_dir, 5153 }, 5154 {} 5155 }; 5156 5157 static struct ctl_table *sd_alloc_ctl_entry(int n) 5158 { 5159 struct ctl_table *entry = 5160 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 5161 5162 return entry; 5163 } 5164 5165 static void sd_free_ctl_entry(struct ctl_table **tablep) 5166 { 5167 struct ctl_table *entry; 5168 5169 /* 5170 * In the intermediate directories, both the child directory and 5171 * procname are dynamically allocated and could fail but the mode 5172 * will always be set. In the lowest directory the names are 5173 * static strings and all have proc handlers. 
5174 */ 5175 for (entry = *tablep; entry->mode; entry++) { 5176 if (entry->child) 5177 sd_free_ctl_entry(&entry->child); 5178 if (entry->proc_handler == NULL) 5179 kfree(entry->procname); 5180 } 5181 5182 kfree(*tablep); 5183 *tablep = NULL; 5184 } 5185 5186 static int min_load_idx = 0; 5187 static int max_load_idx = CPU_LOAD_IDX_MAX-1; 5188 5189 static void 5190 set_table_entry(struct ctl_table *entry, 5191 const char *procname, void *data, int maxlen, 5192 umode_t mode, proc_handler *proc_handler, 5193 bool load_idx) 5194 { 5195 entry->procname = procname; 5196 entry->data = data; 5197 entry->maxlen = maxlen; 5198 entry->mode = mode; 5199 entry->proc_handler = proc_handler; 5200 5201 if (load_idx) { 5202 entry->extra1 = &min_load_idx; 5203 entry->extra2 = &max_load_idx; 5204 } 5205 } 5206 5207 static struct ctl_table * 5208 sd_alloc_ctl_domain_table(struct sched_domain *sd) 5209 { 5210 struct ctl_table *table = sd_alloc_ctl_entry(14); 5211 5212 if (table == NULL) 5213 return NULL; 5214 5215 set_table_entry(&table[0], "min_interval", &sd->min_interval, 5216 sizeof(long), 0644, proc_doulongvec_minmax, false); 5217 set_table_entry(&table[1], "max_interval", &sd->max_interval, 5218 sizeof(long), 0644, proc_doulongvec_minmax, false); 5219 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 5220 sizeof(int), 0644, proc_dointvec_minmax, true); 5221 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 5222 sizeof(int), 0644, proc_dointvec_minmax, true); 5223 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 5224 sizeof(int), 0644, proc_dointvec_minmax, true); 5225 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 5226 sizeof(int), 0644, proc_dointvec_minmax, true); 5227 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 5228 sizeof(int), 0644, proc_dointvec_minmax, true); 5229 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 5230 sizeof(int), 0644, proc_dointvec_minmax, false); 5231 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 5232 sizeof(int), 0644, proc_dointvec_minmax, false); 5233 set_table_entry(&table[9], "cache_nice_tries", 5234 &sd->cache_nice_tries, 5235 sizeof(int), 0644, proc_dointvec_minmax, false); 5236 set_table_entry(&table[10], "flags", &sd->flags, 5237 sizeof(int), 0644, proc_dointvec_minmax, false); 5238 set_table_entry(&table[11], "max_newidle_lb_cost", 5239 &sd->max_newidle_lb_cost, 5240 sizeof(long), 0644, proc_doulongvec_minmax, false); 5241 set_table_entry(&table[12], "name", sd->name, 5242 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 5243 /* &table[13] is terminator */ 5244 5245 return table; 5246 } 5247 5248 static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) 5249 { 5250 struct ctl_table *entry, *table; 5251 struct sched_domain *sd; 5252 int domain_num = 0, i; 5253 char buf[32]; 5254 5255 for_each_domain(cpu, sd) 5256 domain_num++; 5257 entry = table = sd_alloc_ctl_entry(domain_num + 1); 5258 if (table == NULL) 5259 return NULL; 5260 5261 i = 0; 5262 for_each_domain(cpu, sd) { 5263 snprintf(buf, 32, "domain%d", i); 5264 entry->procname = kstrdup(buf, GFP_KERNEL); 5265 entry->mode = 0555; 5266 entry->child = sd_alloc_ctl_domain_table(sd); 5267 entry++; 5268 i++; 5269 } 5270 return table; 5271 } 5272 5273 static struct ctl_table_header *sd_sysctl_header; 5274 static void register_sched_domain_sysctl(void) 5275 { 5276 int i, cpu_num = num_possible_cpus(); 5277 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5278 char buf[32]; 5279 5280 WARN_ON(sd_ctl_dir[0].child); 5281 sd_ctl_dir[0].child = entry; 
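	/*
	 * The per-cpu and per-domain tables filled in below end up under a
	 * sysctl hierarchy like the following (illustrative):
	 *
	 *	/proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
	 *	/proc/sys/kernel/sched_domain/cpu0/domain0/imbalance_pct
	 *	/proc/sys/kernel/sched_domain/cpu1/domain0/...
	 */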
5282 5283 if (entry == NULL) 5284 return; 5285 5286 for_each_possible_cpu(i) { 5287 snprintf(buf, 32, "cpu%d", i); 5288 entry->procname = kstrdup(buf, GFP_KERNEL); 5289 entry->mode = 0555; 5290 entry->child = sd_alloc_ctl_cpu_table(i); 5291 entry++; 5292 } 5293 5294 WARN_ON(sd_sysctl_header); 5295 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5296 } 5297 5298 /* may be called multiple times per register */ 5299 static void unregister_sched_domain_sysctl(void) 5300 { 5301 if (sd_sysctl_header) 5302 unregister_sysctl_table(sd_sysctl_header); 5303 sd_sysctl_header = NULL; 5304 if (sd_ctl_dir[0].child) 5305 sd_free_ctl_entry(&sd_ctl_dir[0].child); 5306 } 5307 #else 5308 static void register_sched_domain_sysctl(void) 5309 { 5310 } 5311 static void unregister_sched_domain_sysctl(void) 5312 { 5313 } 5314 #endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ 5315 5316 static void set_rq_online(struct rq *rq) 5317 { 5318 if (!rq->online) { 5319 const struct sched_class *class; 5320 5321 cpumask_set_cpu(rq->cpu, rq->rd->online); 5322 rq->online = 1; 5323 5324 for_each_class(class) { 5325 if (class->rq_online) 5326 class->rq_online(rq); 5327 } 5328 } 5329 } 5330 5331 static void set_rq_offline(struct rq *rq) 5332 { 5333 if (rq->online) { 5334 const struct sched_class *class; 5335 5336 for_each_class(class) { 5337 if (class->rq_offline) 5338 class->rq_offline(rq); 5339 } 5340 5341 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5342 rq->online = 0; 5343 } 5344 } 5345 5346 /* 5347 * migration_call - callback that gets triggered when a CPU is added. 5348 * Here we can start up the necessary migration thread for the new CPU. 5349 */ 5350 static int 5351 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5352 { 5353 int cpu = (long)hcpu; 5354 unsigned long flags; 5355 struct rq *rq = cpu_rq(cpu); 5356 5357 switch (action & ~CPU_TASKS_FROZEN) { 5358 5359 case CPU_UP_PREPARE: 5360 rq->calc_load_update = calc_load_update; 5361 break; 5362 5363 case CPU_ONLINE: 5364 /* Update our root-domain */ 5365 raw_spin_lock_irqsave(&rq->lock, flags); 5366 if (rq->rd) { 5367 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5368 5369 set_rq_online(rq); 5370 } 5371 raw_spin_unlock_irqrestore(&rq->lock, flags); 5372 break; 5373 5374 #ifdef CONFIG_HOTPLUG_CPU 5375 case CPU_DYING: 5376 sched_ttwu_pending(); 5377 /* Update our root-domain */ 5378 raw_spin_lock_irqsave(&rq->lock, flags); 5379 if (rq->rd) { 5380 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5381 set_rq_offline(rq); 5382 } 5383 migrate_tasks(rq); 5384 BUG_ON(rq->nr_running != 1); /* the migration thread */ 5385 raw_spin_unlock_irqrestore(&rq->lock, flags); 5386 break; 5387 5388 case CPU_DEAD: 5389 calc_load_migrate(rq); 5390 break; 5391 #endif 5392 } 5393 5394 update_max_interval(); 5395 5396 return NOTIFY_OK; 5397 } 5398 5399 /* 5400 * Register at high priority so that task migration (migrate_all_tasks) 5401 * happens before everything else. This has to be lower priority than 5402 * the notifier in the perf_event subsystem, though. 
5403 */ 5404 static struct notifier_block migration_notifier = { 5405 .notifier_call = migration_call, 5406 .priority = CPU_PRI_MIGRATION, 5407 }; 5408 5409 static void set_cpu_rq_start_time(void) 5410 { 5411 int cpu = smp_processor_id(); 5412 struct rq *rq = cpu_rq(cpu); 5413 rq->age_stamp = sched_clock_cpu(cpu); 5414 } 5415 5416 static int sched_cpu_active(struct notifier_block *nfb, 5417 unsigned long action, void *hcpu) 5418 { 5419 switch (action & ~CPU_TASKS_FROZEN) { 5420 case CPU_STARTING: 5421 set_cpu_rq_start_time(); 5422 return NOTIFY_OK; 5423 case CPU_DOWN_FAILED: 5424 set_cpu_active((long)hcpu, true); 5425 return NOTIFY_OK; 5426 default: 5427 return NOTIFY_DONE; 5428 } 5429 } 5430 5431 static int sched_cpu_inactive(struct notifier_block *nfb, 5432 unsigned long action, void *hcpu) 5433 { 5434 switch (action & ~CPU_TASKS_FROZEN) { 5435 case CPU_DOWN_PREPARE: 5436 set_cpu_active((long)hcpu, false); 5437 return NOTIFY_OK; 5438 default: 5439 return NOTIFY_DONE; 5440 } 5441 } 5442 5443 static int __init migration_init(void) 5444 { 5445 void *cpu = (void *)(long)smp_processor_id(); 5446 int err; 5447 5448 /* Initialize migration for the boot CPU */ 5449 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5450 BUG_ON(err == NOTIFY_BAD); 5451 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5452 register_cpu_notifier(&migration_notifier); 5453 5454 /* Register cpu active notifiers */ 5455 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 5456 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 5457 5458 return 0; 5459 } 5460 early_initcall(migration_init); 5461 5462 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 5463 5464 #ifdef CONFIG_SCHED_DEBUG 5465 5466 static __read_mostly int sched_debug_enabled; 5467 5468 static int __init sched_debug_setup(char *str) 5469 { 5470 sched_debug_enabled = 1; 5471 5472 return 0; 5473 } 5474 early_param("sched_debug", sched_debug_setup); 5475 5476 static inline bool sched_debug(void) 5477 { 5478 return sched_debug_enabled; 5479 } 5480 5481 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5482 struct cpumask *groupmask) 5483 { 5484 struct sched_group *group = sd->groups; 5485 5486 cpumask_clear(groupmask); 5487 5488 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 5489 5490 if (!(sd->flags & SD_LOAD_BALANCE)) { 5491 printk("does not load-balance\n"); 5492 if (sd->parent) 5493 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5494 " has parent"); 5495 return -1; 5496 } 5497 5498 printk(KERN_CONT "span %*pbl level %s\n", 5499 cpumask_pr_args(sched_domain_span(sd)), sd->name); 5500 5501 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 5502 printk(KERN_ERR "ERROR: domain->span does not contain " 5503 "CPU%d\n", cpu); 5504 } 5505 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 5506 printk(KERN_ERR "ERROR: domain->groups does not contain" 5507 " CPU%d\n", cpu); 5508 } 5509 5510 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 5511 do { 5512 if (!group) { 5513 printk("\n"); 5514 printk(KERN_ERR "ERROR: group is NULL\n"); 5515 break; 5516 } 5517 5518 if (!cpumask_weight(sched_group_cpus(group))) { 5519 printk(KERN_CONT "\n"); 5520 printk(KERN_ERR "ERROR: empty group\n"); 5521 break; 5522 } 5523 5524 if (!(sd->flags & SD_OVERLAP) && 5525 cpumask_intersects(groupmask, sched_group_cpus(group))) { 5526 printk(KERN_CONT "\n"); 5527 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5528 break; 5529 } 5530 5531 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 5532 5533 
printk(KERN_CONT " %*pbl", 5534 cpumask_pr_args(sched_group_cpus(group))); 5535 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { 5536 printk(KERN_CONT " (cpu_capacity = %d)", 5537 group->sgc->capacity); 5538 } 5539 5540 group = group->next; 5541 } while (group != sd->groups); 5542 printk(KERN_CONT "\n"); 5543 5544 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 5545 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5546 5547 if (sd->parent && 5548 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 5549 printk(KERN_ERR "ERROR: parent span is not a superset " 5550 "of domain->span\n"); 5551 return 0; 5552 } 5553 5554 static void sched_domain_debug(struct sched_domain *sd, int cpu) 5555 { 5556 int level = 0; 5557 5558 if (!sched_debug_enabled) 5559 return; 5560 5561 if (!sd) { 5562 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5563 return; 5564 } 5565 5566 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5567 5568 for (;;) { 5569 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 5570 break; 5571 level++; 5572 sd = sd->parent; 5573 if (!sd) 5574 break; 5575 } 5576 } 5577 #else /* !CONFIG_SCHED_DEBUG */ 5578 # define sched_domain_debug(sd, cpu) do { } while (0) 5579 static inline bool sched_debug(void) 5580 { 5581 return false; 5582 } 5583 #endif /* CONFIG_SCHED_DEBUG */ 5584 5585 static int sd_degenerate(struct sched_domain *sd) 5586 { 5587 if (cpumask_weight(sched_domain_span(sd)) == 1) 5588 return 1; 5589 5590 /* Following flags need at least 2 groups */ 5591 if (sd->flags & (SD_LOAD_BALANCE | 5592 SD_BALANCE_NEWIDLE | 5593 SD_BALANCE_FORK | 5594 SD_BALANCE_EXEC | 5595 SD_SHARE_CPUCAPACITY | 5596 SD_SHARE_PKG_RESOURCES | 5597 SD_SHARE_POWERDOMAIN)) { 5598 if (sd->groups != sd->groups->next) 5599 return 0; 5600 } 5601 5602 /* Following flags don't use groups */ 5603 if (sd->flags & (SD_WAKE_AFFINE)) 5604 return 0; 5605 5606 return 1; 5607 } 5608 5609 static int 5610 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5611 { 5612 unsigned long cflags = sd->flags, pflags = parent->flags; 5613 5614 if (sd_degenerate(parent)) 5615 return 1; 5616 5617 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 5618 return 0; 5619 5620 /* Flags needing groups don't count if only 1 group in parent */ 5621 if (parent->groups == parent->groups->next) { 5622 pflags &= ~(SD_LOAD_BALANCE | 5623 SD_BALANCE_NEWIDLE | 5624 SD_BALANCE_FORK | 5625 SD_BALANCE_EXEC | 5626 SD_SHARE_CPUCAPACITY | 5627 SD_SHARE_PKG_RESOURCES | 5628 SD_PREFER_SIBLING | 5629 SD_SHARE_POWERDOMAIN); 5630 if (nr_node_ids == 1) 5631 pflags &= ~SD_SERIALIZE; 5632 } 5633 if (~cflags & pflags) 5634 return 0; 5635 5636 return 1; 5637 } 5638 5639 static void free_rootdomain(struct rcu_head *rcu) 5640 { 5641 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5642 5643 cpupri_cleanup(&rd->cpupri); 5644 cpudl_cleanup(&rd->cpudl); 5645 free_cpumask_var(rd->dlo_mask); 5646 free_cpumask_var(rd->rto_mask); 5647 free_cpumask_var(rd->online); 5648 free_cpumask_var(rd->span); 5649 kfree(rd); 5650 } 5651 5652 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 5653 { 5654 struct root_domain *old_rd = NULL; 5655 unsigned long flags; 5656 5657 raw_spin_lock_irqsave(&rq->lock, flags); 5658 5659 if (rq->rd) { 5660 old_rd = rq->rd; 5661 5662 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 5663 set_rq_offline(rq); 5664 5665 cpumask_clear_cpu(rq->cpu, old_rd->span); 5666 5667 /* 5668 * If we dont want to free the old_rd yet 
then 5669 * set old_rd to NULL to skip the freeing later 5670 * in this function: 5671 */ 5672 if (!atomic_dec_and_test(&old_rd->refcount)) 5673 old_rd = NULL; 5674 } 5675 5676 atomic_inc(&rd->refcount); 5677 rq->rd = rd; 5678 5679 cpumask_set_cpu(rq->cpu, rd->span); 5680 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 5681 set_rq_online(rq); 5682 5683 raw_spin_unlock_irqrestore(&rq->lock, flags); 5684 5685 if (old_rd) 5686 call_rcu_sched(&old_rd->rcu, free_rootdomain); 5687 } 5688
5689 static int init_rootdomain(struct root_domain *rd) 5690 { 5691 memset(rd, 0, sizeof(*rd)); 5692 5693 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 5694 goto out; 5695 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5696 goto free_span; 5697 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) 5698 goto free_online; 5699 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5700 goto free_dlo_mask; 5701 5702 init_dl_bw(&rd->dl_bw); 5703 if (cpudl_init(&rd->cpudl) != 0) 5704 goto free_dlo_mask; 5705 5706 if (cpupri_init(&rd->cpupri) != 0) 5707 goto free_rto_mask; 5708 return 0; 5709 5710 free_rto_mask: 5711 free_cpumask_var(rd->rto_mask); 5712 free_dlo_mask: 5713 free_cpumask_var(rd->dlo_mask); 5714 free_online: 5715 free_cpumask_var(rd->online); 5716 free_span: 5717 free_cpumask_var(rd->span); 5718 out: 5719 return -ENOMEM; 5720 } 5721
5722 /* 5723 * By default the system creates a single root-domain with all cpus as 5724 * members (mimicking the global state we have today). 5725 */ 5726 struct root_domain def_root_domain; 5727 5728 static void init_defrootdomain(void) 5729 { 5730 init_rootdomain(&def_root_domain); 5731 5732 atomic_set(&def_root_domain.refcount, 1); 5733 } 5734
5735 static struct root_domain *alloc_rootdomain(void) 5736 { 5737 struct root_domain *rd; 5738 5739 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 5740 if (!rd) 5741 return NULL; 5742 5743 if (init_rootdomain(rd) != 0) { 5744 kfree(rd); 5745 return NULL; 5746 } 5747 5748 return rd; 5749 } 5750
5751 static void free_sched_groups(struct sched_group *sg, int free_sgc) 5752 { 5753 struct sched_group *tmp, *first; 5754 5755 if (!sg) 5756 return; 5757 5758 first = sg; 5759 do { 5760 tmp = sg->next; 5761 5762 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) 5763 kfree(sg->sgc); 5764 5765 kfree(sg); 5766 sg = tmp; 5767 } while (sg != first); 5768 } 5769
5770 static void free_sched_domain(struct rcu_head *rcu) 5771 { 5772 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 5773 5774 /* 5775 * If it's an overlapping domain it has private groups; iterate and 5776 * nuke them all. 5777 */ 5778 if (sd->flags & SD_OVERLAP) { 5779 free_sched_groups(sd->groups, 1); 5780 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5781 kfree(sd->groups->sgc); 5782 kfree(sd->groups); 5783 } 5784 kfree(sd); 5785 } 5786
5787 static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5788 { 5789 call_rcu(&sd->rcu, free_sched_domain); 5790 } 5791 5792 static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5793 { 5794 for (; sd; sd = sd->parent) 5795 destroy_sched_domain(sd, cpu); 5796 } 5797
5798 /* 5799 * Keep a special pointer to the highest sched_domain that has 5800 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this 5801 * allows us to avoid some pointer chasing in select_idle_sibling(). 5802 * 5803 * Also keep a unique ID per domain (we use the first cpu number in 5804 * the cpumask of the domain); this allows us to quickly tell if 5805 * two cpus are in the same cache domain, see cpus_share_cache().
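 *
 * With that ID cached, a cache-sharing test reduces to an integer compare;
 * roughly what cpus_share_cache() boils down to:
 *
 *	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);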
5806 */ 5807 DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5808 DEFINE_PER_CPU(int, sd_llc_size); 5809 DEFINE_PER_CPU(int, sd_llc_id); 5810 DEFINE_PER_CPU(struct sched_domain *, sd_numa); 5811 DEFINE_PER_CPU(struct sched_domain *, sd_busy); 5812 DEFINE_PER_CPU(struct sched_domain *, sd_asym); 5813 5814 static void update_top_cache_domain(int cpu) 5815 { 5816 struct sched_domain *sd; 5817 struct sched_domain *busy_sd = NULL; 5818 int id = cpu; 5819 int size = 1; 5820 5821 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5822 if (sd) { 5823 id = cpumask_first(sched_domain_span(sd)); 5824 size = cpumask_weight(sched_domain_span(sd)); 5825 busy_sd = sd->parent; /* sd_busy */ 5826 } 5827 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); 5828 5829 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5830 per_cpu(sd_llc_size, cpu) = size; 5831 per_cpu(sd_llc_id, cpu) = id; 5832 5833 sd = lowest_flag_domain(cpu, SD_NUMA); 5834 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); 5835 5836 sd = highest_flag_domain(cpu, SD_ASYM_PACKING); 5837 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); 5838 } 5839 5840 /* 5841 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5842 * hold the hotplug lock. 5843 */ 5844 static void 5845 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 5846 { 5847 struct rq *rq = cpu_rq(cpu); 5848 struct sched_domain *tmp; 5849 5850 /* Remove the sched domains which do not contribute to scheduling. */ 5851 for (tmp = sd; tmp; ) { 5852 struct sched_domain *parent = tmp->parent; 5853 if (!parent) 5854 break; 5855 5856 if (sd_parent_degenerate(tmp, parent)) { 5857 tmp->parent = parent->parent; 5858 if (parent->parent) 5859 parent->parent->child = tmp; 5860 /* 5861 * Transfer SD_PREFER_SIBLING down in case of a 5862 * degenerate parent; the spans match for this 5863 * so the property transfers. 5864 */ 5865 if (parent->flags & SD_PREFER_SIBLING) 5866 tmp->flags |= SD_PREFER_SIBLING; 5867 destroy_sched_domain(parent, cpu); 5868 } else 5869 tmp = tmp->parent; 5870 } 5871 5872 if (sd && sd_degenerate(sd)) { 5873 tmp = sd; 5874 sd = sd->parent; 5875 destroy_sched_domain(tmp, cpu); 5876 if (sd) 5877 sd->child = NULL; 5878 } 5879 5880 sched_domain_debug(sd, cpu); 5881 5882 rq_attach_root(rq, rd); 5883 tmp = rq->sd; 5884 rcu_assign_pointer(rq->sd, sd); 5885 destroy_sched_domains(tmp, cpu); 5886 5887 update_top_cache_domain(cpu); 5888 } 5889 5890 /* Setup the mask of cpus configured for isolated domains */ 5891 static int __init isolated_cpu_setup(char *str) 5892 { 5893 alloc_bootmem_cpumask_var(&cpu_isolated_map); 5894 cpulist_parse(str, cpu_isolated_map); 5895 return 1; 5896 } 5897 5898 __setup("isolcpus=", isolated_cpu_setup); 5899 5900 struct s_data { 5901 struct sched_domain ** __percpu sd; 5902 struct root_domain *rd; 5903 }; 5904 5905 enum s_alloc { 5906 sa_rootdomain, 5907 sa_sd, 5908 sa_sd_storage, 5909 sa_none, 5910 }; 5911 5912 /* 5913 * Build an iteration mask that can exclude certain CPUs from the upwards 5914 * domain traversal. 5915 * 5916 * Asymmetric node setups can result in situations where the domain tree is of 5917 * unequal depth, make sure to skip domains that already cover the entire 5918 * range. 5919 * 5920 * In that case build_sched_domains() will have terminated the iteration early 5921 * and our sibling sd spans will be empty. Domains should always include the 5922 * cpu they're built on, so check that. 
5923 * 5924 */ 5925 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) 5926 { 5927 const struct cpumask *span = sched_domain_span(sd); 5928 struct sd_data *sdd = sd->private; 5929 struct sched_domain *sibling; 5930 int i; 5931 5932 for_each_cpu(i, span) { 5933 sibling = *per_cpu_ptr(sdd->sd, i); 5934 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 5935 continue; 5936 5937 cpumask_set_cpu(i, sched_group_mask(sg)); 5938 } 5939 } 5940 5941 /* 5942 * Return the canonical balance cpu for this group, this is the first cpu 5943 * of this group that's also in the iteration mask. 5944 */ 5945 int group_balance_cpu(struct sched_group *sg) 5946 { 5947 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); 5948 } 5949 5950 static int 5951 build_overlap_sched_groups(struct sched_domain *sd, int cpu) 5952 { 5953 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 5954 const struct cpumask *span = sched_domain_span(sd); 5955 struct cpumask *covered = sched_domains_tmpmask; 5956 struct sd_data *sdd = sd->private; 5957 struct sched_domain *sibling; 5958 int i; 5959 5960 cpumask_clear(covered); 5961 5962 for_each_cpu(i, span) { 5963 struct cpumask *sg_span; 5964 5965 if (cpumask_test_cpu(i, covered)) 5966 continue; 5967 5968 sibling = *per_cpu_ptr(sdd->sd, i); 5969 5970 /* See the comment near build_group_mask(). */ 5971 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 5972 continue; 5973 5974 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5975 GFP_KERNEL, cpu_to_node(cpu)); 5976 5977 if (!sg) 5978 goto fail; 5979 5980 sg_span = sched_group_cpus(sg); 5981 if (sibling->child) 5982 cpumask_copy(sg_span, sched_domain_span(sibling->child)); 5983 else 5984 cpumask_set_cpu(i, sg_span); 5985 5986 cpumask_or(covered, covered, sg_span); 5987 5988 sg->sgc = *per_cpu_ptr(sdd->sgc, i); 5989 if (atomic_inc_return(&sg->sgc->ref) == 1) 5990 build_group_mask(sd, sg); 5991 5992 /* 5993 * Initialize sgc->capacity such that even if we mess up the 5994 * domains and no possible iteration will get us here, we won't 5995 * die on a /0 trap. 5996 */ 5997 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); 5998 5999 /* 6000 * Make sure the first group of this domain contains the 6001 * canonical balance cpu. Otherwise the sched_domain iteration 6002 * breaks. See update_sg_lb_stats(). 6003 */ 6004 if ((!groups && cpumask_test_cpu(cpu, sg_span)) || 6005 group_balance_cpu(sg) == cpu) 6006 groups = sg; 6007 6008 if (!first) 6009 first = sg; 6010 if (last) 6011 last->next = sg; 6012 last = sg; 6013 last->next = first; 6014 } 6015 sd->groups = groups; 6016 6017 return 0; 6018 6019 fail: 6020 free_sched_groups(first, 0); 6021 6022 return -ENOMEM; 6023 } 6024 6025 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 6026 { 6027 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 6028 struct sched_domain *child = sd->child; 6029 6030 if (child) 6031 cpu = cpumask_first(sched_domain_span(child)); 6032 6033 if (sg) { 6034 *sg = *per_cpu_ptr(sdd->sg, cpu); 6035 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); 6036 atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ 6037 } 6038 6039 return cpu; 6040 } 6041 6042 /* 6043 * build_sched_groups will build a circular linked list of the groups 6044 * covered by the given span, and will set each group's ->cpumask correctly, 6045 * and ->cpu_capacity to 0. 
6046 * 6047 * Assumes the sched_domain tree is fully constructed. 6048 */ 6049 static int 6050 build_sched_groups(struct sched_domain *sd, int cpu) 6051 { 6052 struct sched_group *first = NULL, *last = NULL; 6053 struct sd_data *sdd = sd->private; 6054 const struct cpumask *span = sched_domain_span(sd); 6055 struct cpumask *covered; 6056 int i; 6057 6058 get_group(cpu, sdd, &sd->groups); 6059 atomic_inc(&sd->groups->ref); 6060 6061 if (cpu != cpumask_first(span)) 6062 return 0; 6063 6064 lockdep_assert_held(&sched_domains_mutex); 6065 covered = sched_domains_tmpmask; 6066 6067 cpumask_clear(covered); 6068 6069 for_each_cpu(i, span) { 6070 struct sched_group *sg; 6071 int group, j; 6072 6073 if (cpumask_test_cpu(i, covered)) 6074 continue; 6075 6076 group = get_group(i, sdd, &sg); 6077 cpumask_setall(sched_group_mask(sg)); 6078 6079 for_each_cpu(j, span) { 6080 if (get_group(j, sdd, NULL) != group) 6081 continue; 6082 6083 cpumask_set_cpu(j, covered); 6084 cpumask_set_cpu(j, sched_group_cpus(sg)); 6085 } 6086 6087 if (!first) 6088 first = sg; 6089 if (last) 6090 last->next = sg; 6091 last = sg; 6092 } 6093 last->next = first; 6094 6095 return 0; 6096 } 6097
6098 /* 6099 * Initialize sched groups cpu_capacity. 6100 * 6101 * cpu_capacity indicates the capacity of a sched group, which is used while 6102 * distributing the load between different sched groups in a sched domain. 6103 * Typically cpu_capacity for all the groups in a sched domain will be the 6104 * same unless there are asymmetries in the topology. If there are asymmetries, 6105 * a group having more cpu_capacity will pick up more load than a 6106 * group having less cpu_capacity. 6107 */ 6108 static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) 6109 { 6110 struct sched_group *sg = sd->groups; 6111 6112 WARN_ON(!sg); 6113 6114 do { 6115 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 6116 sg = sg->next; 6117 } while (sg != sd->groups); 6118 6119 if (cpu != group_balance_cpu(sg)) 6120 return; 6121 6122 update_group_capacity(sd, cpu); 6123 atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); 6124 } 6125
6126 /* 6127 * Initializers for sched domains. 6128 * Non-inlined to reduce accumulated stack pressure in build_sched_domains(). 6129 */ 6130 6131 static int default_relax_domain_level = -1; 6132 int sched_domain_level_max; 6133 6134 static int __init setup_relax_domain_level(char *str) 6135 { 6136 if (kstrtoint(str, 0, &default_relax_domain_level)) 6137 pr_warn("Unable to set relax_domain_level\n"); 6138 6139 return 1; 6140 } 6141 __setup("relax_domain_level=", setup_relax_domain_level); 6142
6143 static void set_domain_attribute(struct sched_domain *sd, 6144 struct sched_domain_attr *attr) 6145 { 6146 int request; 6147 6148 if (!attr || attr->relax_domain_level < 0) { 6149 if (default_relax_domain_level < 0) 6150 return; 6151 else 6152 request = default_relax_domain_level; 6153 } else 6154 request = attr->relax_domain_level; 6155 if (request < sd->level) { 6156 /* turn off idle balance on this domain */ 6157 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6158 } else { 6159 /* turn on idle balance on this domain */ 6160 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6161 } 6162 } 6163
6164 static void __sdt_free(const struct cpumask *cpu_map); 6165 static int __sdt_alloc(const struct cpumask *cpu_map); 6166 6167 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 6168 const struct cpumask *cpu_map) 6169 { 6170 switch (what) { 6171 case sa_rootdomain: 6172 if
(!atomic_read(&d->rd->refcount)) 6173 free_rootdomain(&d->rd->rcu); /* fall through */ 6174 case sa_sd: 6175 free_percpu(d->sd); /* fall through */ 6176 case sa_sd_storage: 6177 __sdt_free(cpu_map); /* fall through */ 6178 case sa_none: 6179 break; 6180 } 6181 } 6182 6183 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 6184 const struct cpumask *cpu_map) 6185 { 6186 memset(d, 0, sizeof(*d)); 6187 6188 if (__sdt_alloc(cpu_map)) 6189 return sa_sd_storage; 6190 d->sd = alloc_percpu(struct sched_domain *); 6191 if (!d->sd) 6192 return sa_sd_storage; 6193 d->rd = alloc_rootdomain(); 6194 if (!d->rd) 6195 return sa_sd; 6196 return sa_rootdomain; 6197 } 6198 6199 /* 6200 * NULL the sd_data elements we've used to build the sched_domain and 6201 * sched_group structure so that the subsequent __free_domain_allocs() 6202 * will not free the data we're using. 6203 */ 6204 static void claim_allocations(int cpu, struct sched_domain *sd) 6205 { 6206 struct sd_data *sdd = sd->private; 6207 6208 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 6209 *per_cpu_ptr(sdd->sd, cpu) = NULL; 6210 6211 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 6212 *per_cpu_ptr(sdd->sg, cpu) = NULL; 6213 6214 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) 6215 *per_cpu_ptr(sdd->sgc, cpu) = NULL; 6216 } 6217 6218 #ifdef CONFIG_NUMA 6219 static int sched_domains_numa_levels; 6220 enum numa_topology_type sched_numa_topology_type; 6221 static int *sched_domains_numa_distance; 6222 int sched_max_numa_distance; 6223 static struct cpumask ***sched_domains_numa_masks; 6224 static int sched_domains_curr_level; 6225 #endif 6226 6227 /* 6228 * SD_flags allowed in topology descriptions. 6229 * 6230 * SD_SHARE_CPUCAPACITY - describes SMT topologies 6231 * SD_SHARE_PKG_RESOURCES - describes shared caches 6232 * SD_NUMA - describes NUMA topologies 6233 * SD_SHARE_POWERDOMAIN - describes shared power domain 6234 * 6235 * Odd one out: 6236 * SD_ASYM_PACKING - describes SMT quirks 6237 */ 6238 #define TOPOLOGY_SD_FLAGS \ 6239 (SD_SHARE_CPUCAPACITY | \ 6240 SD_SHARE_PKG_RESOURCES | \ 6241 SD_NUMA | \ 6242 SD_ASYM_PACKING | \ 6243 SD_SHARE_POWERDOMAIN) 6244 6245 static struct sched_domain * 6246 sd_init(struct sched_domain_topology_level *tl, int cpu) 6247 { 6248 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6249 int sd_weight, sd_flags = 0; 6250 6251 #ifdef CONFIG_NUMA 6252 /* 6253 * Ugly hack to pass state to sd_numa_mask()... 
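 * (The ->mask() callback of a sched_domain_topology_level only gets a cpu
 * argument, so the NUMA level currently being built is stashed in
 * sched_domains_curr_level and read back by sd_numa_mask() below.)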
6254 */ 6255 sched_domains_curr_level = tl->numa_level; 6256 #endif 6257 6258 sd_weight = cpumask_weight(tl->mask(cpu)); 6259 6260 if (tl->sd_flags) 6261 sd_flags = (*tl->sd_flags)(); 6262 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, 6263 "wrong sd_flags in topology description\n")) 6264 sd_flags &= ~TOPOLOGY_SD_FLAGS; 6265 6266 *sd = (struct sched_domain){ 6267 .min_interval = sd_weight, 6268 .max_interval = 2*sd_weight, 6269 .busy_factor = 32, 6270 .imbalance_pct = 125, 6271 6272 .cache_nice_tries = 0, 6273 .busy_idx = 0, 6274 .idle_idx = 0, 6275 .newidle_idx = 0, 6276 .wake_idx = 0, 6277 .forkexec_idx = 0, 6278 6279 .flags = 1*SD_LOAD_BALANCE 6280 | 1*SD_BALANCE_NEWIDLE 6281 | 1*SD_BALANCE_EXEC 6282 | 1*SD_BALANCE_FORK 6283 | 0*SD_BALANCE_WAKE 6284 | 1*SD_WAKE_AFFINE 6285 | 0*SD_SHARE_CPUCAPACITY 6286 | 0*SD_SHARE_PKG_RESOURCES 6287 | 0*SD_SERIALIZE 6288 | 0*SD_PREFER_SIBLING 6289 | 0*SD_NUMA 6290 | sd_flags 6291 , 6292 6293 .last_balance = jiffies, 6294 .balance_interval = sd_weight, 6295 .smt_gain = 0, 6296 .max_newidle_lb_cost = 0, 6297 .next_decay_max_lb_cost = jiffies, 6298 #ifdef CONFIG_SCHED_DEBUG 6299 .name = tl->name, 6300 #endif 6301 }; 6302 6303 /* 6304 * Convert topological properties into behaviour. 6305 */ 6306 6307 if (sd->flags & SD_SHARE_CPUCAPACITY) { 6308 sd->flags |= SD_PREFER_SIBLING; 6309 sd->imbalance_pct = 110; 6310 sd->smt_gain = 1178; /* ~15% */ 6311 6312 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { 6313 sd->imbalance_pct = 117; 6314 sd->cache_nice_tries = 1; 6315 sd->busy_idx = 2; 6316 6317 #ifdef CONFIG_NUMA 6318 } else if (sd->flags & SD_NUMA) { 6319 sd->cache_nice_tries = 2; 6320 sd->busy_idx = 3; 6321 sd->idle_idx = 2; 6322 6323 sd->flags |= SD_SERIALIZE; 6324 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { 6325 sd->flags &= ~(SD_BALANCE_EXEC | 6326 SD_BALANCE_FORK | 6327 SD_WAKE_AFFINE); 6328 } 6329 6330 #endif 6331 } else { 6332 sd->flags |= SD_PREFER_SIBLING; 6333 sd->cache_nice_tries = 1; 6334 sd->busy_idx = 2; 6335 sd->idle_idx = 1; 6336 } 6337 6338 sd->private = &tl->data; 6339 6340 return sd; 6341 } 6342 6343 /* 6344 * Topology list, bottom-up. 
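 * The default list below runs SMT -> MC -> DIE; sched_init_numa() may
 * later append one extra topology level per unique internode distance.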
6345 */ 6346 static struct sched_domain_topology_level default_topology[] = { 6347 #ifdef CONFIG_SCHED_SMT 6348 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, 6349 #endif 6350 #ifdef CONFIG_SCHED_MC 6351 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, 6352 #endif 6353 { cpu_cpu_mask, SD_INIT_NAME(DIE) }, 6354 { NULL, }, 6355 }; 6356 6357 struct sched_domain_topology_level *sched_domain_topology = default_topology; 6358 6359 #define for_each_sd_topology(tl) \ 6360 for (tl = sched_domain_topology; tl->mask; tl++) 6361 6362 void set_sched_topology(struct sched_domain_topology_level *tl) 6363 { 6364 sched_domain_topology = tl; 6365 } 6366 6367 #ifdef CONFIG_NUMA 6368 6369 static const struct cpumask *sd_numa_mask(int cpu) 6370 { 6371 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6372 } 6373 6374 static void sched_numa_warn(const char *str) 6375 { 6376 static int done = false; 6377 int i,j; 6378 6379 if (done) 6380 return; 6381 6382 done = true; 6383 6384 printk(KERN_WARNING "ERROR: %s\n\n", str); 6385 6386 for (i = 0; i < nr_node_ids; i++) { 6387 printk(KERN_WARNING " "); 6388 for (j = 0; j < nr_node_ids; j++) 6389 printk(KERN_CONT "%02d ", node_distance(i,j)); 6390 printk(KERN_CONT "\n"); 6391 } 6392 printk(KERN_WARNING "\n"); 6393 } 6394 6395 bool find_numa_distance(int distance) 6396 { 6397 int i; 6398 6399 if (distance == node_distance(0, 0)) 6400 return true; 6401 6402 for (i = 0; i < sched_domains_numa_levels; i++) { 6403 if (sched_domains_numa_distance[i] == distance) 6404 return true; 6405 } 6406 6407 return false; 6408 } 6409 6410 /* 6411 * A system can have three types of NUMA topology: 6412 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system 6413 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes 6414 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane 6415 * 6416 * The difference between a glueless mesh topology and a backplane 6417 * topology lies in whether communication between not directly 6418 * connected nodes goes through intermediary nodes (where programs 6419 * could run), or through backplane controllers. This affects 6420 * placement of programs. 6421 * 6422 * The type of topology can be discerned with the following tests: 6423 * - If the maximum distance between any nodes is 1 hop, the system 6424 * is directly connected. 6425 * - If for two nodes A and B, located N > 1 hops away from each other, 6426 * there is an intermediary node C, which is < N hops away from both 6427 * nodes A and B, the system is a glueless mesh. 6428 */ 6429 static void init_numa_topology_type(void) 6430 { 6431 int a, b, c, n; 6432 6433 n = sched_max_numa_distance; 6434 6435 if (n <= 1) 6436 sched_numa_topology_type = NUMA_DIRECT; 6437 6438 for_each_online_node(a) { 6439 for_each_online_node(b) { 6440 /* Find two nodes furthest removed from each other. */ 6441 if (node_distance(a, b) < n) 6442 continue; 6443 6444 /* Is there an intermediary node between a and b? 
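 * That is, a node c closer than n hops to both of them: finding one
 * classifies the topology as a glueless mesh, finding none means the
 * far-apart nodes can only talk through a backplane.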
*/ 6445 for_each_online_node(c) { 6446 if (node_distance(a, c) < n && 6447 node_distance(b, c) < n) { 6448 sched_numa_topology_type = 6449 NUMA_GLUELESS_MESH; 6450 return; 6451 } 6452 } 6453 6454 sched_numa_topology_type = NUMA_BACKPLANE; 6455 return; 6456 } 6457 } 6458 } 6459 6460 static void sched_init_numa(void) 6461 { 6462 int next_distance, curr_distance = node_distance(0, 0); 6463 struct sched_domain_topology_level *tl; 6464 int level = 0; 6465 int i, j, k; 6466 6467 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6468 if (!sched_domains_numa_distance) 6469 return; 6470 6471 /* 6472 * O(nr_nodes^2) deduplicating selection sort -- in order to find the 6473 * unique distances in the node_distance() table. 6474 * 6475 * Assumes node_distance(0,j) includes all distances in 6476 * node_distance(i,j) in order to avoid cubic time. 6477 */ 6478 next_distance = curr_distance; 6479 for (i = 0; i < nr_node_ids; i++) { 6480 for (j = 0; j < nr_node_ids; j++) { 6481 for (k = 0; k < nr_node_ids; k++) { 6482 int distance = node_distance(i, k); 6483 6484 if (distance > curr_distance && 6485 (distance < next_distance || 6486 next_distance == curr_distance)) 6487 next_distance = distance; 6488 6489 /* 6490 * While not a strong assumption it would be nice to know 6491 * about cases where if node A is connected to B, B is not 6492 * equally connected to A. 6493 */ 6494 if (sched_debug() && node_distance(k, i) != distance) 6495 sched_numa_warn("Node-distance not symmetric"); 6496 6497 if (sched_debug() && i && !find_numa_distance(distance)) 6498 sched_numa_warn("Node-0 not representative"); 6499 } 6500 if (next_distance != curr_distance) { 6501 sched_domains_numa_distance[level++] = next_distance; 6502 sched_domains_numa_levels = level; 6503 curr_distance = next_distance; 6504 } else break; 6505 } 6506 6507 /* 6508 * In case of sched_debug() we verify the above assumption. 6509 */ 6510 if (!sched_debug()) 6511 break; 6512 } 6513 6514 if (!level) 6515 return; 6516 6517 /* 6518 * 'level' contains the number of unique distances, excluding the 6519 * identity distance node_distance(i,i). 6520 * 6521 * The sched_domains_numa_distance[] array includes the actual distance 6522 * numbers. 6523 */ 6524 6525 /* 6526 * Here, we should temporarily reset sched_domains_numa_levels to 0. 6527 * If it fails to allocate memory for array sched_domains_numa_masks[][], 6528 * the array will contain less then 'level' members. This could be 6529 * dangerous when we use it to iterate array sched_domains_numa_masks[][] 6530 * in other functions. 6531 * 6532 * We reset it to 'level' at the end of this function. 6533 */ 6534 sched_domains_numa_levels = 0; 6535 6536 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 6537 if (!sched_domains_numa_masks) 6538 return; 6539 6540 /* 6541 * Now for each level, construct a mask per node which contains all 6542 * cpus of nodes that are that many hops away from us. 
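 * That is, sched_domains_numa_masks[i][j] is the union of
 * cpumask_of_node(k) for every node k whose distance from node j is at
 * most sched_domains_numa_distance[i].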
6543 */ 6544 for (i = 0; i < level; i++) { 6545 sched_domains_numa_masks[i] = 6546 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); 6547 if (!sched_domains_numa_masks[i]) 6548 return; 6549 6550 for (j = 0; j < nr_node_ids; j++) { 6551 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); 6552 if (!mask) 6553 return; 6554 6555 sched_domains_numa_masks[i][j] = mask; 6556 6557 for (k = 0; k < nr_node_ids; k++) { 6558 if (node_distance(j, k) > sched_domains_numa_distance[i]) 6559 continue; 6560 6561 cpumask_or(mask, mask, cpumask_of_node(k)); 6562 } 6563 } 6564 } 6565 6566 /* Compute default topology size */ 6567 for (i = 0; sched_domain_topology[i].mask; i++); 6568 6569 tl = kzalloc((i + level + 1) * 6570 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6571 if (!tl) 6572 return; 6573 6574 /* 6575 * Copy the default topology bits.. 6576 */ 6577 for (i = 0; sched_domain_topology[i].mask; i++) 6578 tl[i] = sched_domain_topology[i]; 6579 6580 /* 6581 * .. and append 'j' levels of NUMA goodness. 6582 */ 6583 for (j = 0; j < level; i++, j++) { 6584 tl[i] = (struct sched_domain_topology_level){ 6585 .mask = sd_numa_mask, 6586 .sd_flags = cpu_numa_flags, 6587 .flags = SDTL_OVERLAP, 6588 .numa_level = j, 6589 SD_INIT_NAME(NUMA) 6590 }; 6591 } 6592 6593 sched_domain_topology = tl; 6594 6595 sched_domains_numa_levels = level; 6596 sched_max_numa_distance = sched_domains_numa_distance[level - 1]; 6597 6598 init_numa_topology_type(); 6599 } 6600 6601 static void sched_domains_numa_masks_set(int cpu) 6602 { 6603 int i, j; 6604 int node = cpu_to_node(cpu); 6605 6606 for (i = 0; i < sched_domains_numa_levels; i++) { 6607 for (j = 0; j < nr_node_ids; j++) { 6608 if (node_distance(j, node) <= sched_domains_numa_distance[i]) 6609 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); 6610 } 6611 } 6612 } 6613 6614 static void sched_domains_numa_masks_clear(int cpu) 6615 { 6616 int i, j; 6617 for (i = 0; i < sched_domains_numa_levels; i++) { 6618 for (j = 0; j < nr_node_ids; j++) 6619 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); 6620 } 6621 } 6622 6623 /* 6624 * Update sched_domains_numa_masks[level][node] array when new cpus 6625 * are onlined. 
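 * The notifier below also clears the cpu from every mask again when it
 * is taken down (CPU_DEAD).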
6626 */ 6627 static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6628 unsigned long action, 6629 void *hcpu) 6630 { 6631 int cpu = (long)hcpu; 6632 6633 switch (action & ~CPU_TASKS_FROZEN) { 6634 case CPU_ONLINE: 6635 sched_domains_numa_masks_set(cpu); 6636 break; 6637 6638 case CPU_DEAD: 6639 sched_domains_numa_masks_clear(cpu); 6640 break; 6641 6642 default: 6643 return NOTIFY_DONE; 6644 } 6645 6646 return NOTIFY_OK; 6647 } 6648 #else 6649 static inline void sched_init_numa(void) 6650 { 6651 } 6652 6653 static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6654 unsigned long action, 6655 void *hcpu) 6656 { 6657 return 0; 6658 } 6659 #endif /* CONFIG_NUMA */ 6660 6661 static int __sdt_alloc(const struct cpumask *cpu_map) 6662 { 6663 struct sched_domain_topology_level *tl; 6664 int j; 6665 6666 for_each_sd_topology(tl) { 6667 struct sd_data *sdd = &tl->data; 6668 6669 sdd->sd = alloc_percpu(struct sched_domain *); 6670 if (!sdd->sd) 6671 return -ENOMEM; 6672 6673 sdd->sg = alloc_percpu(struct sched_group *); 6674 if (!sdd->sg) 6675 return -ENOMEM; 6676 6677 sdd->sgc = alloc_percpu(struct sched_group_capacity *); 6678 if (!sdd->sgc) 6679 return -ENOMEM; 6680 6681 for_each_cpu(j, cpu_map) { 6682 struct sched_domain *sd; 6683 struct sched_group *sg; 6684 struct sched_group_capacity *sgc; 6685 6686 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 6687 GFP_KERNEL, cpu_to_node(j)); 6688 if (!sd) 6689 return -ENOMEM; 6690 6691 *per_cpu_ptr(sdd->sd, j) = sd; 6692 6693 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6694 GFP_KERNEL, cpu_to_node(j)); 6695 if (!sg) 6696 return -ENOMEM; 6697 6698 sg->next = sg; 6699 6700 *per_cpu_ptr(sdd->sg, j) = sg; 6701 6702 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), 6703 GFP_KERNEL, cpu_to_node(j)); 6704 if (!sgc) 6705 return -ENOMEM; 6706 6707 *per_cpu_ptr(sdd->sgc, j) = sgc; 6708 } 6709 } 6710 6711 return 0; 6712 } 6713 6714 static void __sdt_free(const struct cpumask *cpu_map) 6715 { 6716 struct sched_domain_topology_level *tl; 6717 int j; 6718 6719 for_each_sd_topology(tl) { 6720 struct sd_data *sdd = &tl->data; 6721 6722 for_each_cpu(j, cpu_map) { 6723 struct sched_domain *sd; 6724 6725 if (sdd->sd) { 6726 sd = *per_cpu_ptr(sdd->sd, j); 6727 if (sd && (sd->flags & SD_OVERLAP)) 6728 free_sched_groups(sd->groups, 0); 6729 kfree(*per_cpu_ptr(sdd->sd, j)); 6730 } 6731 6732 if (sdd->sg) 6733 kfree(*per_cpu_ptr(sdd->sg, j)); 6734 if (sdd->sgc) 6735 kfree(*per_cpu_ptr(sdd->sgc, j)); 6736 } 6737 free_percpu(sdd->sd); 6738 sdd->sd = NULL; 6739 free_percpu(sdd->sg); 6740 sdd->sg = NULL; 6741 free_percpu(sdd->sgc); 6742 sdd->sgc = NULL; 6743 } 6744 } 6745 6746 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 6747 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6748 struct sched_domain *child, int cpu) 6749 { 6750 struct sched_domain *sd = sd_init(tl, cpu); 6751 if (!sd) 6752 return child; 6753 6754 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6755 if (child) { 6756 sd->level = child->level + 1; 6757 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6758 child->parent = sd; 6759 sd->child = child; 6760 6761 if (!cpumask_subset(sched_domain_span(child), 6762 sched_domain_span(sd))) { 6763 pr_err("BUG: arch topology borken\n"); 6764 #ifdef CONFIG_SCHED_DEBUG 6765 pr_err(" the %s domain not a subset of the %s domain\n", 6766 child->name, sd->name); 6767 #endif 6768 /* Fixup, ensure @sd has at least @child 
cpus. */ 6769 cpumask_or(sched_domain_span(sd), 6770 sched_domain_span(sd), 6771 sched_domain_span(child)); 6772 } 6773 6774 } 6775 set_domain_attribute(sd, attr); 6776 6777 return sd; 6778 } 6779 6780 /* 6781 * Build sched domains for a given set of cpus and attach the sched domains 6782 * to the individual cpus 6783 */ 6784 static int build_sched_domains(const struct cpumask *cpu_map, 6785 struct sched_domain_attr *attr) 6786 { 6787 enum s_alloc alloc_state; 6788 struct sched_domain *sd; 6789 struct s_data d; 6790 int i, ret = -ENOMEM; 6791 6792 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 6793 if (alloc_state != sa_rootdomain) 6794 goto error; 6795 6796 /* Set up domains for cpus specified by the cpu_map. */ 6797 for_each_cpu(i, cpu_map) { 6798 struct sched_domain_topology_level *tl; 6799 6800 sd = NULL; 6801 for_each_sd_topology(tl) { 6802 sd = build_sched_domain(tl, cpu_map, attr, sd, i); 6803 if (tl == sched_domain_topology) 6804 *per_cpu_ptr(d.sd, i) = sd; 6805 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 6806 sd->flags |= SD_OVERLAP; 6807 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 6808 break; 6809 } 6810 } 6811 6812 /* Build the groups for the domains */ 6813 for_each_cpu(i, cpu_map) { 6814 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6815 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 6816 if (sd->flags & SD_OVERLAP) { 6817 if (build_overlap_sched_groups(sd, i)) 6818 goto error; 6819 } else { 6820 if (build_sched_groups(sd, i)) 6821 goto error; 6822 } 6823 } 6824 } 6825 6826 /* Calculate CPU capacity for physical packages and nodes */ 6827 for (i = nr_cpumask_bits-1; i >= 0; i--) { 6828 if (!cpumask_test_cpu(i, cpu_map)) 6829 continue; 6830 6831 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6832 claim_allocations(i, sd); 6833 init_sched_groups_capacity(i, sd); 6834 } 6835 } 6836 6837 /* Attach the domains */ 6838 rcu_read_lock(); 6839 for_each_cpu(i, cpu_map) { 6840 sd = *per_cpu_ptr(d.sd, i); 6841 cpu_attach_domain(sd, d.rd, i); 6842 } 6843 rcu_read_unlock(); 6844 6845 ret = 0; 6846 error: 6847 __free_domain_allocs(&d, alloc_state, cpu_map); 6848 return ret; 6849 } 6850 6851 static cpumask_var_t *doms_cur; /* current sched domains */ 6852 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 6853 static struct sched_domain_attr *dattr_cur; 6854 /* attribues of custom domains in 'doms_cur' */ 6855 6856 /* 6857 * Special case: If a kmalloc of a doms_cur partition (array of 6858 * cpumask) fails, then fallback to a single sched domain, 6859 * as determined by the single cpumask fallback_doms. 6860 */ 6861 static cpumask_var_t fallback_doms; 6862 6863 /* 6864 * arch_update_cpu_topology lets virtualized architectures update the 6865 * cpu core maps. It is supposed to return 1 if the topology changed 6866 * or 0 if it stayed the same. 
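 * The __weak stub below is what gets used when the architecture does
 * not provide its own implementation; it simply reports no change.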
6867 */ 6868 int __weak arch_update_cpu_topology(void) 6869 { 6870 return 0; 6871 } 6872 6873 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 6874 { 6875 int i; 6876 cpumask_var_t *doms; 6877 6878 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 6879 if (!doms) 6880 return NULL; 6881 for (i = 0; i < ndoms; i++) { 6882 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 6883 free_sched_domains(doms, i); 6884 return NULL; 6885 } 6886 } 6887 return doms; 6888 } 6889 6890 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 6891 { 6892 unsigned int i; 6893 for (i = 0; i < ndoms; i++) 6894 free_cpumask_var(doms[i]); 6895 kfree(doms); 6896 } 6897 6898 /* 6899 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6900 * For now this just excludes isolated cpus, but could be used to 6901 * exclude other special cases in the future. 6902 */ 6903 static int init_sched_domains(const struct cpumask *cpu_map) 6904 { 6905 int err; 6906 6907 arch_update_cpu_topology(); 6908 ndoms_cur = 1; 6909 doms_cur = alloc_sched_domains(ndoms_cur); 6910 if (!doms_cur) 6911 doms_cur = &fallback_doms; 6912 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6913 err = build_sched_domains(doms_cur[0], NULL); 6914 register_sched_domain_sysctl(); 6915 6916 return err; 6917 } 6918 6919 /* 6920 * Detach sched domains from a group of cpus specified in cpu_map 6921 * These cpus will now be attached to the NULL domain 6922 */ 6923 static void detach_destroy_domains(const struct cpumask *cpu_map) 6924 { 6925 int i; 6926 6927 rcu_read_lock(); 6928 for_each_cpu(i, cpu_map) 6929 cpu_attach_domain(NULL, &def_root_domain, i); 6930 rcu_read_unlock(); 6931 } 6932 6933 /* handle null as "default" */ 6934 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 6935 struct sched_domain_attr *new, int idx_new) 6936 { 6937 struct sched_domain_attr tmp; 6938 6939 /* fast path */ 6940 if (!new && !cur) 6941 return 1; 6942 6943 tmp = SD_ATTR_INIT; 6944 return !memcmp(cur ? (cur + idx_cur) : &tmp, 6945 new ? (new + idx_new) : &tmp, 6946 sizeof(struct sched_domain_attr)); 6947 } 6948 6949 /* 6950 * Partition sched domains as specified by the 'ndoms_new' 6951 * cpumasks in the array doms_new[] of cpumasks. This compares 6952 * doms_new[] to the current sched domain partitioning, doms_cur[]. 6953 * It destroys each deleted domain and builds each new domain. 6954 * 6955 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 6956 * The masks don't intersect (don't overlap.) We should setup one 6957 * sched domain for each mask. CPUs not in any of the cpumasks will 6958 * not be load balanced. If the same cpumask appears both in the 6959 * current 'doms_cur' domains and in the new 'doms_new', we can leave 6960 * it as it is. 6961 * 6962 * The passed in 'doms_new' should be allocated using 6963 * alloc_sched_domains. This routine takes ownership of it and will 6964 * free_sched_domains it when done with it. If the caller failed the 6965 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 6966 * and partition_sched_domains() will fallback to the single partition 6967 * 'fallback_doms', it also forces the domains to be rebuilt. 6968 * 6969 * If doms_new == NULL it will be replaced with cpu_online_mask. 6970 * ndoms_new == 0 is a special case for destroying existing domains, 6971 * and it will not create the default domain. 
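 * For example, with doms_cur[] = { A, B } and doms_new[] = { B, C } the
 * domains covering A are destroyed, B is left untouched because its
 * mask and attributes still match, and C is built from scratch.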
6972 * 6973 * Call with hotplug lock held 6974 */ 6975 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 6976 struct sched_domain_attr *dattr_new) 6977 { 6978 int i, j, n; 6979 int new_topology; 6980 6981 mutex_lock(&sched_domains_mutex); 6982 6983 /* always unregister in case we don't destroy any domains */ 6984 unregister_sched_domain_sysctl(); 6985 6986 /* Let architecture update cpu core mappings. */ 6987 new_topology = arch_update_cpu_topology(); 6988 6989 n = doms_new ? ndoms_new : 0; 6990 6991 /* Destroy deleted domains */ 6992 for (i = 0; i < ndoms_cur; i++) { 6993 for (j = 0; j < n && !new_topology; j++) { 6994 if (cpumask_equal(doms_cur[i], doms_new[j]) 6995 && dattrs_equal(dattr_cur, i, dattr_new, j)) 6996 goto match1; 6997 } 6998 /* no match - a current sched domain not in new doms_new[] */ 6999 detach_destroy_domains(doms_cur[i]); 7000 match1: 7001 ; 7002 } 7003 7004 n = ndoms_cur; 7005 if (doms_new == NULL) { 7006 n = 0; 7007 doms_new = &fallback_doms; 7008 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 7009 WARN_ON_ONCE(dattr_new); 7010 } 7011 7012 /* Build new domains */ 7013 for (i = 0; i < ndoms_new; i++) { 7014 for (j = 0; j < n && !new_topology; j++) { 7015 if (cpumask_equal(doms_new[i], doms_cur[j]) 7016 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7017 goto match2; 7018 } 7019 /* no match - add a new doms_new */ 7020 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 7021 match2: 7022 ; 7023 } 7024 7025 /* Remember the new sched domains */ 7026 if (doms_cur != &fallback_doms) 7027 free_sched_domains(doms_cur, ndoms_cur); 7028 kfree(dattr_cur); /* kfree(NULL) is safe */ 7029 doms_cur = doms_new; 7030 dattr_cur = dattr_new; 7031 ndoms_cur = ndoms_new; 7032 7033 register_sched_domain_sysctl(); 7034 7035 mutex_unlock(&sched_domains_mutex); 7036 } 7037 7038 static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ 7039 7040 /* 7041 * Update cpusets according to cpu_active mask. If cpusets are 7042 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 7043 * around partition_sched_domains(). 7044 * 7045 * If we come here as part of a suspend/resume, don't touch cpusets because we 7046 * want to restore it back to its original state upon resume anyway. 7047 */ 7048 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 7049 void *hcpu) 7050 { 7051 switch (action) { 7052 case CPU_ONLINE_FROZEN: 7053 case CPU_DOWN_FAILED_FROZEN: 7054 7055 /* 7056 * num_cpus_frozen tracks how many CPUs are involved in suspend 7057 * resume sequence. As long as this is not the last online 7058 * operation in the resume sequence, just build a single sched 7059 * domain, ignoring cpusets. 7060 */ 7061 num_cpus_frozen--; 7062 if (likely(num_cpus_frozen)) { 7063 partition_sched_domains(1, NULL, NULL); 7064 break; 7065 } 7066 7067 /* 7068 * This is the last CPU online operation. So fall through and 7069 * restore the original sched domains by considering the 7070 * cpuset configurations. 
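 * Note the deliberate absence of a break here: the frozen cases fall
 * straight through into CPU_ONLINE below.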
7071 */ 7072 7073 case CPU_ONLINE: 7074 cpuset_update_active_cpus(true); 7075 break; 7076 default: 7077 return NOTIFY_DONE; 7078 } 7079 return NOTIFY_OK; 7080 } 7081 7082 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7083 void *hcpu) 7084 { 7085 unsigned long flags; 7086 long cpu = (long)hcpu; 7087 struct dl_bw *dl_b; 7088 bool overflow; 7089 int cpus; 7090 7091 switch (action) { 7092 case CPU_DOWN_PREPARE: 7093 rcu_read_lock_sched(); 7094 dl_b = dl_bw_of(cpu); 7095 7096 raw_spin_lock_irqsave(&dl_b->lock, flags); 7097 cpus = dl_bw_cpus(cpu); 7098 overflow = __dl_overflow(dl_b, cpus, 0, 0); 7099 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7100 7101 rcu_read_unlock_sched(); 7102 7103 if (overflow) 7104 return notifier_from_errno(-EBUSY); 7105 cpuset_update_active_cpus(false); 7106 break; 7107 case CPU_DOWN_PREPARE_FROZEN: 7108 num_cpus_frozen++; 7109 partition_sched_domains(1, NULL, NULL); 7110 break; 7111 default: 7112 return NOTIFY_DONE; 7113 } 7114 return NOTIFY_OK; 7115 } 7116 7117 void __init sched_init_smp(void) 7118 { 7119 cpumask_var_t non_isolated_cpus; 7120 7121 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7122 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7123 7124 /* nohz_full won't take effect without isolating the cpus. */ 7125 tick_nohz_full_add_cpus_to(cpu_isolated_map); 7126 7127 sched_init_numa(); 7128 7129 /* 7130 * There's no userspace yet to cause hotplug operations; hence all the 7131 * cpu masks are stable and all blatant races in the below code cannot 7132 * happen. 7133 */ 7134 mutex_lock(&sched_domains_mutex); 7135 init_sched_domains(cpu_active_mask); 7136 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7137 if (cpumask_empty(non_isolated_cpus)) 7138 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7139 mutex_unlock(&sched_domains_mutex); 7140 7141 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 7142 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 7143 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 7144 7145 init_hrtick(); 7146 7147 /* Move init over to a non-isolated CPU */ 7148 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 7149 BUG(); 7150 sched_init_granularity(); 7151 free_cpumask_var(non_isolated_cpus); 7152 7153 init_sched_rt_class(); 7154 init_sched_dl_class(); 7155 } 7156 #else 7157 void __init sched_init_smp(void) 7158 { 7159 sched_init_granularity(); 7160 } 7161 #endif /* CONFIG_SMP */ 7162 7163 int in_sched_functions(unsigned long addr) 7164 { 7165 return in_lock_functions(addr) || 7166 (addr >= (unsigned long)__sched_text_start 7167 && addr < (unsigned long)__sched_text_end); 7168 } 7169 7170 #ifdef CONFIG_CGROUP_SCHED 7171 /* 7172 * Default task group. 7173 * Every task in system belongs to this group at bootup. 
7174 */ 7175 struct task_group root_task_group; 7176 LIST_HEAD(task_groups); 7177 #endif 7178 7179 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 7180 7181 void __init sched_init(void) 7182 { 7183 int i, j; 7184 unsigned long alloc_size = 0, ptr; 7185 7186 #ifdef CONFIG_FAIR_GROUP_SCHED 7187 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7188 #endif 7189 #ifdef CONFIG_RT_GROUP_SCHED 7190 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7191 #endif 7192 if (alloc_size) { 7193 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7194 7195 #ifdef CONFIG_FAIR_GROUP_SCHED 7196 root_task_group.se = (struct sched_entity **)ptr; 7197 ptr += nr_cpu_ids * sizeof(void **); 7198 7199 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 7200 ptr += nr_cpu_ids * sizeof(void **); 7201 7202 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7203 #ifdef CONFIG_RT_GROUP_SCHED 7204 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 7205 ptr += nr_cpu_ids * sizeof(void **); 7206 7207 root_task_group.rt_rq = (struct rt_rq **)ptr; 7208 ptr += nr_cpu_ids * sizeof(void **); 7209 7210 #endif /* CONFIG_RT_GROUP_SCHED */ 7211 } 7212 #ifdef CONFIG_CPUMASK_OFFSTACK 7213 for_each_possible_cpu(i) { 7214 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( 7215 cpumask_size(), GFP_KERNEL, cpu_to_node(i)); 7216 } 7217 #endif /* CONFIG_CPUMASK_OFFSTACK */ 7218 7219 init_rt_bandwidth(&def_rt_bandwidth, 7220 global_rt_period(), global_rt_runtime()); 7221 init_dl_bandwidth(&def_dl_bandwidth, 7222 global_rt_period(), global_rt_runtime()); 7223 7224 #ifdef CONFIG_SMP 7225 init_defrootdomain(); 7226 #endif 7227 7228 #ifdef CONFIG_RT_GROUP_SCHED 7229 init_rt_bandwidth(&root_task_group.rt_bandwidth, 7230 global_rt_period(), global_rt_runtime()); 7231 #endif /* CONFIG_RT_GROUP_SCHED */ 7232 7233 #ifdef CONFIG_CGROUP_SCHED 7234 list_add(&root_task_group.list, &task_groups); 7235 INIT_LIST_HEAD(&root_task_group.children); 7236 INIT_LIST_HEAD(&root_task_group.siblings); 7237 autogroup_init(&init_task); 7238 7239 #endif /* CONFIG_CGROUP_SCHED */ 7240 7241 for_each_possible_cpu(i) { 7242 struct rq *rq; 7243 7244 rq = cpu_rq(i); 7245 raw_spin_lock_init(&rq->lock); 7246 rq->nr_running = 0; 7247 rq->calc_load_active = 0; 7248 rq->calc_load_update = jiffies + LOAD_FREQ; 7249 init_cfs_rq(&rq->cfs); 7250 init_rt_rq(&rq->rt); 7251 init_dl_rq(&rq->dl); 7252 #ifdef CONFIG_FAIR_GROUP_SCHED 7253 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 7254 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7255 /* 7256 * How much cpu bandwidth does root_task_group get? 7257 * 7258 * In case of task-groups formed thr' the cgroup filesystem, it 7259 * gets 100% of the cpu resources in the system. This overall 7260 * system cpu resource is divided among the tasks of 7261 * root_task_group and its child task-groups in a fair manner, 7262 * based on each entity's (task or task-group's) weight 7263 * (se->load.weight). 7264 * 7265 * In other words, if root_task_group has 10 tasks of weight 7266 * 1024) and two child groups A0 and A1 (of weight 1024 each), 7267 * then A0's share of the cpu resource is: 7268 * 7269 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7270 * 7271 * We achieve this by letting root_task_group's tasks sit 7272 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 
7273 */ 7274 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 7275 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 7276 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7277 7278 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7279 #ifdef CONFIG_RT_GROUP_SCHED 7280 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 7281 #endif 7282 7283 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7284 rq->cpu_load[j] = 0; 7285 7286 rq->last_load_update_tick = jiffies; 7287 7288 #ifdef CONFIG_SMP 7289 rq->sd = NULL; 7290 rq->rd = NULL; 7291 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; 7292 rq->balance_callback = NULL; 7293 rq->active_balance = 0; 7294 rq->next_balance = jiffies; 7295 rq->push_cpu = 0; 7296 rq->cpu = i; 7297 rq->online = 0; 7298 rq->idle_stamp = 0; 7299 rq->avg_idle = 2*sysctl_sched_migration_cost; 7300 rq->max_idle_balance_cost = sysctl_sched_migration_cost; 7301 7302 INIT_LIST_HEAD(&rq->cfs_tasks); 7303 7304 rq_attach_root(rq, &def_root_domain); 7305 #ifdef CONFIG_NO_HZ_COMMON 7306 rq->nohz_flags = 0; 7307 #endif 7308 #ifdef CONFIG_NO_HZ_FULL 7309 rq->last_sched_tick = 0; 7310 #endif 7311 #endif 7312 init_rq_hrtick(rq); 7313 atomic_set(&rq->nr_iowait, 0); 7314 } 7315 7316 set_load_weight(&init_task); 7317 7318 #ifdef CONFIG_PREEMPT_NOTIFIERS 7319 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 7320 #endif 7321 7322 /* 7323 * The boot idle thread does lazy MMU switching as well: 7324 */ 7325 atomic_inc(&init_mm.mm_count); 7326 enter_lazy_tlb(&init_mm, current); 7327 7328 /* 7329 * During early bootup we pretend to be a normal task: 7330 */ 7331 current->sched_class = &fair_sched_class; 7332 7333 /* 7334 * Make us the idle thread. Technically, schedule() should not be 7335 * called from this thread, however somewhere below it might be, 7336 * but because we are the idle thread, we just pick up running again 7337 * when this runqueue becomes "idle". 7338 */ 7339 init_idle(current, smp_processor_id()); 7340 7341 calc_load_update = jiffies + LOAD_FREQ; 7342 7343 #ifdef CONFIG_SMP 7344 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 7345 /* May be allocated at isolcpus cmdline parse time */ 7346 if (cpu_isolated_map == NULL) 7347 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7348 idle_thread_set_boot_cpu(); 7349 set_cpu_rq_start_time(); 7350 #endif 7351 init_sched_fair_class(); 7352 7353 scheduler_running = 1; 7354 } 7355 7356 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 7357 static inline int preempt_count_equals(int preempt_offset) 7358 { 7359 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 7360 7361 return (nested == preempt_offset); 7362 } 7363 7364 void __might_sleep(const char *file, int line, int preempt_offset) 7365 { 7366 /* 7367 * Blocking primitives will set (and therefore destroy) current->state, 7368 * since we will exit with TASK_RUNNING make sure we enter with it, 7369 * otherwise we will destroy state. 7370 */ 7371 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, 7372 "do not call blocking ops when !TASK_RUNNING; " 7373 "state=%lx set at [<%p>] %pS\n", 7374 current->state, 7375 (void *)current->task_state_change, 7376 (void *)current->task_state_change); 7377 7378 ___might_sleep(file, line, preempt_offset); 7379 } 7380 EXPORT_SYMBOL(__might_sleep); 7381 7382 void ___might_sleep(const char *file, int line, int preempt_offset) 7383 { 7384 static unsigned long prev_jiffy; /* ratelimiting */ 7385 7386 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
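 * (the splat below is separately ratelimited to at most one report per
 * second via prev_jiffy)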
*/ 7387 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && 7388 !is_idle_task(current)) || 7389 system_state != SYSTEM_RUNNING || oops_in_progress) 7390 return; 7391 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 7392 return; 7393 prev_jiffy = jiffies; 7394 7395 printk(KERN_ERR 7396 "BUG: sleeping function called from invalid context at %s:%d\n", 7397 file, line); 7398 printk(KERN_ERR 7399 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 7400 in_atomic(), irqs_disabled(), 7401 current->pid, current->comm); 7402 7403 if (task_stack_end_corrupted(current)) 7404 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); 7405 7406 debug_show_held_locks(current); 7407 if (irqs_disabled()) 7408 print_irqtrace_events(current); 7409 #ifdef CONFIG_DEBUG_PREEMPT 7410 if (!preempt_count_equals(preempt_offset)) { 7411 pr_err("Preemption disabled at:"); 7412 print_ip_sym(current->preempt_disable_ip); 7413 pr_cont("\n"); 7414 } 7415 #endif 7416 dump_stack(); 7417 } 7418 EXPORT_SYMBOL(___might_sleep); 7419 #endif 7420 7421 #ifdef CONFIG_MAGIC_SYSRQ 7422 void normalize_rt_tasks(void) 7423 { 7424 struct task_struct *g, *p; 7425 struct sched_attr attr = { 7426 .sched_policy = SCHED_NORMAL, 7427 }; 7428 7429 read_lock(&tasklist_lock); 7430 for_each_process_thread(g, p) { 7431 /* 7432 * Only normalize user tasks: 7433 */ 7434 if (p->flags & PF_KTHREAD) 7435 continue; 7436 7437 p->se.exec_start = 0; 7438 #ifdef CONFIG_SCHEDSTATS 7439 p->se.statistics.wait_start = 0; 7440 p->se.statistics.sleep_start = 0; 7441 p->se.statistics.block_start = 0; 7442 #endif 7443 7444 if (!dl_task(p) && !rt_task(p)) { 7445 /* 7446 * Renice negative nice level userspace 7447 * tasks back to 0: 7448 */ 7449 if (task_nice(p) < 0) 7450 set_user_nice(p, 0); 7451 continue; 7452 } 7453 7454 __sched_setscheduler(p, &attr, false, false); 7455 } 7456 read_unlock(&tasklist_lock); 7457 } 7458 7459 #endif /* CONFIG_MAGIC_SYSRQ */ 7460 7461 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 7462 /* 7463 * These functions are only useful for the IA64 MCA handling, or kdb. 7464 * 7465 * They can only be called when the whole system has been 7466 * stopped - every CPU needs to be quiescent, and no scheduling 7467 * activity can take place. Using them for anything else would 7468 * be a serious bug, and as a result, they aren't even visible 7469 * under any other configuration. 7470 */ 7471 7472 /** 7473 * curr_task - return the current task for a given cpu. 7474 * @cpu: the processor in question. 7475 * 7476 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7477 * 7478 * Return: The current task for @cpu. 7479 */ 7480 struct task_struct *curr_task(int cpu) 7481 { 7482 return cpu_curr(cpu); 7483 } 7484 7485 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 7486 7487 #ifdef CONFIG_IA64 7488 /** 7489 * set_curr_task - set the current task for a given cpu. 7490 * @cpu: the processor in question. 7491 * @p: the task pointer to set. 7492 * 7493 * Description: This function must only be used when non-maskable interrupts 7494 * are serviced on a separate stack. It allows the architecture to switch the 7495 * notion of the current task on a cpu in a non-blocking manner. This function 7496 * must be called with all CPU's synchronized, and interrupts disabled, the 7497 * and caller must save the original value of the current task (see 7498 * curr_task() above) and restore that value before reenabling interrupts and 7499 * re-starting the system. 7500 * 7501 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
7502 */ 7503 void set_curr_task(int cpu, struct task_struct *p) 7504 { 7505 cpu_curr(cpu) = p; 7506 } 7507 7508 #endif 7509 7510 #ifdef CONFIG_CGROUP_SCHED 7511 /* task_group_lock serializes the addition/removal of task groups */ 7512 static DEFINE_SPINLOCK(task_group_lock); 7513 7514 static void free_sched_group(struct task_group *tg) 7515 { 7516 free_fair_sched_group(tg); 7517 free_rt_sched_group(tg); 7518 autogroup_free(tg); 7519 kfree(tg); 7520 } 7521 7522 /* allocate runqueue etc for a new task group */ 7523 struct task_group *sched_create_group(struct task_group *parent) 7524 { 7525 struct task_group *tg; 7526 7527 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7528 if (!tg) 7529 return ERR_PTR(-ENOMEM); 7530 7531 if (!alloc_fair_sched_group(tg, parent)) 7532 goto err; 7533 7534 if (!alloc_rt_sched_group(tg, parent)) 7535 goto err; 7536 7537 return tg; 7538 7539 err: 7540 free_sched_group(tg); 7541 return ERR_PTR(-ENOMEM); 7542 } 7543 7544 void sched_online_group(struct task_group *tg, struct task_group *parent) 7545 { 7546 unsigned long flags; 7547 7548 spin_lock_irqsave(&task_group_lock, flags); 7549 list_add_rcu(&tg->list, &task_groups); 7550 7551 WARN_ON(!parent); /* root should already exist */ 7552 7553 tg->parent = parent; 7554 INIT_LIST_HEAD(&tg->children); 7555 list_add_rcu(&tg->siblings, &parent->children); 7556 spin_unlock_irqrestore(&task_group_lock, flags); 7557 } 7558 7559 /* rcu callback to free various structures associated with a task group */ 7560 static void free_sched_group_rcu(struct rcu_head *rhp) 7561 { 7562 /* now it should be safe to free those cfs_rqs */ 7563 free_sched_group(container_of(rhp, struct task_group, rcu)); 7564 } 7565 7566 /* Destroy runqueue etc associated with a task group */ 7567 void sched_destroy_group(struct task_group *tg) 7568 { 7569 /* wait for possible concurrent references to cfs_rqs complete */ 7570 call_rcu(&tg->rcu, free_sched_group_rcu); 7571 } 7572 7573 void sched_offline_group(struct task_group *tg) 7574 { 7575 unsigned long flags; 7576 int i; 7577 7578 /* end participation in shares distribution */ 7579 for_each_possible_cpu(i) 7580 unregister_fair_sched_group(tg, i); 7581 7582 spin_lock_irqsave(&task_group_lock, flags); 7583 list_del_rcu(&tg->list); 7584 list_del_rcu(&tg->siblings); 7585 spin_unlock_irqrestore(&task_group_lock, flags); 7586 } 7587 7588 /* change task's runqueue when it moves between groups. 7589 * The caller of this function should have put the task in its new group 7590 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 7591 * reflect its new group. 7592 */ 7593 void sched_move_task(struct task_struct *tsk) 7594 { 7595 struct task_group *tg; 7596 int queued, running; 7597 unsigned long flags; 7598 struct rq *rq; 7599 7600 rq = task_rq_lock(tsk, &flags); 7601 7602 running = task_current(rq, tsk); 7603 queued = task_on_rq_queued(tsk); 7604 7605 if (queued) 7606 dequeue_task(rq, tsk, 0); 7607 if (unlikely(running)) 7608 put_prev_task(rq, tsk); 7609 7610 /* 7611 * All callers are synchronized by task_rq_lock(); we do not use RCU 7612 * which is pointless here. Thus, we pass "true" to task_css_check() 7613 * to prevent lockdep warnings. 
7614 */ 7615 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), 7616 struct task_group, css); 7617 tg = autogroup_task_group(tsk, tg); 7618 tsk->sched_task_group = tg; 7619 7620 #ifdef CONFIG_FAIR_GROUP_SCHED 7621 if (tsk->sched_class->task_move_group) 7622 tsk->sched_class->task_move_group(tsk, queued); 7623 else 7624 #endif 7625 set_task_rq(tsk, task_cpu(tsk)); 7626 7627 if (unlikely(running)) 7628 tsk->sched_class->set_curr_task(rq); 7629 if (queued) 7630 enqueue_task(rq, tsk, 0); 7631 7632 task_rq_unlock(rq, tsk, &flags); 7633 } 7634 #endif /* CONFIG_CGROUP_SCHED */ 7635 7636 #ifdef CONFIG_RT_GROUP_SCHED 7637 /* 7638 * Ensure that the real time constraints are schedulable. 7639 */ 7640 static DEFINE_MUTEX(rt_constraints_mutex); 7641 7642 /* Must be called with tasklist_lock held */ 7643 static inline int tg_has_rt_tasks(struct task_group *tg) 7644 { 7645 struct task_struct *g, *p; 7646 7647 /* 7648 * Autogroups do not have RT tasks; see autogroup_create(). 7649 */ 7650 if (task_group_is_autogroup(tg)) 7651 return 0; 7652 7653 for_each_process_thread(g, p) { 7654 if (rt_task(p) && task_group(p) == tg) 7655 return 1; 7656 } 7657 7658 return 0; 7659 } 7660 7661 struct rt_schedulable_data { 7662 struct task_group *tg; 7663 u64 rt_period; 7664 u64 rt_runtime; 7665 }; 7666 7667 static int tg_rt_schedulable(struct task_group *tg, void *data) 7668 { 7669 struct rt_schedulable_data *d = data; 7670 struct task_group *child; 7671 unsigned long total, sum = 0; 7672 u64 period, runtime; 7673 7674 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7675 runtime = tg->rt_bandwidth.rt_runtime; 7676 7677 if (tg == d->tg) { 7678 period = d->rt_period; 7679 runtime = d->rt_runtime; 7680 } 7681 7682 /* 7683 * Cannot have more runtime than the period. 7684 */ 7685 if (runtime > period && runtime != RUNTIME_INF) 7686 return -EINVAL; 7687 7688 /* 7689 * Ensure we don't starve existing RT tasks. 7690 */ 7691 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 7692 return -EBUSY; 7693 7694 total = to_ratio(period, runtime); 7695 7696 /* 7697 * Nobody can have more than the global setting allows. 7698 */ 7699 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 7700 return -EINVAL; 7701 7702 /* 7703 * The sum of our children's runtime should not exceed our own. 7704 */ 7705 list_for_each_entry_rcu(child, &tg->children, siblings) { 7706 period = ktime_to_ns(child->rt_bandwidth.rt_period); 7707 runtime = child->rt_bandwidth.rt_runtime; 7708 7709 if (child == d->tg) { 7710 period = d->rt_period; 7711 runtime = d->rt_runtime; 7712 } 7713 7714 sum += to_ratio(period, runtime); 7715 } 7716 7717 if (sum > total) 7718 return -EINVAL; 7719 7720 return 0; 7721 } 7722 7723 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7724 { 7725 int ret; 7726 7727 struct rt_schedulable_data data = { 7728 .tg = tg, 7729 .rt_period = period, 7730 .rt_runtime = runtime, 7731 }; 7732 7733 rcu_read_lock(); 7734 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 7735 rcu_read_unlock(); 7736 7737 return ret; 7738 } 7739 7740 static int tg_set_rt_bandwidth(struct task_group *tg, 7741 u64 rt_period, u64 rt_runtime) 7742 { 7743 int i, err = 0; 7744 7745 /* 7746 * Disallowing the root group RT runtime is BAD, it would disallow the 7747 * kernel creating (and or operating) RT threads. 7748 */ 7749 if (tg == &root_task_group && rt_runtime == 0) 7750 return -EINVAL; 7751 7752 /* No period doesn't make any sense. 
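 * (a zero rt_period would make the runtime/period ratio used by the
 * schedulability checks undefined)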
*/ 7753 if (rt_period == 0) 7754 return -EINVAL; 7755 7756 mutex_lock(&rt_constraints_mutex); 7757 read_lock(&tasklist_lock); 7758 err = __rt_schedulable(tg, rt_period, rt_runtime); 7759 if (err) 7760 goto unlock; 7761 7762 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7763 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 7764 tg->rt_bandwidth.rt_runtime = rt_runtime; 7765 7766 for_each_possible_cpu(i) { 7767 struct rt_rq *rt_rq = tg->rt_rq[i]; 7768 7769 raw_spin_lock(&rt_rq->rt_runtime_lock); 7770 rt_rq->rt_runtime = rt_runtime; 7771 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7772 } 7773 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7774 unlock: 7775 read_unlock(&tasklist_lock); 7776 mutex_unlock(&rt_constraints_mutex); 7777 7778 return err; 7779 } 7780 7781 static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7782 { 7783 u64 rt_runtime, rt_period; 7784 7785 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7786 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 7787 if (rt_runtime_us < 0) 7788 rt_runtime = RUNTIME_INF; 7789 7790 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7791 } 7792 7793 static long sched_group_rt_runtime(struct task_group *tg) 7794 { 7795 u64 rt_runtime_us; 7796 7797 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 7798 return -1; 7799 7800 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 7801 do_div(rt_runtime_us, NSEC_PER_USEC); 7802 return rt_runtime_us; 7803 } 7804 7805 static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) 7806 { 7807 u64 rt_runtime, rt_period; 7808 7809 rt_period = rt_period_us * NSEC_PER_USEC; 7810 rt_runtime = tg->rt_bandwidth.rt_runtime; 7811 7812 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7813 } 7814 7815 static long sched_group_rt_period(struct task_group *tg) 7816 { 7817 u64 rt_period_us; 7818 7819 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 7820 do_div(rt_period_us, NSEC_PER_USEC); 7821 return rt_period_us; 7822 } 7823 #endif /* CONFIG_RT_GROUP_SCHED */ 7824 7825 #ifdef CONFIG_RT_GROUP_SCHED 7826 static int sched_rt_global_constraints(void) 7827 { 7828 int ret = 0; 7829 7830 mutex_lock(&rt_constraints_mutex); 7831 read_lock(&tasklist_lock); 7832 ret = __rt_schedulable(NULL, 0, 0); 7833 read_unlock(&tasklist_lock); 7834 mutex_unlock(&rt_constraints_mutex); 7835 7836 return ret; 7837 } 7838 7839 static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7840 { 7841 /* Don't accept realtime tasks when there is no way for them to run */ 7842 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7843 return 0; 7844 7845 return 1; 7846 } 7847 7848 #else /* !CONFIG_RT_GROUP_SCHED */ 7849 static int sched_rt_global_constraints(void) 7850 { 7851 unsigned long flags; 7852 int i, ret = 0; 7853 7854 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7855 for_each_possible_cpu(i) { 7856 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 7857 7858 raw_spin_lock(&rt_rq->rt_runtime_lock); 7859 rt_rq->rt_runtime = global_rt_runtime(); 7860 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7861 } 7862 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7863 7864 return ret; 7865 } 7866 #endif /* CONFIG_RT_GROUP_SCHED */ 7867 7868 static int sched_dl_global_validate(void) 7869 { 7870 u64 runtime = global_rt_runtime(); 7871 u64 period = global_rt_period(); 7872 u64 new_bw = to_ratio(period, runtime); 7873 struct dl_bw *dl_b; 7874 int cpu, ret = 0; 7875 unsigned long flags; 7876 7877 /* 7878 * Here we want to check the bandwidth 
not being set to some 7879 * value smaller than the currently allocated bandwidth in 7880 * any of the root_domains. 7881 * 7882 * FIXME: Cycling on all the CPUs is overdoing, but simpler than 7883 * cycling on root_domains... Discussion on different/better 7884 * solutions is welcome! 7885 */ 7886 for_each_possible_cpu(cpu) { 7887 rcu_read_lock_sched(); 7888 dl_b = dl_bw_of(cpu); 7889 7890 raw_spin_lock_irqsave(&dl_b->lock, flags); 7891 if (new_bw < dl_b->total_bw) 7892 ret = -EBUSY; 7893 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7894 7895 rcu_read_unlock_sched(); 7896 7897 if (ret) 7898 break; 7899 } 7900 7901 return ret; 7902 } 7903 7904 static void sched_dl_do_global(void) 7905 { 7906 u64 new_bw = -1; 7907 struct dl_bw *dl_b; 7908 int cpu; 7909 unsigned long flags; 7910 7911 def_dl_bandwidth.dl_period = global_rt_period(); 7912 def_dl_bandwidth.dl_runtime = global_rt_runtime(); 7913 7914 if (global_rt_runtime() != RUNTIME_INF) 7915 new_bw = to_ratio(global_rt_period(), global_rt_runtime()); 7916 7917 /* 7918 * FIXME: As above... 7919 */ 7920 for_each_possible_cpu(cpu) { 7921 rcu_read_lock_sched(); 7922 dl_b = dl_bw_of(cpu); 7923 7924 raw_spin_lock_irqsave(&dl_b->lock, flags); 7925 dl_b->bw = new_bw; 7926 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7927 7928 rcu_read_unlock_sched(); 7929 } 7930 } 7931 7932 static int sched_rt_global_validate(void) 7933 { 7934 if (sysctl_sched_rt_period <= 0) 7935 return -EINVAL; 7936 7937 if ((sysctl_sched_rt_runtime != RUNTIME_INF) && 7938 (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) 7939 return -EINVAL; 7940 7941 return 0; 7942 } 7943 7944 static void sched_rt_do_global(void) 7945 { 7946 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7947 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); 7948 } 7949 7950 int sched_rt_handler(struct ctl_table *table, int write, 7951 void __user *buffer, size_t *lenp, 7952 loff_t *ppos) 7953 { 7954 int old_period, old_runtime; 7955 static DEFINE_MUTEX(mutex); 7956 int ret; 7957 7958 mutex_lock(&mutex); 7959 old_period = sysctl_sched_rt_period; 7960 old_runtime = sysctl_sched_rt_runtime; 7961 7962 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7963 7964 if (!ret && write) { 7965 ret = sched_rt_global_validate(); 7966 if (ret) 7967 goto undo; 7968 7969 ret = sched_dl_global_validate(); 7970 if (ret) 7971 goto undo; 7972 7973 ret = sched_rt_global_constraints(); 7974 if (ret) 7975 goto undo; 7976 7977 sched_rt_do_global(); 7978 sched_dl_do_global(); 7979 } 7980 if (0) { 7981 undo: 7982 sysctl_sched_rt_period = old_period; 7983 sysctl_sched_rt_runtime = old_runtime; 7984 } 7985 mutex_unlock(&mutex); 7986 7987 return ret; 7988 } 7989 7990 int sched_rr_handler(struct ctl_table *table, int write, 7991 void __user *buffer, size_t *lenp, 7992 loff_t *ppos) 7993 { 7994 int ret; 7995 static DEFINE_MUTEX(mutex); 7996 7997 mutex_lock(&mutex); 7998 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7999 /* make sure that internally we keep jiffies */ 8000 /* also, writing zero resets timeslice to default */ 8001 if (!ret && write) { 8002 sched_rr_timeslice = sched_rr_timeslice <= 0 ? 8003 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 8004 } 8005 mutex_unlock(&mutex); 8006 return ret; 8007 } 8008 8009 #ifdef CONFIG_CGROUP_SCHED 8010 8011 static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 8012 { 8013 return css ? 
container_of(css, struct task_group, css) : NULL; 8014 } 8015 8016 static struct cgroup_subsys_state * 8017 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 8018 { 8019 struct task_group *parent = css_tg(parent_css); 8020 struct task_group *tg; 8021 8022 if (!parent) { 8023 /* This is early initialization for the top cgroup */ 8024 return &root_task_group.css; 8025 } 8026 8027 tg = sched_create_group(parent); 8028 if (IS_ERR(tg)) 8029 return ERR_PTR(-ENOMEM); 8030 8031 return &tg->css; 8032 } 8033 8034 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) 8035 { 8036 struct task_group *tg = css_tg(css); 8037 struct task_group *parent = css_tg(css->parent); 8038 8039 if (parent) 8040 sched_online_group(tg, parent); 8041 return 0; 8042 } 8043 8044 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) 8045 { 8046 struct task_group *tg = css_tg(css); 8047 8048 sched_destroy_group(tg); 8049 } 8050 8051 static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) 8052 { 8053 struct task_group *tg = css_tg(css); 8054 8055 sched_offline_group(tg); 8056 } 8057 8058 static void cpu_cgroup_fork(struct task_struct *task) 8059 { 8060 sched_move_task(task); 8061 } 8062 8063 static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, 8064 struct cgroup_taskset *tset) 8065 { 8066 struct task_struct *task; 8067 8068 cgroup_taskset_for_each(task, tset) { 8069 #ifdef CONFIG_RT_GROUP_SCHED 8070 if (!sched_rt_can_attach(css_tg(css), task)) 8071 return -EINVAL; 8072 #else 8073 /* We don't support RT-tasks being in separate groups */ 8074 if (task->sched_class != &fair_sched_class) 8075 return -EINVAL; 8076 #endif 8077 } 8078 return 0; 8079 } 8080 8081 static void cpu_cgroup_attach(struct cgroup_subsys_state *css, 8082 struct cgroup_taskset *tset) 8083 { 8084 struct task_struct *task; 8085 8086 cgroup_taskset_for_each(task, tset) 8087 sched_move_task(task); 8088 } 8089 8090 static void cpu_cgroup_exit(struct cgroup_subsys_state *css, 8091 struct cgroup_subsys_state *old_css, 8092 struct task_struct *task) 8093 { 8094 /* 8095 * cgroup_exit() is called in the copy_process() failure path. 8096 * Ignore this case since the task hasn't ran yet, this avoids 8097 * trying to poke a half freed task state from generic code. 8098 */ 8099 if (!(task->flags & PF_EXITING)) 8100 return; 8101 8102 sched_move_task(task); 8103 } 8104 8105 #ifdef CONFIG_FAIR_GROUP_SCHED 8106 static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 8107 struct cftype *cftype, u64 shareval) 8108 { 8109 return sched_group_set_shares(css_tg(css), scale_load(shareval)); 8110 } 8111 8112 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, 8113 struct cftype *cft) 8114 { 8115 struct task_group *tg = css_tg(css); 8116 8117 return (u64) scale_load_down(tg->shares); 8118 } 8119 8120 #ifdef CONFIG_CFS_BANDWIDTH 8121 static DEFINE_MUTEX(cfs_constraints_mutex); 8122 8123 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 8124 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 8125 8126 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 8127 8128 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 8129 { 8130 int i, ret = 0, runtime_enabled, runtime_was_enabled; 8131 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 8132 8133 if (tg == &root_task_group) 8134 return -EINVAL; 8135 8136 /* 8137 * Ensure we have at some amount of bandwidth every period. 
This is 8138 * to prevent reaching a state of large arrears when throttled via 8139 * entity_tick() resulting in prolonged exit starvation. 8140 */ 8141 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 8142 return -EINVAL; 8143 8144 /* 8145 * Likewise, bound things on the otherside by preventing insane quota 8146 * periods. This also allows us to normalize in computing quota 8147 * feasibility. 8148 */ 8149 if (period > max_cfs_quota_period) 8150 return -EINVAL; 8151 8152 /* 8153 * Prevent race between setting of cfs_rq->runtime_enabled and 8154 * unthrottle_offline_cfs_rqs(). 8155 */ 8156 get_online_cpus(); 8157 mutex_lock(&cfs_constraints_mutex); 8158 ret = __cfs_schedulable(tg, period, quota); 8159 if (ret) 8160 goto out_unlock; 8161 8162 runtime_enabled = quota != RUNTIME_INF; 8163 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 8164 /* 8165 * If we need to toggle cfs_bandwidth_used, off->on must occur 8166 * before making related changes, and on->off must occur afterwards 8167 */ 8168 if (runtime_enabled && !runtime_was_enabled) 8169 cfs_bandwidth_usage_inc(); 8170 raw_spin_lock_irq(&cfs_b->lock); 8171 cfs_b->period = ns_to_ktime(period); 8172 cfs_b->quota = quota; 8173 8174 __refill_cfs_bandwidth_runtime(cfs_b); 8175 /* restart the period timer (if active) to handle new period expiry */ 8176 if (runtime_enabled) 8177 start_cfs_bandwidth(cfs_b); 8178 raw_spin_unlock_irq(&cfs_b->lock); 8179 8180 for_each_online_cpu(i) { 8181 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 8182 struct rq *rq = cfs_rq->rq; 8183 8184 raw_spin_lock_irq(&rq->lock); 8185 cfs_rq->runtime_enabled = runtime_enabled; 8186 cfs_rq->runtime_remaining = 0; 8187 8188 if (cfs_rq->throttled) 8189 unthrottle_cfs_rq(cfs_rq); 8190 raw_spin_unlock_irq(&rq->lock); 8191 } 8192 if (runtime_was_enabled && !runtime_enabled) 8193 cfs_bandwidth_usage_dec(); 8194 out_unlock: 8195 mutex_unlock(&cfs_constraints_mutex); 8196 put_online_cpus(); 8197 8198 return ret; 8199 } 8200 8201 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 8202 { 8203 u64 quota, period; 8204 8205 period = ktime_to_ns(tg->cfs_bandwidth.period); 8206 if (cfs_quota_us < 0) 8207 quota = RUNTIME_INF; 8208 else 8209 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 8210 8211 return tg_set_cfs_bandwidth(tg, period, quota); 8212 } 8213 8214 long tg_get_cfs_quota(struct task_group *tg) 8215 { 8216 u64 quota_us; 8217 8218 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 8219 return -1; 8220 8221 quota_us = tg->cfs_bandwidth.quota; 8222 do_div(quota_us, NSEC_PER_USEC); 8223 8224 return quota_us; 8225 } 8226 8227 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 8228 { 8229 u64 quota, period; 8230 8231 period = (u64)cfs_period_us * NSEC_PER_USEC; 8232 quota = tg->cfs_bandwidth.quota; 8233 8234 return tg_set_cfs_bandwidth(tg, period, quota); 8235 } 8236 8237 long tg_get_cfs_period(struct task_group *tg) 8238 { 8239 u64 cfs_period_us; 8240 8241 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 8242 do_div(cfs_period_us, NSEC_PER_USEC); 8243 8244 return cfs_period_us; 8245 } 8246 8247 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, 8248 struct cftype *cft) 8249 { 8250 return tg_get_cfs_quota(css_tg(css)); 8251 } 8252 8253 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, 8254 struct cftype *cftype, s64 cfs_quota_us) 8255 { 8256 return tg_set_cfs_quota(css_tg(css), cfs_quota_us); 8257 } 8258 8259 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, 8260 struct cftype *cft) 8261 { 8262 return 
struct cfs_schedulable_data {
	struct task_group *tg;
	u64 period, quota;
};

/*
 * normalize group quota/period to be quota/max_period
 * note: units are usecs
 */
static u64 normalize_cfs_quota(struct task_group *tg,
			       struct cfs_schedulable_data *d)
{
	u64 quota, period;

	if (tg == d->tg) {
		period = d->period;
		quota = d->quota;
	} else {
		period = tg_get_cfs_period(tg);
		quota = tg_get_cfs_quota(tg);
	}

	/* note: these should typically be equivalent */
	if (quota == RUNTIME_INF || quota == -1)
		return RUNTIME_INF;

	return to_ratio(period, quota);
}

static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
{
	struct cfs_schedulable_data *d = data;
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
	s64 quota = 0, parent_quota = -1;

	if (!tg->parent) {
		quota = RUNTIME_INF;
	} else {
		struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;

		quota = normalize_cfs_quota(tg, d);
		parent_quota = parent_b->hierarchical_quota;

		/*
		 * ensure max(child_quota) <= parent_quota, inherit when no
		 * limit is set
		 */
		if (quota == RUNTIME_INF)
			quota = parent_quota;
		else if (parent_quota != RUNTIME_INF && quota > parent_quota)
			return -EINVAL;
	}
	cfs_b->hierarchical_quota = quota;

	return 0;
}

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
	int ret;
	struct cfs_schedulable_data data = {
		.tg = tg,
		.period = period,
		.quota = quota,
	};

	if (quota != RUNTIME_INF) {
		do_div(data.period, NSEC_PER_USEC);
		do_div(data.quota, NSEC_PER_USEC);
	}

	rcu_read_lock();
	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
	rcu_read_unlock();

	return ret;
}
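
/*
 * Worked example for the check above (illustrative only; the group names
 * are hypothetical).  Suppose a parent group is limited to quota = 100000us
 * over period = 100000us, so its hierarchical_quota is
 * to_ratio(100000, 100000), i.e. one full CPU.  A child asking for
 * quota = 200000us over the same 100000us period normalizes to twice that
 * ratio, so tg_cfs_schedulable_down() returns -EINVAL during the
 * walk_tg_tree() walk and the new bandwidth is rejected, whereas a child
 * with no limit (RUNTIME_INF) simply inherits the parent's ratio.
 */
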
static int cpu_stats_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);

	return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
				struct cftype *cft, s64 val)
{
	return sched_group_set_rt_runtime(css_tg(css), val);
}

static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return sched_group_rt_runtime(css_tg(css));
}

static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
				    struct cftype *cftype, u64 rt_period_us)
{
	return sched_group_set_rt_period(css_tg(css), rt_period_us);
}

static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	return sched_group_rt_period(css_tg(css));
}
#endif /* CONFIG_RT_GROUP_SCHED */

static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	{
		.name = "shares",
		.read_u64 = cpu_shares_read_u64,
		.write_u64 = cpu_shares_write_u64,
	},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.name = "cfs_quota_us",
		.read_s64 = cpu_cfs_quota_read_s64,
		.write_s64 = cpu_cfs_quota_write_s64,
	},
	{
		.name = "cfs_period_us",
		.read_u64 = cpu_cfs_period_read_u64,
		.write_u64 = cpu_cfs_period_write_u64,
	},
	{
		.name = "stat",
		.seq_show = cpu_stats_show,
	},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	{
		.name = "rt_runtime_us",
		.read_s64 = cpu_rt_runtime_read,
		.write_s64 = cpu_rt_runtime_write,
	},
	{
		.name = "rt_period_us",
		.read_u64 = cpu_rt_period_read_uint,
		.write_u64 = cpu_rt_period_write_uint,
	},
#endif
	{ }	/* terminate */
};

struct cgroup_subsys cpu_cgrp_subsys = {
	.css_alloc	= cpu_cgroup_css_alloc,
	.css_free	= cpu_cgroup_css_free,
	.css_online	= cpu_cgroup_css_online,
	.css_offline	= cpu_cgroup_css_offline,
	.fork		= cpu_cgroup_fork,
	.can_attach	= cpu_cgroup_can_attach,
	.attach		= cpu_cgroup_attach,
	.exit		= cpu_cgroup_exit,
	.legacy_cftypes	= cpu_files,
	.early_init	= 1,
};

#endif /* CONFIG_CGROUP_SCHED */

void dump_cpu_task(int cpu)
{
	pr_info("Task dump for CPU %d:\n", cpu);
	sched_show_task(cpu_curr(cpu));
}
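
/*
 * Illustrative note (an assumption about callers, not part of the original
 * source): dump_cpu_task(3) prints "Task dump for CPU 3:" followed by
 * sched_show_task() output (comm, pid, kernel stack) for whatever task
 * cpu_curr(3) returns; diagnostic code such as the RCU CPU stall detector
 * uses it to show which task is occupying a suspect CPU.
 */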