/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Scheduler internal types and methods:
 */
#include <linux/sched.h>

#include <linux/sched/autogroup.h>
#include <linux/sched/clock.h>
#include <linux/sched/coredump.h>
#include <linux/sched/cpufreq.h>
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/sched/debug.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/idle.h>
#include <linux/sched/init.h>
#include <linux/sched/isolation.h>
#include <linux/sched/jobctl.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
#include <linux/sched/nohz.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/prio.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/sched/stat.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/topology.h>
#include <linux/sched/user.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/xacct.h>

#include <uapi/linux/sched/types.h>

#include <linux/binfmts.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/context_tracking.h>
#include <linux/cpufreq.h>
#include <linux/cpuidle.h>
#include <linux/cpuset.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/kprobes.h>
#include <linux/kthread.h>
#include <linux/membarrier.h>
#include <linux/migrate.h>
#include <linux/mmu_context.h>
#include <linux/nmi.h>
#include <linux/proc_fs.h>
#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/rcupdate_wait.h>
#include <linux/security.h>
#include <linux/stackprotector.h>
#include <linux/stop_machine.h>
#include <linux/suspend.h>
#include <linux/swait.h>
#include <linux/syscalls.h>
#include <linux/task_work.h>
#include <linux/tsacct_kern.h>

#include <asm/tlb.h>

#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
#endif

#include "cpupri.h"
#include "cpudeadline.h"

/*
 * SCHED_WARN_ON(x): with CONFIG_SCHED_DEBUG this expands to WARN_ONCE() and
 * uses the stringified condition as the message. Without it, @x is still
 * evaluated (so side effects are kept) and the expression yields 0.
 */
#ifdef CONFIG_SCHED_DEBUG
# define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
#else
# define SCHED_WARN_ON(x)	({ (void)(x), 0; })
#endif

struct rq;
struct cpuidle_state;

/* task_struct::on_rq states: */
#define TASK_ON_RQ_QUEUED	1
#define TASK_ON_RQ_MIGRATING	2

extern __read_mostly int scheduler_running;

extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;

extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);

#ifdef CONFIG_SMP
extern void cpu_load_update_active(struct rq *this_rq);
#else
/* !CONFIG_SMP: empty stub so callers need no #ifdef */
static inline void cpu_load_update_active(struct rq *this_rq) { }
#endif

/*
 * Helpers for converting nanosecond timing to jiffy resolution
 *
 * The argument is cast to unsigned long before the division, and the
 * division truncates.
 */
#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

/*
 * Increase resolution of nice-level calculations for 64-bit architectures.
 * The extra resolution improves shares distribution and load balancing of
 * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
 * hierarchies, especially on larger systems. This is not a user-visible change
 * and does not change the user-interface for setting shares/weights.
 *
 * We increase resolution only if we have enough bits to allow this increased
 * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
 * are pretty high and the returns do not justify the increased costs.
 *
 * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
 * increase coverage and consistency always enable it on 64-bit platforms.
121 */ 122 #ifdef CONFIG_64BIT 123 # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) 124 # define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) 125 # define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT) 126 #else 127 # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) 128 # define scale_load(w) (w) 129 # define scale_load_down(w) (w) 130 #endif 131 132 /* 133 * Task weight (visible to users) and its load (invisible to users) have 134 * independent resolution, but they should be well calibrated. We use 135 * scale_load() and scale_load_down(w) to convert between them. The 136 * following must be true: 137 * 138 * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD 139 * 140 */ 141 #define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) 142 143 /* 144 * Single value that decides SCHED_DEADLINE internal math precision. 145 * 10 -> just above 1us 146 * 9 -> just above 0.5us 147 */ 148 #define DL_SCALE 10 149 150 /* 151 * Single value that denotes runtime == period, ie unlimited time. 152 */ 153 #define RUNTIME_INF ((u64)~0ULL) 154 155 static inline int idle_policy(int policy) 156 { 157 return policy == SCHED_IDLE; 158 } 159 static inline int fair_policy(int policy) 160 { 161 return policy == SCHED_NORMAL || policy == SCHED_BATCH; 162 } 163 164 static inline int rt_policy(int policy) 165 { 166 return policy == SCHED_FIFO || policy == SCHED_RR; 167 } 168 169 static inline int dl_policy(int policy) 170 { 171 return policy == SCHED_DEADLINE; 172 } 173 static inline bool valid_policy(int policy) 174 { 175 return idle_policy(policy) || fair_policy(policy) || 176 rt_policy(policy) || dl_policy(policy); 177 } 178 179 static inline int task_has_rt_policy(struct task_struct *p) 180 { 181 return rt_policy(p->policy); 182 } 183 184 static inline int task_has_dl_policy(struct task_struct *p) 185 { 186 return dl_policy(p->policy); 187 } 188 189 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) 190 191 /* 192 * !! 
For sched_setattr_nocheck() (kernel) only !! 193 * 194 * This is actually gross. :( 195 * 196 * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE 197 * tasks, but still be able to sleep. We need this on platforms that cannot 198 * atomically change clock frequency. Remove once fast switching will be 199 * available on such platforms. 200 * 201 * SUGOV stands for SchedUtil GOVernor. 202 */ 203 #define SCHED_FLAG_SUGOV 0x10000000 204 205 static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) 206 { 207 #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL 208 return unlikely(dl_se->flags & SCHED_FLAG_SUGOV); 209 #else 210 return false; 211 #endif 212 } 213 214 /* 215 * Tells if entity @a should preempt entity @b. 216 */ 217 static inline bool 218 dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) 219 { 220 return dl_entity_is_special(a) || 221 dl_time_before(a->deadline, b->deadline); 222 } 223 224 /* 225 * This is the priority-queue data structure of the RT scheduling class: 226 */ 227 struct rt_prio_array { 228 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ 229 struct list_head queue[MAX_RT_PRIO]; 230 }; 231 232 struct rt_bandwidth { 233 /* nests inside the rq lock: */ 234 raw_spinlock_t rt_runtime_lock; 235 ktime_t rt_period; 236 u64 rt_runtime; 237 struct hrtimer rt_period_timer; 238 unsigned int rt_period_active; 239 }; 240 241 void __dl_clear_params(struct task_struct *p); 242 243 /* 244 * To keep the bandwidth of -deadline tasks and groups under control 245 * we need some place where: 246 * - store the maximum -deadline bandwidth of the system (the group); 247 * - cache the fraction of that bandwidth that is currently allocated. 248 * 249 * This is all done in the data structure below. 
It is similar to the 250 * one used for RT-throttling (rt_bandwidth), with the main difference 251 * that, since here we are only interested in admission control, we 252 * do not decrease any runtime while the group "executes", neither we 253 * need a timer to replenish it. 254 * 255 * With respect to SMP, the bandwidth is given on a per-CPU basis, 256 * meaning that: 257 * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; 258 * - dl_total_bw array contains, in the i-eth element, the currently 259 * allocated bandwidth on the i-eth CPU. 260 * Moreover, groups consume bandwidth on each CPU, while tasks only 261 * consume bandwidth on the CPU they're running on. 262 * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw 263 * that will be shown the next time the proc or cgroup controls will 264 * be red. It on its turn can be changed by writing on its own 265 * control. 266 */ 267 struct dl_bandwidth { 268 raw_spinlock_t dl_runtime_lock; 269 u64 dl_runtime; 270 u64 dl_period; 271 }; 272 273 static inline int dl_bandwidth_enabled(void) 274 { 275 return sysctl_sched_rt_runtime >= 0; 276 } 277 278 struct dl_bw { 279 raw_spinlock_t lock; 280 u64 bw; 281 u64 total_bw; 282 }; 283 284 static inline void __dl_update(struct dl_bw *dl_b, s64 bw); 285 286 static inline 287 void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) 288 { 289 dl_b->total_bw -= tsk_bw; 290 __dl_update(dl_b, (s32)tsk_bw / cpus); 291 } 292 293 static inline 294 void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) 295 { 296 dl_b->total_bw += tsk_bw; 297 __dl_update(dl_b, -((s32)tsk_bw / cpus)); 298 } 299 300 static inline 301 bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) 302 { 303 return dl_b->bw != -1 && 304 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; 305 } 306 307 extern void dl_change_utilization(struct task_struct *p, u64 new_bw); 308 extern void init_dl_bw(struct dl_bw *dl_b); 309 extern int sched_dl_global_validate(void); 310 
/* Deadline-class helpers that are implemented outside this header: */
extern void sched_dl_do_global(void);
extern int  sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int  dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
extern int  dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern bool dl_cpu_busy(unsigned int cpu);

#ifdef CONFIG_CGROUP_SCHED

#include <linux/cgroup.h>

struct cfs_rq;
struct rt_rq;

extern struct list_head task_groups;

/* All members are compiled out unless CONFIG_CFS_BANDWIDTH is set. */
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
	raw_spinlock_t		lock;
	ktime_t			period;
	u64			quota;
	u64			runtime;
	s64			hierarchical_quota;
	u64			runtime_expires;
	int			expires_seq;

	short			idle;
	short			period_active;
	struct hrtimer		period_timer;
	struct hrtimer		slack_timer;
	struct list_head	throttled_cfs_rq;

	/* Statistics: */
	int			nr_periods;
	int			nr_throttled;
	u64			throttled_time;

	bool                    distribute_running;
#endif
};

/* Task group related information */
struct task_group {
	struct cgroup_subsys_state css;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* schedulable entities of this group on each CPU */
	struct sched_entity	**se;
	/* runqueue "owned" by this group on each CPU */
	struct cfs_rq		**cfs_rq;
	unsigned long		shares;

#ifdef	CONFIG_SMP
	/*
	 * load_avg can be heavily contended at clock tick time, so put
	 * it in its own cacheline separated from the fields above which
	 * will also be accessed at each tick.
*/
	atomic_long_t		load_avg ____cacheline_aligned;
#endif
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity	**rt_se;
	struct rt_rq		**rt_rq;

	struct rt_bandwidth	rt_bandwidth;
#endif

	struct rcu_head		rcu;
	struct list_head	list;

	struct task_group	*parent;
	struct list_head	siblings;
	struct list_head	children;

#ifdef CONFIG_SCHED_AUTOGROUP
	struct autogroup	*autogroup;
#endif

	struct cfs_bandwidth	cfs_bandwidth;
};

#ifdef CONFIG_FAIR_GROUP_SCHED
#define ROOT_TASK_GROUP_LOAD	NICE_0_LOAD

/*
 * A weight of 0 or 1 can cause arithmetic problems.
 * The weight of a cfs_rq is the sum of the weights of the entities
 * queued on it, so neither the weight of an entity nor the shares
 * value of a task group should be too large.
 * (The default weight is 1024 - so there's no practical
 *  limitation from this.)
 */
#define MIN_SHARES		(1UL <<  1)
#define MAX_SHARES		(1UL << 18)
#endif

/* Callback type used by the task-group tree walkers declared below. */
typedef int (*tg_visitor)(struct task_group *, void *);

extern int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data);

/*
 * Iterate the full tree, calling @down when first entering a node and @up when
 * leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
*/
static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
	/* The full-tree walk is the generic walk started at the root group. */
	return walk_tg_tree_from(&root_task_group, down, up, data);
}

extern int tg_nop(struct task_group *tg, void *data);

extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
			struct sched_entity *se, int cpu,
			struct sched_entity *parent);
extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);

extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);

extern void free_rt_sched_group(struct task_group *tg);
extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
		struct sched_rt_entity *rt_se, int cpu,
		struct sched_rt_entity *parent);
extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us);
extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us);
extern long sched_group_rt_runtime(struct task_group *tg);
extern long sched_group_rt_period(struct task_group *tg);
extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);

extern struct task_group *sched_create_group(struct task_group *parent);
extern void sched_online_group(struct task_group *tg,
			       struct task_group *parent);
extern void sched_destroy_group(struct task_group *tg);
extern void sched_offline_group(struct task_group *tg);

extern void sched_move_task(struct task_struct *tsk);

#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);

#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
			     struct cfs_rq *prev, struct cfs_rq *next);
#else /* !CONFIG_SMP */
static inline void set_task_rq_fair(struct sched_entity *se,
			     struct cfs_rq *prev, struct cfs_rq *next) { }
#endif /* CONFIG_SMP */
#endif /* CONFIG_FAIR_GROUP_SCHED */

#else /* CONFIG_CGROUP_SCHED */

/* Empty placeholder type when CONFIG_CGROUP_SCHED is not set. */
struct cfs_bandwidth { };

#endif	/* CONFIG_CGROUP_SCHED */

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight	load;
	unsigned long		runnable_weight;
	unsigned int		nr_running;
	unsigned int		h_nr_running;

	u64			exec_clock;
	u64			min_vruntime;
#ifndef CONFIG_64BIT
	u64			min_vruntime_copy;
#endif

	struct rb_root_cached	tasks_timeline;

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity	*curr;
	struct sched_entity	*next;
	struct sched_entity	*last;
	struct sched_entity	*skip;

#ifdef	CONFIG_SCHED_DEBUG
	unsigned int		nr_spread_over;
#endif

#ifdef CONFIG_SMP
	/*
	 * CFS load tracking
	 */
	struct sched_avg	avg;
#ifndef CONFIG_64BIT
	u64			load_last_update_time_copy;
#endif
	struct {
		raw_spinlock_t	lock ____cacheline_aligned;
		int		nr;
		unsigned long	load_avg;
		unsigned long	util_avg;
		unsigned long	runnable_sum;
	} removed;

#ifdef CONFIG_FAIR_GROUP_SCHED
	unsigned long		tg_load_avg_contrib;
	long			propagate;
	long			prop_runnable_sum;

	/*
	 *   h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
*/
	unsigned long		h_load;
	u64			last_h_load_update;
	struct sched_entity	*h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq		*rq;	/* CPU runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
	 * This list is used during load balance.
	 */
	int			on_list;
	struct list_head	leaf_cfs_rq_list;
	struct task_group	*tg;	/* group that "owns" this runqueue */

#ifdef CONFIG_CFS_BANDWIDTH
	int			runtime_enabled;
	int			expires_seq;
	u64			runtime_expires;
	s64			runtime_remaining;

	u64			throttled_clock;
	u64			throttled_clock_task;
	u64			throttled_clock_task_time;
	int			throttled;
	int			throttle_count;
	struct list_head	throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};

/* Non-zero when sysctl_sched_rt_runtime is not negative. */
static inline int rt_bandwidth_enabled(void)
{
	return sysctl_sched_rt_runtime >= 0;
}

/* RT IPI pull logic requires IRQ_WORK */
#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
# define HAVE_RT_PUSH_IPI
#endif

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	struct rt_prio_array	active;
	unsigned int		rt_nr_running;
	unsigned int		rr_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
	struct {
		int		curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
		int		next; /* next highest */
#endif
	} highest_prio;
#endif
#ifdef CONFIG_SMP
	unsigned long		rt_nr_migratory;
	unsigned long		rt_nr_total;
	int			overloaded;
	struct plist_head	pushable_tasks;

#endif /* CONFIG_SMP */
	int			rt_queued;

	int			rt_throttled;
	u64			rt_time;
	u64			rt_runtime;
	/* Nests inside the rq lock: */
	raw_spinlock_t		rt_runtime_lock;

#ifdef CONFIG_RT_GROUP_SCHED
	unsigned long		rt_nr_boosted;

	struct rq		*rq;
	struct task_group	*tg;
#endif
};

/* True iff both rt_queued and rt_nr_running of @rt_rq are non-zero. */
static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq)
{
	return rt_rq->rt_queued && rt_rq->rt_nr_running;
}

/* Deadline class' related fields in a runqueue */
struct dl_rq {
	/* runqueue is an rbtree, ordered by deadline */
	struct rb_root_cached	root;

	unsigned long		dl_nr_running;

#ifdef CONFIG_SMP
	/*
	 * Deadline values of the currently executing and the
	 * earliest ready task on this rq. Caching these facilitates
	 * the decision whether or not a ready but not running task
	 * should migrate somewhere else.
	 */
	struct {
		u64		curr;
		u64		next;
	} earliest_dl;

	unsigned long		dl_nr_migratory;
	int			overloaded;

	/*
	 * Tasks on this rq that can be pushed away. They are kept in
	 * an rb-tree, ordered by tasks' deadlines, with caching
	 * of the leftmost (earliest deadline) element.
	 */
	struct rb_root_cached	pushable_dl_tasks_root;
#else
	struct dl_bw		dl_bw;
#endif
	/*
	 * "Active utilization" for this runqueue: increased when a
	 * task wakes up (becomes TASK_RUNNING) and decreased when a
	 * task blocks
	 */
	u64			running_bw;

	/*
	 * Utilization of the tasks "assigned" to this runqueue (including
	 * the tasks that are in runqueue and the tasks that executed on this
	 * CPU and blocked). Increased when a task moves to this runqueue, and
	 * decreased when the task moves away (migrates, changes scheduling
	 * policy, or terminates).
	 * This is needed to compute the "inactive utilization" for the
	 * runqueue (inactive utilization = this_bw - running_bw).
668 */ 669 u64 this_bw; 670 u64 extra_bw; 671 672 /* 673 * Inverse of the fraction of CPU utilization that can be reclaimed 674 * by the GRUB algorithm. 675 */ 676 u64 bw_ratio; 677 }; 678 679 #ifdef CONFIG_FAIR_GROUP_SCHED 680 /* An entity is a task if it doesn't "own" a runqueue */ 681 #define entity_is_task(se) (!se->my_q) 682 #else 683 #define entity_is_task(se) 1 684 #endif 685 686 #ifdef CONFIG_SMP 687 /* 688 * XXX we want to get rid of these helpers and use the full load resolution. 689 */ 690 static inline long se_weight(struct sched_entity *se) 691 { 692 return scale_load_down(se->load.weight); 693 } 694 695 static inline long se_runnable(struct sched_entity *se) 696 { 697 return scale_load_down(se->runnable_weight); 698 } 699 700 static inline bool sched_asym_prefer(int a, int b) 701 { 702 return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); 703 } 704 705 /* 706 * We add the notion of a root-domain which will be used to define per-domain 707 * variables. Each exclusive cpuset essentially defines an island domain by 708 * fully partitioning the member CPUs from any other cpuset. Whenever a new 709 * exclusive cpuset is created, we also create and attach a new root-domain 710 * object. 711 * 712 */ 713 struct root_domain { 714 atomic_t refcount; 715 atomic_t rto_count; 716 struct rcu_head rcu; 717 cpumask_var_t span; 718 cpumask_var_t online; 719 720 /* Indicate more than one runnable task for any CPU */ 721 bool overload; 722 723 /* 724 * The bit corresponding to a CPU gets set here if such CPU has more 725 * than one runnable -deadline task (as it is below for RT tasks). 726 */ 727 cpumask_var_t dlo_mask; 728 atomic_t dlo_count; 729 struct dl_bw dl_bw; 730 struct cpudl cpudl; 731 732 #ifdef HAVE_RT_PUSH_IPI 733 /* 734 * For IPI pull requests, loop across the rto_mask. 
*/
	struct irq_work		rto_push_work;
	raw_spinlock_t		rto_lock;
	/* These are only updated and read within rto_lock */
	int			rto_loop;
	int			rto_cpu;
	/* These atomics are updated outside of a lock */
	atomic_t		rto_loop_next;
	atomic_t		rto_loop_start;
#endif
	/*
	 * The "RT overload" flag: it gets set if a CPU has more than
	 * one runnable RT task.
	 */
	cpumask_var_t		rto_mask;
	struct cpupri		cpupri;

	unsigned long		max_cpu_capacity;
};

extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;

extern void init_defrootdomain(void);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
extern void sched_get_rd(struct root_domain *rd);
extern void sched_put_rd(struct root_domain *rd);

/* Only declared when HAVE_RT_PUSH_IPI is defined. */
#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
#endif
#endif /* CONFIG_SMP */

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
	/* runqueue lock: */
	raw_spinlock_t		lock;

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
*/
	unsigned int		nr_running;
#ifdef CONFIG_NUMA_BALANCING
	unsigned int		nr_numa_running;
	unsigned int		nr_preferred_running;
	unsigned int		numa_migrate_on;
#endif
	/* Number of entries in cpu_load[] */
	#define CPU_LOAD_IDX_MAX 5
	unsigned long		cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
	unsigned long		last_load_update_tick;
	unsigned long		last_blocked_load_update_tick;
	unsigned int		has_blocked_load;
#endif /* CONFIG_SMP */
	unsigned int		nohz_tick_stopped;
	atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */

	/* capture load from *all* tasks on this CPU: */
	struct load_weight	load;
	unsigned long		nr_load_updates;
	u64			nr_switches;

	/* Per-class runqueues, embedded by value: */
	struct cfs_rq		cfs;
	struct rt_rq		rt;
	struct dl_rq		dl;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* list of leaf cfs_rq on this CPU: */
	struct list_head	leaf_cfs_rq_list;
	struct list_head	*tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU.
Always updated under the runqueue lock:
	 */
	unsigned long		nr_uninterruptible;

	struct task_struct	*curr;
	struct task_struct	*idle;
	struct task_struct	*stop;
	unsigned long		next_balance;
	struct mm_struct	*prev_mm;

	/* Holds the RQCF_* bits */
	unsigned int		clock_update_flags;
	u64			clock;
	u64			clock_task;

	atomic_t		nr_iowait;

#ifdef CONFIG_SMP
	struct root_domain	*rd;
	struct sched_domain	*sd;

	unsigned long		cpu_capacity;
	unsigned long		cpu_capacity_orig;

	/* Head of the callback list built by queue_balance_callback() */
	struct callback_head	*balance_callback;

	unsigned char		idle_balance;

	/* For active balancing */
	int			active_balance;
	int			push_cpu;
	struct cpu_stop_work	active_balance_work;

	/* CPU of this runqueue: */
	int			cpu;
	int			online;

	struct list_head cfs_tasks;

	struct sched_avg	avg_rt;
	struct sched_avg	avg_dl;
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
#define HAVE_SCHED_AVG_IRQ
	struct sched_avg	avg_irq;
#endif
	u64			idle_stamp;
	u64			avg_idle;

	/* This is used to determine avg_idle's max value */
	u64			max_idle_balance_cost;
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	u64			prev_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
	u64			prev_steal_time;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	u64			prev_steal_time_rq;
#endif

	/* calc_load related fields */
	unsigned long		calc_load_update;
	long			calc_load_active;

#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
	int			hrtick_csd_pending;
	call_single_data_t	hrtick_csd;
#endif
	struct hrtimer		hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info	rq_sched_info;
	unsigned long long	rq_cpu_time;
	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ?
*/

	/* sys_sched_yield() stats */
	unsigned int		yld_count;

	/* schedule() stats */
	unsigned int		sched_count;
	unsigned int		sched_goidle;

	/* try_to_wake_up() stats */
	unsigned int		ttwu_count;
	unsigned int		ttwu_local;
#endif

#ifdef CONFIG_SMP
	struct llist_head	wake_list;
#endif

#ifdef CONFIG_CPU_IDLE
	/* Must be inspected within a rcu lock section */
	struct cpuidle_state	*idle_state;
#endif
};

/* Returns rq->cpu on SMP builds and the constant 0 otherwise. */
static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}


#ifdef CONFIG_SCHED_SMT

extern struct static_key_false sched_smt_present;

extern void __update_idle_core(struct rq *rq);

/* Calls __update_idle_core() only when the sched_smt_present key is on. */
static inline void update_idle_core(struct rq *rq)
{
	if (static_branch_unlikely(&sched_smt_present))
		__update_idle_core(rq);
}

#else
/* !CONFIG_SCHED_SMT: empty stub */
static inline void update_idle_core(struct rq *rq) { }
#endif

DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

/* Accessors for the per-CPU 'runqueues' variable declared above: */
#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		this_cpu_ptr(&runqueues)
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
#define raw_rq()		raw_cpu_ptr(&runqueues)

/*
 * Reads rq->clock with READ_ONCE(). Unlike rq_clock() it asserts neither
 * that rq->lock is held nor that the clock has been updated.
 */
static inline u64 __rq_clock_broken(struct rq *rq)
{
	return READ_ONCE(rq->clock);
}

/*
 * rq::clock_update_flags bits
 *
 * %RQCF_REQ_SKIP - will request skipping of clock update on the next
 *  call to __schedule(). This is an optimisation to avoid
 *  neighbouring rq clock updates.
 *
 * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is
 *  in effect and calls to update_rq_clock() are being ignored.
 *
 * %RQCF_UPDATED - is a debug flag that indicates whether a call has been
 *  made to update_rq_clock() since the last time rq::lock was pinned.
*
 * If inside of __schedule(), clock_update_flags will have been
 * shifted left (a left shift is a cheap operation for the fast path
 * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use,
 *
 *	if (rq->clock_update_flags >= RQCF_UPDATED)
 *
 * to check if %RQCF_UPDATED is set. It'll never be shifted more than
 * one position though, because the next rq_unpin_lock() will shift it
 * back.
 */
#define RQCF_REQ_SKIP		0x01
#define RQCF_ACT_SKIP		0x02
#define RQCF_UPDATED		0x04

/* Warns (via SCHED_WARN_ON) when clock_update_flags is below RQCF_ACT_SKIP. */
static inline void assert_clock_updated(struct rq *rq)
{
	/*
	 * The only reason for not seeing a clock update since the
	 * last rq_pin_lock() is if we're currently skipping updates.
	 */
	SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
}

/*
 * Returns rq->clock. Asserts that rq->lock is held and that the clock
 * update check above passes.
 */
static inline u64 rq_clock(struct rq *rq)
{
	lockdep_assert_held(&rq->lock);
	assert_clock_updated(rq);

	return rq->clock;
}

/* Same as rq_clock(), but returns rq->clock_task. */
static inline u64 rq_clock_task(struct rq *rq)
{
	lockdep_assert_held(&rq->lock);
	assert_clock_updated(rq);

	return rq->clock_task;
}

/* Sets RQCF_REQ_SKIP in rq->clock_update_flags; rq->lock must be held. */
static inline void rq_clock_skip_update(struct rq *rq)
{
	lockdep_assert_held(&rq->lock);
	rq->clock_update_flags |= RQCF_REQ_SKIP;
}

/*
 * See rt task throttling, which is the only time a skip
 * request is cancelled.
 *
 * Clears RQCF_REQ_SKIP in rq->clock_update_flags; rq->lock must be held.
 */
static inline void rq_clock_cancel_skipupdate(struct rq *rq)
{
	lockdep_assert_held(&rq->lock);
	rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}

struct rq_flags {
	unsigned long flags;
	struct pin_cookie cookie;
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
	 * current pin context is stashed here in case it needs to be
	 * restored in rq_repin_lock().
*/
	unsigned int clock_update_flags;
#endif
};

/*
 * Pin rq->lock for lockdep and store the cookie in @rf. With
 * CONFIG_SCHED_DEBUG, also keep only the two SKIP bits in
 * rq->clock_update_flags and clear the copy held in @rf.
 */
static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
	rf->cookie = lockdep_pin_lock(&rq->lock);

#ifdef CONFIG_SCHED_DEBUG
	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
	rf->clock_update_flags = 0;
#endif
}

/*
 * Unpin rq->lock using the cookie in @rf. With CONFIG_SCHED_DEBUG,
 * first stash RQCF_UPDATED in @rf if rq->clock_update_flags is above
 * RQCF_ACT_SKIP.
 */
static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
{
#ifdef CONFIG_SCHED_DEBUG
	if (rq->clock_update_flags > RQCF_ACT_SKIP)
		rf->clock_update_flags = RQCF_UPDATED;
#endif

	lockdep_unpin_lock(&rq->lock, rf->cookie);
}

/* Re-pin rq->lock with the cookie previously stored in @rf. */
static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
{
	lockdep_repin_lock(&rq->lock, rf->cookie);

#ifdef CONFIG_SCHED_DEBUG
	/*
	 * Restore the value we stashed in @rf for this pin context.
	 */
	rq->clock_update_flags |= rf->clock_update_flags;
#endif
}

#ifdef CONFIG_NUMA
enum numa_topology_type {
	NUMA_DIRECT,
	NUMA_GLUELESS_MESH,
	NUMA_BACKPLANE,
};
extern enum numa_topology_type sched_numa_topology_type;
extern int sched_max_numa_distance;
extern bool find_numa_distance(int distance);
#endif

#ifdef CONFIG_NUMA
extern void sched_init_numa(void);
extern void sched_domains_numa_masks_set(unsigned int cpu);
extern void sched_domains_numa_masks_clear(unsigned int cpu);
#else
/* !CONFIG_NUMA: empty stubs */
static inline void sched_init_numa(void) { }
static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
#endif

#ifdef CONFIG_NUMA_BALANCING
/* The regions in numa_faults array from task_struct */
enum numa_faults_stats {
	NUMA_MEM = 0,
	NUMA_CPU,
	NUMA_MEMBUF,
	NUMA_CPUBUF
};
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
			int cpu, int scpu);
extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
#else
/* !CONFIG_NUMA_BALANCING: empty stub */
static inline void
init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
{
}
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_SMP

/*
 * Push @head, with @func as its callback, onto the front of
 * rq->balance_callback. Nothing is done if @head->next is already
 * non-NULL. rq->lock must be held (checked by lockdep).
 */
static inline void
queue_balance_callback(struct rq *rq,
		       struct callback_head *head,
		       void (*func)(struct rq *rq))
{
	lockdep_assert_held(&rq->lock);

	if (unlikely(head->next))
		return;

	/* @func takes a struct rq *, so it is cast to the callback_head type. */
	head->func = (void (*)(struct callback_head *))func;
	head->next = rq->balance_callback;
	rq->balance_callback = head;
}

extern void sched_ttwu_pending(void);

#define rcu_dereference_check_sched_domain(p) \
	rcu_dereference_check((p), \
			      lockdep_is_held(&sched_domains_mutex))

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
			__sd; __sd = __sd->parent)

/* Walks from @sd down through ->child; @sd itself is advanced by the loop. */
#define for_each_lower_domain(sd) for (; sd; sd = sd->child)

/**
 * highest_flag_domain - Return highest sched_domain containing flag.
 * @cpu:	The CPU whose highest level of sched domain is to
 *		be returned.
 * @flag:	The flag to check for the highest sched_domain
 *		for the given CPU.
 *
 * Returns the highest sched_domain of a CPU which contains the given flag.
1159 */ 1160 static inline struct sched_domain *highest_flag_domain(int cpu, int flag) 1161 { 1162 struct sched_domain *sd, *hsd = NULL; 1163 1164 for_each_domain(cpu, sd) { 1165 if (!(sd->flags & flag)) 1166 break; 1167 hsd = sd; 1168 } 1169 1170 return hsd; 1171 } 1172 1173 static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) 1174 { 1175 struct sched_domain *sd; 1176 1177 for_each_domain(cpu, sd) { 1178 if (sd->flags & flag) 1179 break; 1180 } 1181 1182 return sd; 1183 } 1184 1185 DECLARE_PER_CPU(struct sched_domain *, sd_llc); 1186 DECLARE_PER_CPU(int, sd_llc_size); 1187 DECLARE_PER_CPU(int, sd_llc_id); 1188 DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); 1189 DECLARE_PER_CPU(struct sched_domain *, sd_numa); 1190 DECLARE_PER_CPU(struct sched_domain *, sd_asym); 1191 1192 struct sched_group_capacity { 1193 atomic_t ref; 1194 /* 1195 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity 1196 * for a single CPU. 1197 */ 1198 unsigned long capacity; 1199 unsigned long min_capacity; /* Min per-CPU capacity in group */ 1200 unsigned long next_update; 1201 int imbalance; /* XXX unrelated to capacity but shared group state */ 1202 1203 #ifdef CONFIG_SCHED_DEBUG 1204 int id; 1205 #endif 1206 1207 unsigned long cpumask[0]; /* Balance mask */ 1208 }; 1209 1210 struct sched_group { 1211 struct sched_group *next; /* Must be a circular list */ 1212 atomic_t ref; 1213 1214 unsigned int group_weight; 1215 struct sched_group_capacity *sgc; 1216 int asym_prefer_cpu; /* CPU of highest priority in group */ 1217 1218 /* 1219 * The CPUs this group covers. 1220 * 1221 * NOTE: this field is variable length. 
(Allocated dynamically 1222 * by attaching extra space to the end of the structure, 1223 * depending on how many CPUs the kernel has booted up with) 1224 */ 1225 unsigned long cpumask[0]; 1226 }; 1227 1228 static inline struct cpumask *sched_group_span(struct sched_group *sg) 1229 { 1230 return to_cpumask(sg->cpumask); 1231 } 1232 1233 /* 1234 * See build_balance_mask(). 1235 */ 1236 static inline struct cpumask *group_balance_mask(struct sched_group *sg) 1237 { 1238 return to_cpumask(sg->sgc->cpumask); 1239 } 1240 1241 /** 1242 * group_first_cpu - Returns the first CPU in the cpumask of a sched_group. 1243 * @group: The group whose first CPU is to be returned. 1244 */ 1245 static inline unsigned int group_first_cpu(struct sched_group *group) 1246 { 1247 return cpumask_first(sched_group_span(group)); 1248 } 1249 1250 extern int group_balance_cpu(struct sched_group *sg); 1251 1252 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 1253 void register_sched_domain_sysctl(void); 1254 void dirty_sched_domain_sysctl(int cpu); 1255 void unregister_sched_domain_sysctl(void); 1256 #else 1257 static inline void register_sched_domain_sysctl(void) 1258 { 1259 } 1260 static inline void dirty_sched_domain_sysctl(int cpu) 1261 { 1262 } 1263 static inline void unregister_sched_domain_sysctl(void) 1264 { 1265 } 1266 #endif 1267 1268 #else 1269 1270 static inline void sched_ttwu_pending(void) { } 1271 1272 #endif /* CONFIG_SMP */ 1273 1274 #include "stats.h" 1275 #include "autogroup.h" 1276 1277 #ifdef CONFIG_CGROUP_SCHED 1278 1279 /* 1280 * Return the group to which this tasks belongs. 1281 * 1282 * We cannot use task_css() and friends because the cgroup subsystem 1283 * changes that value before the cgroup_subsys::attach() method is called, 1284 * therefore we cannot pin it and might observe the wrong value. 1285 * 1286 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup 1287 * core changes this before calling sched_move_task(). 
 *
 * Instead we use a 'copy' which is updated from sched_move_task() while
 * holding both task_struct::pi_lock and rq::lock.
 */
static inline struct task_group *task_group(struct task_struct *p)
{
	return p->sched_task_group;
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
	struct task_group *tg = task_group(p);
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* Called with the old cfs_rq before p->se.cfs_rq is switched over */
	set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
	p->se.cfs_rq = tg->cfs_rq[cpu];
	p->se.parent = tg->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	p->rt.rt_rq  = tg->rt_rq[cpu];
	p->rt.parent = tg->rt_se[cpu];
#endif
}

#else /* CONFIG_CGROUP_SCHED */

/* Without CONFIG_CGROUP_SCHED there are no task groups: no-op / NULL */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
	return NULL;
}

#endif /* CONFIG_CGROUP_SCHED */

/*
 * Point @p at the group runqueues of @cpu and, on SMP, record @cpu as the
 * task's CPU and wake_cpu. The per-task updates are ordered before the CPU
 * store by smp_wmb().
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
	 * successfully executed on another CPU. We must ensure that updates of
	 * per-task data have been completed by this moment.
	 */
	smp_wmb();
#ifdef CONFIG_THREAD_INFO_IN_TASK
	p->cpu = cpu;
#else
	task_thread_info(p)->cpu = cpu;
#endif
	p->wake_cpu = cpu;
#endif
}

/*
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 */
#ifdef CONFIG_SCHED_DEBUG
# include <linux/static_key.h>
# define const_debug __read_mostly
#else
# define const_debug const
#endif

/* First expansion of features.h: one __SCHED_FEAT_<name> enumerator per feature */
#define SCHED_FEAT(name, enabled)	\
	__SCHED_FEAT_##name ,

enum {
#include "features.h"
	__SCHED_FEAT_NR,
};

#undef SCHED_FEAT

#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)

/*
 * To support run-time toggling of sched features, all the translation units
 * (but core.c) reference the sysctl_sched_features defined in core.c.
 */
extern const_debug unsigned int sysctl_sched_features;

#define SCHED_FEAT(name, enabled)					\
static __always_inline bool static_branch_##name(struct static_key *key) \
{									\
	return static_key_##enabled(key);				\
}

#include "features.h"
#undef SCHED_FEAT

extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))

#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */

/*
 * Each translation unit has its own copy of sysctl_sched_features to allow
 * constants propagation at compile time and compiler optimization based on
 * features default.
 */
#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |
static const_debug __maybe_unused unsigned int sysctl_sched_features =
#include "features.h"
	0;
#undef SCHED_FEAT

/* Test the feature's bit in this translation unit's sysctl_sched_features */
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */

extern struct static_key_false sched_numa_balancing;
extern struct static_key_false sched_schedstats;

/* sysctl_sched_rt_period multiplied by NSEC_PER_USEC, as a u64 */
static inline u64 global_rt_period(void)
{
	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
}

/*
 * sysctl_sched_rt_runtime multiplied by NSEC_PER_USEC, or RUNTIME_INF when
 * the sysctl value is negative.
 */
static inline u64 global_rt_runtime(void)
{
	if (sysctl_sched_rt_runtime < 0)
		return RUNTIME_INF;

	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}

/* True when @p is rq->curr */
static inline int task_current(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}

/* On SMP this is p->on_cpu; otherwise it is task_current() */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
	return p->on_cpu;
#else
	return task_current(rq, p);
#endif
}

/* True when p->on_rq is exactly TASK_ON_RQ_QUEUED */
static inline int task_on_rq_queued(struct task_struct *p)
{
	return p->on_rq == TASK_ON_RQ_QUEUED;
}

/* True when p->on_rq is exactly TASK_ON_RQ_MIGRATING */
static inline int task_on_rq_migrating(struct task_struct *p)
{
	return p->on_rq == TASK_ON_RQ_MIGRATING;
}

/*
 * wake flags
 */
#define WF_SYNC			0x01		/* Waker goes to sleep after wakeup */
#define WF_FORK			0x02		/* Child wakeup after fork */
#define WF_MIGRATED		0x4		/* Internal use, task got migrated */

/*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 * of tasks with abnormal "nice" values across CPUs the contribution that
 * each task makes to its run queue's load is weighted according to its
 * scheduling class and "nice" value.
 * For SCHED_NORMAL tasks this is just a
 * scaled version of the new time slice allocation that they receive on time
 * slice expiry etc.
 */

#define WEIGHT_IDLEPRIO		3
#define WMULT_IDLEPRIO		1431655765

extern const int		sched_prio_to_weight[40];
extern const u32		sched_prio_to_wmult[40];

/*
 * {de,en}queue flags:
 *
 * DEQUEUE_SLEEP  - task is no longer runnable
 * ENQUEUE_WAKEUP - task just became runnable
 *
 * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
 *                are in a known state which allows modification. Such pairs
 *                should preserve as much state as possible.
 *
 * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
 *        in the runqueue.
 *
 * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
 * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
 * ENQUEUE_MIGRATED  - the task was migrated during wakeup
 *
 */

#define DEQUEUE_SLEEP		0x01
#define DEQUEUE_SAVE		0x02 /* Matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE		0x04 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK		0x08 /* Matches ENQUEUE_NOCLOCK */

#define ENQUEUE_WAKEUP		0x01
#define ENQUEUE_RESTORE		0x02
#define ENQUEUE_MOVE		0x04
#define ENQUEUE_NOCLOCK		0x08

#define ENQUEUE_HEAD		0x10
#define ENQUEUE_REPLENISH	0x20
/* ENQUEUE_MIGRATED is 0 on !SMP, so OR-ing it in is a no-op there */
#ifdef CONFIG_SMP
#define ENQUEUE_MIGRATED	0x40
#else
#define ENQUEUE_MIGRATED	0x00
#endif

/* Sentinel pointer value; see the pick_next_task() comment below */
#define RETRY_TASK		((void *)-1UL)

/*
 * Method table of a scheduling class. Classes are chained through ->next
 * (see for_each_class()).
 */
struct sched_class {
	const struct sched_class *next;

	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
	void (*yield_task)   (struct rq *rq);
	bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);

	void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);

	/*
	 * It is the responsibility of the pick_next_task() method that will
	 * return the next task to call put_prev_task() on the @prev task or
	 * something equivalent.
	 *
	 * May return RETRY_TASK when it finds a higher prio class has runnable
	 * tasks.
	 */
	struct task_struct * (*pick_next_task)(struct rq *rq,
					       struct task_struct *prev,
					       struct rq_flags *rf);
	void (*put_prev_task)(struct rq *rq, struct task_struct *p);

#ifdef CONFIG_SMP
	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
	void (*migrate_task_rq)(struct task_struct *p, int new_cpu);

	void (*task_woken)(struct rq *this_rq, struct task_struct *task);

	void (*set_cpus_allowed)(struct task_struct *p,
				 const struct cpumask *newmask);

	void (*rq_online)(struct rq *rq);
	void (*rq_offline)(struct rq *rq);
#endif

	void (*set_curr_task)(struct rq *rq);
	void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
	void (*task_fork)(struct task_struct *p);
	void (*task_dead)(struct task_struct *p);

	/*
	 * The switched_from() call is allowed to drop rq->lock, therefore we
	 * cannot assume the switched_from/switched_to pair is serialized by
	 * rq->lock. They are however serialized by p->pi_lock.
	 */
	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
	void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
			      int oldprio);

	unsigned int (*get_rr_interval)(struct rq *rq,
					struct task_struct *task);

	void (*update_curr)(struct rq *rq);

/* Values for the @type argument of task_change_group() */
#define TASK_SET_GROUP		0
#define TASK_MOVE_GROUP		1

#ifdef CONFIG_FAIR_GROUP_SCHED
	void (*task_change_group)(struct task_struct *p, int type);
#endif
};

/* Dispatch to the put_prev_task() method of @prev's scheduling class */
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
	prev->sched_class->put_prev_task(rq, prev);
}

/* Dispatch to the set_curr_task() method of @curr's scheduling class */
static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
{
	curr->sched_class->set_curr_task(rq);
}

/* Head of the class list: the stop class on SMP, the deadline class otherwise */
#ifdef CONFIG_SMP
#define sched_class_highest (&stop_sched_class)
#else
#define sched_class_highest (&dl_sched_class)
#endif
#define for_each_class(class) \
   for (class = sched_class_highest; class; class = class->next)

extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;


#ifdef CONFIG_SMP

extern void update_group_capacity(struct sched_domain *sd, int cpu);

extern void trigger_load_balance(struct rq *rq);

extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);

#endif

#ifdef CONFIG_CPU_IDLE
/* Record @idle_state in rq->idle_state */
static inline void idle_set_state(struct rq *rq,
				  struct cpuidle_state *idle_state)
{
	rq->idle_state = idle_state;
}

/*
 * Return rq->idle_state. Warns (CONFIG_SCHED_DEBUG) when called outside an
 * RCU read-side critical section.
 */
static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
	SCHED_WARN_ON(!rcu_read_lock_held());

	return rq->idle_state;
}
#else
/* Without CONFIG_CPU_IDLE no idle state is tracked: no-op / NULL */
static inline void idle_set_state(struct rq *rq,
				  struct cpuidle_state *idle_state)
{
}

static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
	return NULL;
}
#endif

extern void schedule_idle(void);

extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
extern void update_max_interval(void);

extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);

extern void reweight_task(struct task_struct *p, int prio);

extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);

extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);

extern struct dl_bandwidth def_dl_bandwidth;
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);

#define BW_SHIFT		20
#define BW_UNIT			(1 << BW_SHIFT)
#define RATIO_SHIFT		8
unsigned long to_ratio(u64 period, u64 runtime);

extern void init_entity_runnable_average(struct sched_entity *se);
extern void post_init_entity_util_avg(struct sched_entity *se);

#ifdef CONFIG_NO_HZ_FULL
extern bool sched_can_stop_tick(struct rq *rq);
extern int __init sched_tick_offload_init(void);

/*
 * Tick may be needed by tasks in the runqueue depending on their policy and
 * requirements. If tick is needed, let's send the target an IPI to kick it out of
 * nohz mode if necessary.
 */
static inline void sched_update_tick_dependency(struct rq *rq)
{
	int cpu;

	if (!tick_nohz_full_enabled())
		return;

	cpu = cpu_of(rq);

	/* Only nohz_full CPUs carry the scheduler tick dependency */
	if (!tick_nohz_full_cpu(cpu))
		return;

	if (sched_can_stop_tick(rq))
		tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
	else
		tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
}
#else
/* Without CONFIG_NO_HZ_FULL: nothing to initialise or update */
static inline int sched_tick_offload_init(void) { return 0; }
static inline void sched_update_tick_dependency(struct rq *rq) { }
#endif

/*
 * Add @count to rq->nr_running. When the count goes from below 2 to 2 or
 * more, the root domain is flagged as overloaded (SMP only). The tick
 * dependency is re-evaluated afterwards.
 */
static inline void add_nr_running(struct rq *rq, unsigned count)
{
	unsigned prev_nr = rq->nr_running;

	rq->nr_running = prev_nr + count;

	if (prev_nr < 2 && rq->nr_running >= 2) {
#ifdef CONFIG_SMP
		/* Only write the flag when it is not already set */
		if (!rq->rd->overload)
			rq->rd->overload = true;
#endif
	}

	sched_update_tick_dependency(rq);
}

/*
 * Subtract @count from rq->nr_running and re-evaluate the tick dependency.
 */
static inline void sub_nr_running(struct rq *rq, unsigned count)
{
	rq->nr_running -= count;
	/* Check if we still need preemption */
	sched_update_tick_dependency(rq);
}

extern void update_rq_clock(struct rq *rq);

extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);

extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);

extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;

#ifdef CONFIG_SCHED_HRTICK

/*
 * Use hrtick when:
 *  - enabled by features
 *  - hrtimer is actually high res
 */
static inline int hrtick_enabled(struct rq *rq)
{
	if (!sched_feat(HRTICK))
		return 0;
	/* Also refuse when the runqueue's CPU is not active */
	if (!cpu_active(cpu_of(rq)))
		return 0;
	return hrtimer_is_hres_active(&rq->hrtick_timer);
}

void hrtick_start(struct rq *rq, u64 delay);

#else

/* Without CONFIG_SCHED_HRTICK the hrtick is never enabled */
static inline int hrtick_enabled(struct rq *rq)
{
	return 0;
}

#endif /* CONFIG_SCHED_HRTICK */

/* Default used when the architecture does not define arch_scale_freq_capacity */
#ifndef arch_scale_freq_capacity
static __always_inline
unsigned long arch_scale_freq_capacity(int cpu)
{
	return SCHED_CAPACITY_SCALE;
}
#endif

#ifdef CONFIG_SMP
#ifndef arch_scale_cpu_capacity
/*
 * Default CPU capacity: for a domain with SD_SHARE_CPUCAPACITY spanning more
 * than one CPU, sd->smt_gain divided by sd->span_weight; otherwise
 * SCHED_CAPACITY_SCALE.
 */
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
	if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
		return sd->smt_gain / sd->span_weight;

	return SCHED_CAPACITY_SCALE;
}
#endif
#else
#ifndef arch_scale_cpu_capacity
/* !SMP default: @sd is unused, capacity is always SCHED_CAPACITY_SCALE */
static __always_inline
unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
{
	return SCHED_CAPACITY_SCALE;
}
#endif
#endif

struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(rq->lock);

struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(p->pi_lock)
	__acquires(rq->lock);

/* Unpin and release rq->lock */
static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
	__releases(rq->lock)
{
	rq_unpin_lock(rq, rf);
	raw_spin_unlock(&rq->lock);
}

/*
 * Unpin and release rq->lock, then release p->pi_lock and restore the IRQ
 * state saved in rf->flags.
 */
static inline void
task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
	__releases(rq->lock)
	__releases(p->pi_lock)
{
	rq_unpin_lock(rq, rf);
	raw_spin_unlock(&rq->lock);
	raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}

/* Take rq->lock, saving the IRQ state in rf->flags, and pin it */
static inline void
rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
	__acquires(rq->lock)
{
	raw_spin_lock_irqsave(&rq->lock, rf->flags);
	rq_pin_lock(rq, rf);
}

/* Take rq->lock with raw_spin_lock_irq() and pin it */
static inline void
rq_lock_irq(struct rq *rq, struct rq_flags *rf)
	__acquires(rq->lock)
{
	raw_spin_lock_irq(&rq->lock);
	rq_pin_lock(rq, rf);
}

/* Take rq->lock and pin it; the IRQ state is left untouched */
static inline void
rq_lock(struct rq *rq, struct rq_flags *rf)
	__acquires(rq->lock)
{
	raw_spin_lock(&rq->lock);
	rq_pin_lock(rq, rf);
}

/* Take rq->lock again and re-pin it with the state already held in @rf */
static inline void
rq_relock(struct rq *rq, struct rq_flags *rf)
	__acquires(rq->lock)
{
	raw_spin_lock(&rq->lock);
	rq_repin_lock(rq, rf);
}

/* Unpin rq->lock, release it and restore the IRQ state from rf->flags */
static inline void
rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
	__releases(rq->lock)
{
	rq_unpin_lock(rq, rf);
	raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
}

/* Unpin rq->lock and release it with raw_spin_unlock_irq() */
static inline void
rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
	__releases(rq->lock)
{
	rq_unpin_lock(rq, rf);
	raw_spin_unlock_irq(&rq->lock);
}

/* Unpin rq->lock and release it; the IRQ state is left untouched */
static inline void
rq_unlock(struct rq *rq, struct rq_flags *rf)
	__releases(rq->lock)
{
	rq_unpin_lock(rq, rf);
	raw_spin_unlock(&rq->lock);
}

#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT

static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);

/*
 * fair double_lock_balance: Safely acquires both rq->locks in a fair
 * way at the expense of forcing extra atomic operations in all
 * invocations.  This assures that the double_lock is acquired using the
 * same underlying policy as the spinlock_t on this architecture, which
 * reduces latency compared to the unfair variant below.  However, it
 * also adds more overhead and therefore may reduce throughput.
 */
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	/* this_rq->lock is always dropped here, hence the constant return of 1 */
	raw_spin_unlock(&this_rq->lock);
	double_rq_lock(this_rq, busiest);

	return 1;
}

#else
/*
 * Unfair double_lock_balance: Optimizes throughput at the expense of
 * latency by eliminating extra atomic operations when the locks are
 * already in proper order on entry.  This favors lower CPU-ids and will
 * grant the double lock to lower CPUs over higher ids under contention,
 * regardless of entry order into the function.
 *
 * Returns 1 when this_rq->lock was dropped and re-taken, 0 otherwise.
 */
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	int ret = 0;

	if (unlikely(!raw_spin_trylock(&busiest->lock))) {
		if (busiest < this_rq) {
			/* Wrong address order: drop ours and take both, lower first */
			raw_spin_unlock(&this_rq->lock);
			raw_spin_lock(&busiest->lock);
			raw_spin_lock_nested(&this_rq->lock,
					      SINGLE_DEPTH_NESTING);
			ret = 1;
		} else
			raw_spin_lock_nested(&busiest->lock,
					      SINGLE_DEPTH_NESTING);
	}
	return ret;
}

#endif /* CONFIG_PREEMPT */

/*
 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 */
static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
	if (unlikely(!irqs_disabled())) {
		/* printk() doesn't work well under rq->lock */
		raw_spin_unlock(&this_rq->lock);
		BUG_ON(1);
	}

	return _double_lock_balance(this_rq, busiest);
}

/*
 * Release busiest->lock and reset the lockdep subclass of this_rq->lock to 0.
 * this_rq->lock itself stays held.
 */
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(busiest->lock)
{
	raw_spin_unlock(&busiest->lock);
	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}

/*
 * Take two spinlocks, lower address first; the second one is taken with
 * SINGLE_DEPTH_NESTING.
 */
static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
{
	if (l1 > l2)
		swap(l1, l2);

	spin_lock(l1);
	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}

/*
 * Same as double_lock(), but the first lock is taken with spin_lock_irq().
 */
static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
{
	if (l1 > l2)
		swap(l1, l2);

	spin_lock_irq(l1);
	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}

/*
 * Same as double_lock(), for raw spinlocks.
 */
static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
{
	if (l1 > l2)
		swap(l1, l2);

	raw_spin_lock(l1);
	raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}

/*
 * double_rq_lock - safely lock two runqueues
 *
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 */
static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
	__acquires(rq1->lock)
	__acquires(rq2->lock)
{
	BUG_ON(!irqs_disabled());
	if (rq1 == rq2) {
		/* Same runqueue: take the lock once, annotate the second */
		raw_spin_lock(&rq1->lock);
		__acquire(rq2->lock);	/* Fake it out ;) */
	} else {
		/* Different runqueues: always lock the lower address first */
		if (rq1 < rq2) {
			raw_spin_lock(&rq1->lock);
			raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
		} else {
			raw_spin_lock(&rq2->lock);
			raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
		}
	}
}

/*
 * double_rq_unlock - safely unlock two runqueues
 *
 * Note this does not restore interrupts like task_rq_unlock,
 * you need to do so manually after calling.
 */
static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
	__releases(rq1->lock)
	__releases(rq2->lock)
{
	raw_spin_unlock(&rq1->lock);
	/* When both are the same runqueue only one real lock is held */
	if (rq1 != rq2)
		raw_spin_unlock(&rq2->lock);
	else
		__release(rq2->lock);
}

extern void set_rq_online (struct rq *rq);
extern void set_rq_offline(struct rq *rq);
extern bool sched_smp_initialized;

#else /* CONFIG_SMP */

/*
 * double_rq_lock - safely lock two runqueues
 *
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 */
static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
	__acquires(rq1->lock)
	__acquires(rq2->lock)
{
	BUG_ON(!irqs_disabled());
	/* On !SMP both arguments must be the same runqueue */
	BUG_ON(rq1 != rq2);
	raw_spin_lock(&rq1->lock);
	__acquire(rq2->lock);	/* Fake it out ;) */
}

/*
 * double_rq_unlock - safely unlock two runqueues
 *
 * Note this does not restore interrupts like task_rq_unlock,
 * you need to do so manually after calling.
 */
static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
	__releases(rq1->lock)
	__releases(rq2->lock)
{
	/* On !SMP both arguments must be the same runqueue */
	BUG_ON(rq1 != rq2);
	raw_spin_unlock(&rq1->lock);
	__release(rq2->lock);
}

#endif

extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);

#ifdef	CONFIG_SCHED_DEBUG
extern bool sched_debug_enabled;

extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
#ifdef CONFIG_NUMA_BALANCING
extern void
show_numa_stats(struct task_struct *p, struct seq_file *m);
extern void
print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
	unsigned long tpf, unsigned long gsf, unsigned long gpf);
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */

extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq);
extern void init_dl_rq(struct dl_rq *dl_rq);

extern void cfs_bandwidth_usage_inc(void);
extern void cfs_bandwidth_usage_dec(void);

#ifdef CONFIG_NO_HZ_COMMON
#define NOHZ_BALANCE_KICK_BIT	0
#define NOHZ_STATS_KICK_BIT	1

#define NOHZ_BALANCE_KICK	BIT(NOHZ_BALANCE_KICK_BIT)
#define NOHZ_STATS_KICK		BIT(NOHZ_STATS_KICK_BIT)

#define NOHZ_KICK_MASK	(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)

/* Address of the nohz_flags word of @cpu's runqueue */
#define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)

extern void nohz_balance_exit_idle(struct rq *rq);
#else
static inline void nohz_balance_exit_idle(struct rq *rq) { }
#endif


#ifdef CONFIG_SMP
/*
 * Add @bw to dl.extra_bw of the runqueue of every CPU that is both in the
 * root domain's span and in cpu_active_mask.
 *
 * @dl_b must be the dl_bw member embedded in a struct root_domain, since the
 * root domain is recovered with container_of().
 */
static inline
void __dl_update(struct dl_bw *dl_b, s64 bw)
{
	struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
	int i;

	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
			 "sched RCU must be held");
	for_each_cpu_and(i, rd->span, cpu_active_mask) {
		struct rq *rq = cpu_rq(i);

		rq->dl.extra_bw += bw;
	}
}
#else
/*
 * !SMP variant: @dl_b must be the dl_bw member embedded in a struct dl_rq;
 * @bw is added to that dl_rq's extra_bw.
 */
static inline
void __dl_update(struct dl_bw *dl_b, s64 bw)
{
	struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);

	dl->extra_bw += bw;
}
#endif


#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
	u64			total;
	u64			tick_delta;
	u64			irq_start_time;
	struct u64_stats_sync	sync;
};

DECLARE_PER_CPU(struct irqtime, cpu_irqtime);

/*
 * Returns the irqtime minus the softirq time computed by ksoftirqd.
 * Otherwise ksoftirqd's sum_exec_runtime is subtracted its own runtime
 * and never moves forward.
 */
static inline u64 irq_time_read(int cpu)
{
	struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
	unsigned int seq;
	u64 total;

	/* Retry the read until the u64_stats sequence reports it consistent */
	do {
		seq = __u64_stats_fetch_begin(&irqtime->sync);
		total = irqtime->total;
	} while (__u64_stats_fetch_retry(&irqtime->sync, seq));

	return total;
}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

#ifdef CONFIG_CPU_FREQ
DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);

/**
 * cpufreq_update_util - Take a note about CPU utilization changes.
 * @rq: Runqueue to carry out the update for.
 * @flags: Update reason flags.
 *
 * This function is called by the scheduler on the CPU whose utilization is
 * being updated.
 *
 * It can only be called from RCU-sched read-side critical sections.
 *
 * The way cpufreq is currently arranged requires it to evaluate the CPU
 * performance state (frequency/voltage) on a regular basis to prevent it from
 * being stuck in a completely inadequate performance level for too long.
 * That is not guaranteed to happen if the updates are only triggered from CFS
 * and DL, though, because they may not be coming in if only RT tasks are
 * active all the time (or there are RT tasks only).
 *
 * As a workaround for that issue, this function is called periodically by the
 * RT sched class to trigger extra cpufreq updates to prevent it from stalling,
 * but that really is a band-aid.  Going forward it should be replaced with
 * solutions targeted more specifically at RT tasks.
 */
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
	struct update_util_data *data;

	data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
						  cpu_of(rq)));
	/* Nothing to call when no hook is installed for this CPU */
	if (data)
		data->func(data, rq_clock(rq), flags);
}
#else
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */

/*
 * arch_scale_freq_invariant() defaults to true when the architecture defines
 * arch_scale_freq_capacity (unless it supplies its own), and to false
 * otherwise.
 */
#ifdef arch_scale_freq_capacity
# ifndef arch_scale_freq_invariant
#  define arch_scale_freq_invariant()	true
# endif
#else
# define arch_scale_freq_invariant()	false
#endif

#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
/* rq->dl.running_bw rescaled: multiplied by SCHED_CAPACITY_SCALE, shifted right by BW_SHIFT */
static inline unsigned long cpu_bw_dl(struct rq *rq)
{
	return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
}

/* Snapshot of rq->avg_dl.util_avg */
static inline unsigned long cpu_util_dl(struct rq *rq)
{
	return READ_ONCE(rq->avg_dl.util_avg);
}

/*
 * Snapshot of rq->cfs.avg.util_avg; with the UTIL_EST feature enabled, the
 * larger of that and rq->cfs.avg.util_est.enqueued.
 */
static inline unsigned long cpu_util_cfs(struct rq *rq)
{
	unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);

	if (sched_feat(UTIL_EST)) {
		util = max_t(unsigned long, util,
			     READ_ONCE(rq->cfs.avg.util_est.enqueued));
	}

	return util;
}

/* Snapshot of rq->avg_rt.util_avg */
static inline unsigned long cpu_util_rt(struct rq *rq)
{
	return READ_ONCE(rq->avg_rt.util_avg);
}
#endif

#ifdef HAVE_SCHED_AVG_IRQ
/* Current rq->avg_irq.util_avg (plain read, no READ_ONCE()) */
static inline unsigned long cpu_util_irq(struct rq *rq)
{
	return rq->avg_irq.util_avg;
}

/*
 * Return util * (max - irq) / max.
 *
 * NOTE(review): there is no guard against max == 0 or irq > max here;
 * presumably callers guarantee both - confirm against the call sites.
 */
static inline
unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
{
	util *= (max - irq);
	util /= max;

	return util;

}
#else
/* Without HAVE_SCHED_AVG_IRQ: no IRQ utilization, @util is returned unscaled */
static inline unsigned long cpu_util_irq(struct rq *rq)
{
	return 0;
}

static inline
unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
{
	return util;
}
#endif