// SPDX-License-Identifier: GPL-2.0
/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 *  Interactivity improvements by Mike Galbraith
 *  (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 *  Various enhancements by Dmitry Adamushko.
 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 *  Group scheduling enhancements by Srivatsa Vaddagiri
 *  Copyright IBM Corporation, 2007
 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 *  Scaled math optimizations by Thomas Gleixner
 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 */
#include <linux/energy_model.h>
#include <linux/mmap_lock.h>
#include <linux/hugetlb_inline.h>
#include <linux/jiffies.h>
#include <linux/mm_api.h>
#include <linux/highmem.h>
#include <linux/spinlock_api.h>
#include <linux/cpumask_api.h>
#include <linux/lockdep_api.h>
#include <linux/softirq.h>
#include <linux/refcount_api.h>
#include <linux/topology.h>
#include <linux/sched/clock.h>
#include <linux/sched/cond_resched.h>
#include <linux/sched/cputime.h>
#include <linux/sched/isolation.h>
#include <linux/sched/nohz.h>

#include <linux/cpuidle.h>
#include <linux/interrupt.h>
#include <linux/memory-tiers.h>
#include <linux/mempolicy.h>
#include <linux/mutex_api.h>
#include <linux/profile.h>
#include <linux/psi.h>
#include <linux/ratelimit.h>
#include <linux/task_work.h>
#include <linux/rbtree_augmented.h>

#include <asm/switch_to.h>

#include "sched.h"
#include "stats.h"
#include "autogroup.h"

/*
 * The initial- and re-scaling of tunables is configurable
 *
 * Options are:
 *
 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
 *   SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
 *   SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
 *
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
 */
unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 *
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_base_slice			= 750000ULL;
static unsigned int normalized_sysctl_sched_base_slice	= 750000ULL;

const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;

static int __init setup_sched_thermal_decay_shift(char *str)
{
	pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
	return 1;
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);

#ifdef CONFIG_SMP
/*
 * For asym packing, by default the lower numbered CPU has higher priority.
 */
int __weak arch_asym_cpu_priority(int cpu)
{
	return -cpu;
}

/*
 * The margin used when comparing utilization with CPU capacity.
 *
 * (default: ~20%)
 */
#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)

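/*
 * For illustration of the ~20% margin above: a utilization of 800 fits a
 * capacity of 1024 (800 * 1280 = 1024000 < 1024 * 1024 = 1048576), while
 * 850 does not (850 * 1280 = 1088000 > 1048576). In other words,
 * utilization must stay below roughly 80% of the capacity to "fit".
 */
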
/*
 * The margin used when comparing CPU capacities.
 * is 'cap1' noticeably greater than 'cap2'?
 *
 * (default: ~5%)
 */
#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
#endif

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * (default: 5 msec, units: microseconds)
 */
static unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
#endif

#ifdef CONFIG_NUMA_BALANCING
/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
#endif

#ifdef CONFIG_SYSCTL
static struct ctl_table sched_fair_sysctls[] = {
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.procname	= "sched_cfs_bandwidth_slice_us",
		.data		= &sysctl_sched_cfs_bandwidth_slice,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
	},
#endif
#ifdef CONFIG_NUMA_BALANCING
	{
		.procname	= "numa_balancing_promote_rate_limit_MBps",
		.data		= &sysctl_numa_balancing_promote_rate_limit,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
#endif /* CONFIG_NUMA_BALANCING */
};

static int __init sched_fair_sysctl_init(void)
{
	register_sysctl_init("kernel", sched_fair_sysctls);
	return 0;
}
late_initcall(sched_fair_sysctl_init);
#endif

static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
	lw->weight = w;
	lw->inv_weight = 0;
}

/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
static unsigned int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_base_slice);
#undef SET_SYSCTL
}

void __init sched_init_granularity(void)
{
	update_sysctl();
}

#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32

static void __update_inv_weight(struct load_weight *lw)
{
	unsigned long w;

	if (likely(lw->inv_weight))
		return;

	w = scale_load_down(lw->weight);

	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
		lw->inv_weight = 1;
	else if (unlikely(!w))
		lw->inv_weight = WMULT_CONST;
	else
		lw->inv_weight = WMULT_CONST / w;
}

/*
 * delta_exec * weight / lw.weight
 *   OR
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 *
 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 */
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	u32 fact_hi = (u32)(fact >> 32);
	int shift = WMULT_SHIFT;
	int fs;

	__update_inv_weight(lw);

	if (unlikely(fact_hi)) {
		fs = fls(fact_hi);
		shift -= fs;
		fact >>= fs;
	}

	fact = mul_u32_u32(fact, lw->inv_weight);

	fact_hi = (u32)(fact >> 32);
	if (fact_hi) {
		fs = fls(fact_hi);
		shift -= fs;
		fact >>= fs;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}

/*
 * delta /= w
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

	return delta;
}

const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	int cpu = cpu_of(rq);

	if (cfs_rq->on_list)
		return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;

	cfs_rq->on_list = 1;

	/*
	 * Ensure we either appear before our parent (if already
	 * enqueued) or force our parent to appear after us when it is
	 * enqueued. The fact that we always enqueue bottom-up
	 * reduces this to two cases and a special case for the root
	 * cfs_rq. Furthermore, it also means that we will always reset
	 * tmp_alone_branch either when the branch is connected
	 * to a tree or when we reach the top of the tree.
	 */
	if (cfs_rq->tg->parent &&
	    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
		/*
		 * If parent is already on the list, we add the child
		 * just before. Thanks to circular linked property of
		 * the list, this means to put the child at the tail
		 * of the list that starts by parent.
		 */
		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
			&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
		/*
		 * The branch is now connected to its tree so we can
		 * reset tmp_alone_branch to the beginning of the
		 * list.
		 */
		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		return true;
	}

	if (!cfs_rq->tg->parent) {
		/*
		 * cfs rq without parent should be put
		 * at the tail of the list.
		 */
		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
			&rq->leaf_cfs_rq_list);
		/*
		 * We have reached the top of a tree so we can reset
		 * tmp_alone_branch to the beginning of the list.
		 */
		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		return true;
	}

	/*
	 * The parent has not already been added so we want to
	 * make sure that it will be put after us.
	 * tmp_alone_branch points to the beginning of the branch
	 * where we will add parent.
	 */
	list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
	/*
	 * update tmp_alone_branch to point to the new beginning
	 * of the branch
	 */
	rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
	return false;
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		struct rq *rq = rq_of(cfs_rq);

		/*
		 * With cfs_rq being unthrottled/throttled during an enqueue,
		 * it can happen that tmp_alone_branch points to the leaf that
		 * we finally want to delete. In this case, tmp_alone_branch moves
		 * to the prev element but it will point to rq->leaf_cfs_rq_list
		 * at the end of the enqueue.
		 */
		if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
			rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;

		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}

static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
	SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}

/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
				 leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group? */
static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return se->cfs_rq;

	return NULL;
}

static inline struct sched_entity *parent_entity(const struct sched_entity *se)
{
	return se->parent;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * preemption test can be made between sibling entities who are in the
	 * same cfs_rq, i.e. who have a common parent. Walk up the hierarchy of
	 * both tasks until we find their ancestors who are siblings of common
	 * parent.
	 */

	/* First walk up until both entities are at same depth */
	se_depth = (*se)->depth;
	pse_depth = (*pse)->depth;

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}

static int tg_is_idle(struct task_group *tg)
{
	return tg->idle > 0;
}

static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
	return cfs_rq->idle > 0;
}

static int se_is_idle(struct sched_entity *se)
{
	if (entity_is_task(se))
		return task_has_idle_policy(task_of(se));
	return cfs_rq_is_idle(group_cfs_rq(se));
}

#else /* !CONFIG_FAIR_GROUP_SCHED */

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	return true;
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
}

#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

static inline int tg_is_idle(struct task_group *tg)
{
	return 0;
}

static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
	return 0;
}

static int se_is_idle(struct sched_entity *se)
{
	return task_has_idle_policy(task_of(se));
}

#endif /* CONFIG_FAIR_GROUP_SCHED */

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - max_vruntime);
	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline bool entity_before(const struct sched_entity *a,
				 const struct sched_entity *b)
{
	/*
	 * Tiebreak on vruntime seems unnecessary since it can
	 * hardly happen.
	 */
	return (s64)(a->deadline - b->deadline) < 0;
}

static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return (s64)(se->vruntime - cfs_rq->min_vruntime);
}

#define __node_2_se(node) \
	rb_entry((node), struct sched_entity, run_node)

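/*
 * For illustration, the (s64) cast of the difference is what makes the
 * comparisons above robust against u64 wrap-around: with vruntime = 5 and
 * max_vruntime = ULLONG_MAX - 2, the u64 difference is 8, which is > 0 as
 * an s64, so 5 is correctly treated as the later (larger) virtual time even
 * though it is numerically smaller. entity_before() relies on the same
 * trick for deadlines.
 */
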
/*
 * Compute virtual time from the per-task service numbers:
 *
 * Fair schedulers conserve lag:
 *
 *   \Sum lag_i = 0
 *
 * Where lag_i is given by:
 *
 *   lag_i = S - s_i = w_i * (V - v_i)
 *
 * Where S is the ideal service time and V is its virtual time counterpart.
 * Therefore:
 *
 *   \Sum lag_i = 0
 *   \Sum w_i * (V - v_i) = 0
 *   \Sum w_i * V - w_i * v_i = 0
 *
 * From which we can solve an expression for V in v_i (which we have in
 * se->vruntime):
 *
 *       \Sum v_i * w_i   \Sum v_i * w_i
 *   V = -------------- = --------------
 *          \Sum w_i            W
 *
 * Specifically, this is the weighted average of all entity virtual runtimes.
 *
 * [[ NOTE: this is only equal to the ideal scheduler under the condition
 *          that join/leave operations happen at lag_i = 0, otherwise the
 *          virtual time has non-contiguous motion equivalent to:
 *
 *            V +-= lag_i / W
 *
 *          Also see the comment in place_entity() that deals with this. ]]
 *
 * However, since v_i is u64, and the multiplication could easily overflow,
 * transform it into a relative form that uses smaller quantities:
 *
 * Substitute: v_i == (v_i - v0) + v0
 *
 *     \Sum ((v_i - v0) + v0) * w_i   \Sum (v_i - v0) * w_i
 * V = ---------------------------- = --------------------- + v0
 *                  W                            W
 *
 * Which we track using:
 *
 *                    v0 := cfs_rq->min_vruntime
 * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
 *              \Sum w_i := cfs_rq->avg_load
 *
 * Since min_vruntime is a monotonically increasing variable that closely
 * tracks the per-task service, these deltas: (v_i - v), will be in the order
 * of the maximal (virtual) lag induced in the system due to quantisation.
 *
 * Also, we use scale_load_down() to reduce the size.
 *
 * As measured, the max (key * weight) value was ~44 bits for a kernel build.
 */
static void
avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	unsigned long weight = scale_load_down(se->load.weight);
	s64 key = entity_key(cfs_rq, se);

	cfs_rq->avg_vruntime += key * weight;
	cfs_rq->avg_load += weight;
}

static void
avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	unsigned long weight = scale_load_down(se->load.weight);
	s64 key = entity_key(cfs_rq, se);

	cfs_rq->avg_vruntime -= key * weight;
	cfs_rq->avg_load -= weight;
}

static inline
void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
{
	/*
	 * v' = v + d ==> avg_vruntime' = avg_vruntime - d*avg_load
	 */
	cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
}

/*
 * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
 * For this to be so, the result of this function must have a left bias.
 */
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	s64 avg = cfs_rq->avg_vruntime;
	long load = cfs_rq->avg_load;

	if (curr && curr->on_rq) {
		unsigned long weight = scale_load_down(curr->load.weight);

		avg += entity_key(cfs_rq, curr) * weight;
		load += weight;
	}

	if (load) {
		/* sign flips effective floor / ceiling */
		if (avg < 0)
			avg -= (load - 1);
		avg = div_s64(avg, load);
	}

	return cfs_rq->min_vruntime + avg;
}

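/*
 * For illustration, take two queued entities of nice-0 weight (1024 after
 * scale_load_down()) at keys (v_i - min_vruntime) of +6 and -3, plus one of
 * weight 2048 at key 0:
 *
 *   avg_vruntime = 6*1024 + (-3)*1024 + 0*2048 = 3072
 *   avg_load     = 1024 + 1024 + 2048          = 4096
 *
 * so V = min_vruntime + 3072/4096, which avg_vruntime() floors to
 * min_vruntime + 0. Eligibility then follows from vruntime_eligible()
 * below: the key -3 and key 0 entities satisfy avg >= key * load, while
 * the key +6 entity (already ahead of V) does not.
 */
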
685 * 686 * EEVDF gives the following limit for a steady state system: 687 * 688 * -r_max < lag < max(r_max, q) 689 * 690 * XXX could add max_slice to the augmented data to track this. 691 */ 692 static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) 693 { 694 s64 vlag, limit; 695 696 SCHED_WARN_ON(!se->on_rq); 697 698 vlag = avg_vruntime(cfs_rq) - se->vruntime; 699 limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); 700 701 se->vlag = clamp(vlag, -limit, limit); 702 } 703 704 /* 705 * Entity is eligible once it received less service than it ought to have, 706 * eg. lag >= 0. 707 * 708 * lag_i = S - s_i = w_i*(V - v_i) 709 * 710 * lag_i >= 0 -> V >= v_i 711 * 712 * \Sum (v_i - v)*w_i 713 * V = ------------------ + v 714 * \Sum w_i 715 * 716 * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) 717 * 718 * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due 719 * to the loss in precision caused by the division. 720 */ 721 static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) 722 { 723 struct sched_entity *curr = cfs_rq->curr; 724 s64 avg = cfs_rq->avg_vruntime; 725 long load = cfs_rq->avg_load; 726 727 if (curr && curr->on_rq) { 728 unsigned long weight = scale_load_down(curr->load.weight); 729 730 avg += entity_key(cfs_rq, curr) * weight; 731 load += weight; 732 } 733 734 return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load; 735 } 736 737 int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) 738 { 739 return vruntime_eligible(cfs_rq, se->vruntime); 740 } 741 742 static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) 743 { 744 u64 min_vruntime = cfs_rq->min_vruntime; 745 /* 746 * open coded max_vruntime() to allow updating avg_vruntime 747 */ 748 s64 delta = (s64)(vruntime - min_vruntime); 749 if (delta > 0) { 750 avg_vruntime_update(cfs_rq, delta); 751 min_vruntime = vruntime; 752 } 753 return min_vruntime; 754 } 755 756 static void update_min_vruntime(struct cfs_rq *cfs_rq) 757 { 758 struct sched_entity *se = __pick_root_entity(cfs_rq); 759 struct sched_entity *curr = cfs_rq->curr; 760 u64 vruntime = cfs_rq->min_vruntime; 761 762 if (curr) { 763 if (curr->on_rq) 764 vruntime = curr->vruntime; 765 else 766 curr = NULL; 767 } 768 769 if (se) { 770 if (!curr) 771 vruntime = se->min_vruntime; 772 else 773 vruntime = min_vruntime(vruntime, se->min_vruntime); 774 } 775 776 /* ensure we never gain time by being placed backwards. 
	cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
}

static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
{
	struct sched_entity *root = __pick_root_entity(cfs_rq);
	struct sched_entity *curr = cfs_rq->curr;
	u64 min_slice = ~0ULL;

	if (curr && curr->on_rq)
		min_slice = curr->slice;

	if (root)
		min_slice = min(min_slice, root->min_slice);

	return min_slice;
}

static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
{
	return entity_before(__node_2_se(a), __node_2_se(b));
}

#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })

static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
{
	if (node) {
		struct sched_entity *rse = __node_2_se(node);
		if (vruntime_gt(min_vruntime, se, rse))
			se->min_vruntime = rse->min_vruntime;
	}
}

static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node)
{
	if (node) {
		struct sched_entity *rse = __node_2_se(node);
		if (rse->min_slice < se->min_slice)
			se->min_slice = rse->min_slice;
	}
}

/*
 * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
 */
static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
{
	u64 old_min_vruntime = se->min_vruntime;
	u64 old_min_slice = se->min_slice;
	struct rb_node *node = &se->run_node;

	se->min_vruntime = se->vruntime;
	__min_vruntime_update(se, node->rb_right);
	__min_vruntime_update(se, node->rb_left);

	se->min_slice = se->slice;
	__min_slice_update(se, node->rb_right);
	__min_slice_update(se, node->rb_left);

	return se->min_vruntime == old_min_vruntime &&
	       se->min_slice == old_min_slice;
}

RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
		     run_node, min_vruntime, min_vruntime_update);

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	avg_vruntime_add(cfs_rq, se);
	se->min_vruntime = se->vruntime;
	se->min_slice = se->slice;
	rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
				__entity_less, &min_vruntime_cb);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
				  &min_vruntime_cb);
	avg_vruntime_sub(cfs_rq, se);
}

struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;

	if (!root)
		return NULL;

	return __node_2_se(root);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

	if (!left)
		return NULL;

	return __node_2_se(left);
}

/*
 * Earliest Eligible Virtual Deadline First
 *
 * In order to provide latency guarantees for different request sizes
 * EEVDF selects the best runnable task from two criteria:
 *
 *  1) the task must be eligible (must be owed service)
 *
 *  2) from those tasks that meet 1), we select the one
 *     with the earliest virtual deadline.
 *
 * We can do this in O(log n) time due to an augmented RB-tree.
 * The tree keeps the entries sorted on deadline, but also functions as a
 * heap based on the vruntime by keeping:
 *
 *  se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
 *
 * Which allows tree pruning through eligibility.
 */
static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
{
	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
	struct sched_entity *se = __pick_first_entity(cfs_rq);
	struct sched_entity *curr = cfs_rq->curr;
	struct sched_entity *best = NULL;

	/*
	 * We can safely skip eligibility check if there is only one entity
	 * in this cfs_rq, saving some cycles.
	 */
	if (cfs_rq->nr_running == 1)
		return curr && curr->on_rq ? curr : se;

	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
		curr = NULL;

	/*
	 * Once selected, run a task until it either becomes non-eligible or
	 * until it gets a new slice. See the HACK in set_next_entity().
	 */
	if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
		return curr;

	/* Pick the leftmost entity if it's eligible */
	if (se && entity_eligible(cfs_rq, se)) {
		best = se;
		goto found;
	}

	/* Heap search for the EEVDF entity */
	while (node) {
		struct rb_node *left = node->rb_left;

		/*
		 * Eligible entities in left subtree are always better
		 * choices, since they have earlier deadlines.
		 */
		if (left && vruntime_eligible(cfs_rq,
					__node_2_se(left)->min_vruntime)) {
			node = left;
			continue;
		}

		se = __node_2_se(node);

		/*
		 * The left subtree either is empty or has no eligible
		 * entity, so check the current node since it is the one
		 * with earliest deadline that might be eligible.
		 */
		if (entity_eligible(cfs_rq, se)) {
			best = se;
			break;
		}

		node = node->rb_right;
	}
found:
	if (!best || (curr && entity_before(curr, best)))
		best = curr;

	return best;
}

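/*
 * Illustration of the pruning in pick_eevdf(): consider three queued
 * entities A(deadline=10, vruntime=12), B(deadline=20, vruntime=2) and
 * C(deadline=30, vruntime=4) of equal weight, so V = 6 (with B at the
 * root, A to its left). A sorts leftmost on deadline but is not eligible
 * (12 > V), so the leftmost-entity fast path fails; the heap walk then
 * skips A's subtree because its min_vruntime (12) is not eligible, and
 * settles on B, the earliest eligible virtual deadline.
 */
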
#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);

	if (!last)
		return NULL;

	return __node_2_se(last);
}

/**************************************************************
 * Scheduling class statistics methods:
 */
#ifdef CONFIG_SMP
int sched_update_scaling(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_base_slice);
#undef WRT_SYSCTL

	return 0;
}
#endif
#endif

static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);

/*
 * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
 * this is probably good enough.
 */
static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if ((s64)(se->vruntime - se->deadline) < 0)
		return false;

	/*
	 * For EEVDF the virtual time slope is determined by w_i (iow.
	 * nice) while the request time r_i is determined by
	 * sysctl_sched_base_slice.
	 */
	if (!se->custom_slice)
		se->slice = sysctl_sched_base_slice;

	/*
	 * EEVDF: vd_i = ve_i + r_i / w_i
	 */
	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);

	/*
	 * The task has consumed its request, reschedule.
	 */
	return true;
}

#include "pelt.h"
#ifdef CONFIG_SMP

static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
static unsigned long capacity_of(int cpu);

/*
 * Give a new sched_entity initial runnable values so it is seen as a heavy
 * task until its load has had time to stabilize.
 */
void init_entity_runnable_average(struct sched_entity *se)
{
	struct sched_avg *sa = &se->avg;

	memset(sa, 0, sizeof(*sa));

	/*
	 * Tasks are initialized with full load to be seen as heavy tasks until
	 * they get a chance to stabilize to their real load level.
	 * Group entities are initialized with zero load to reflect the fact that
	 * nothing has been attached to the task group yet.
	 */
	if (entity_is_task(se))
		sa->load_avg = scale_load_down(se->load.weight);

	/* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}

/*
 * With new tasks being created, their initial util_avgs are extrapolated
 * based on the cfs_rq's current util_avg:
 *
 *   util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1)
 *		* se_weight(se)
 *
 * However, in many cases, the above util_avg does not give a desired
 * value. Moreover, the sum of the util_avgs may be divergent, such
 * as when the series is a harmonic series.
 *
 * To solve this problem, we also cap the util_avg of successive tasks to
 * only 1/2 of the left utilization budget:
 *
 *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
 *
 * where n denotes the nth task and cpu_scale the CPU capacity.
 *
 * For example, for a CPU with 1024 of capacity, the simplest series from
 * the beginning would look like:
 *
 *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
 *
 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
 * if util_avg > util_avg_cap.
 */
void post_init_entity_util_avg(struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	struct sched_avg *sa = &se->avg;
	long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;

	if (p->sched_class != &fair_sched_class) {
		/*
		 * For !fair tasks do:
		 *
		update_cfs_rq_load_avg(now, cfs_rq);
		attach_entity_load_avg(cfs_rq, se);
		switched_from_fair(rq, p);
		 *
		 * such that the next switched_to_fair() has the
		 * expected state.
1097 */ 1098 se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq); 1099 return; 1100 } 1101 1102 if (cap > 0) { 1103 if (cfs_rq->avg.util_avg != 0) { 1104 sa->util_avg = cfs_rq->avg.util_avg * se_weight(se); 1105 sa->util_avg /= (cfs_rq->avg.load_avg + 1); 1106 1107 if (sa->util_avg > cap) 1108 sa->util_avg = cap; 1109 } else { 1110 sa->util_avg = cap; 1111 } 1112 } 1113 1114 sa->runnable_avg = sa->util_avg; 1115 } 1116 1117 #else /* !CONFIG_SMP */ 1118 void init_entity_runnable_average(struct sched_entity *se) 1119 { 1120 } 1121 void post_init_entity_util_avg(struct task_struct *p) 1122 { 1123 } 1124 static void update_tg_load_avg(struct cfs_rq *cfs_rq) 1125 { 1126 } 1127 #endif /* CONFIG_SMP */ 1128 1129 static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) 1130 { 1131 u64 now = rq_clock_task(rq); 1132 s64 delta_exec; 1133 1134 delta_exec = now - curr->exec_start; 1135 if (unlikely(delta_exec <= 0)) 1136 return delta_exec; 1137 1138 curr->exec_start = now; 1139 curr->sum_exec_runtime += delta_exec; 1140 1141 if (schedstat_enabled()) { 1142 struct sched_statistics *stats; 1143 1144 stats = __schedstats_from_se(curr); 1145 __schedstat_set(stats->exec_max, 1146 max(delta_exec, stats->exec_max)); 1147 } 1148 1149 return delta_exec; 1150 } 1151 1152 static inline void update_curr_task(struct task_struct *p, s64 delta_exec) 1153 { 1154 trace_sched_stat_runtime(p, delta_exec); 1155 account_group_exec_runtime(p, delta_exec); 1156 cgroup_account_cputime(p, delta_exec); 1157 } 1158 1159 static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1160 { 1161 if (!sched_feat(PREEMPT_SHORT)) 1162 return false; 1163 1164 if (curr->vlag == curr->deadline) 1165 return false; 1166 1167 return !entity_eligible(cfs_rq, curr); 1168 } 1169 1170 static inline bool do_preempt_short(struct cfs_rq *cfs_rq, 1171 struct sched_entity *pse, struct sched_entity *se) 1172 { 1173 if (!sched_feat(PREEMPT_SHORT)) 1174 return false; 1175 1176 if (pse->slice >= se->slice) 1177 return false; 1178 1179 if (!entity_eligible(cfs_rq, pse)) 1180 return false; 1181 1182 if (entity_before(pse, se)) 1183 return true; 1184 1185 if (!entity_eligible(cfs_rq, se)) 1186 return true; 1187 1188 return false; 1189 } 1190 1191 /* 1192 * Used by other classes to account runtime. 1193 */ 1194 s64 update_curr_common(struct rq *rq) 1195 { 1196 struct task_struct *donor = rq->donor; 1197 s64 delta_exec; 1198 1199 delta_exec = update_curr_se(rq, &donor->se); 1200 if (likely(delta_exec > 0)) 1201 update_curr_task(donor, delta_exec); 1202 1203 return delta_exec; 1204 } 1205 1206 /* 1207 * Update the current task's runtime statistics. 
1208 */ 1209 static void update_curr(struct cfs_rq *cfs_rq) 1210 { 1211 struct sched_entity *curr = cfs_rq->curr; 1212 struct rq *rq = rq_of(cfs_rq); 1213 s64 delta_exec; 1214 bool resched; 1215 1216 if (unlikely(!curr)) 1217 return; 1218 1219 delta_exec = update_curr_se(rq, curr); 1220 if (unlikely(delta_exec <= 0)) 1221 return; 1222 1223 curr->vruntime += calc_delta_fair(delta_exec, curr); 1224 resched = update_deadline(cfs_rq, curr); 1225 update_min_vruntime(cfs_rq); 1226 1227 if (entity_is_task(curr)) { 1228 struct task_struct *p = task_of(curr); 1229 1230 update_curr_task(p, delta_exec); 1231 1232 /* 1233 * If the fair_server is active, we need to account for the 1234 * fair_server time whether or not the task is running on 1235 * behalf of fair_server or not: 1236 * - If the task is running on behalf of fair_server, we need 1237 * to limit its time based on the assigned runtime. 1238 * - Fair task that runs outside of fair_server should account 1239 * against fair_server such that it can account for this time 1240 * and possibly avoid running this period. 1241 */ 1242 if (dl_server_active(&rq->fair_server)) 1243 dl_server_update(&rq->fair_server, delta_exec); 1244 } 1245 1246 account_cfs_rq_runtime(cfs_rq, delta_exec); 1247 1248 if (cfs_rq->nr_running == 1) 1249 return; 1250 1251 if (resched || did_preempt_short(cfs_rq, curr)) { 1252 resched_curr_lazy(rq); 1253 clear_buddies(cfs_rq, curr); 1254 } 1255 } 1256 1257 static void update_curr_fair(struct rq *rq) 1258 { 1259 update_curr(cfs_rq_of(&rq->donor->se)); 1260 } 1261 1262 static inline void 1263 update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) 1264 { 1265 struct sched_statistics *stats; 1266 struct task_struct *p = NULL; 1267 1268 if (!schedstat_enabled()) 1269 return; 1270 1271 stats = __schedstats_from_se(se); 1272 1273 if (entity_is_task(se)) 1274 p = task_of(se); 1275 1276 __update_stats_wait_start(rq_of(cfs_rq), p, stats); 1277 } 1278 1279 static inline void 1280 update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) 1281 { 1282 struct sched_statistics *stats; 1283 struct task_struct *p = NULL; 1284 1285 if (!schedstat_enabled()) 1286 return; 1287 1288 stats = __schedstats_from_se(se); 1289 1290 /* 1291 * When the sched_schedstat changes from 0 to 1, some sched se 1292 * maybe already in the runqueue, the se->statistics.wait_start 1293 * will be 0.So it will let the delta wrong. We need to avoid this 1294 * scenario. 1295 */ 1296 if (unlikely(!schedstat_val(stats->wait_start))) 1297 return; 1298 1299 if (entity_is_task(se)) 1300 p = task_of(se); 1301 1302 __update_stats_wait_end(rq_of(cfs_rq), p, stats); 1303 } 1304 1305 static inline void 1306 update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) 1307 { 1308 struct sched_statistics *stats; 1309 struct task_struct *tsk = NULL; 1310 1311 if (!schedstat_enabled()) 1312 return; 1313 1314 stats = __schedstats_from_se(se); 1315 1316 if (entity_is_task(se)) 1317 tsk = task_of(se); 1318 1319 __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats); 1320 } 1321 1322 /* 1323 * Task is being enqueued - update stats: 1324 */ 1325 static inline void 1326 update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1327 { 1328 if (!schedstat_enabled()) 1329 return; 1330 1331 /* 1332 * Are we enqueueing a waiting task? 
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start_fair(cfs_rq, se);

	if (flags & ENQUEUE_WAKEUP)
		update_stats_enqueue_sleeper_fair(cfs_rq, se);
}

static inline void
update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{

	if (!schedstat_enabled())
		return;

	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end_fair(cfs_rq, se);

	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
		struct task_struct *tsk = task_of(se);
		unsigned int state;

		/* XXX racy against TTWU */
		state = READ_ONCE(tsk->__state);
		if (state & TASK_INTERRUPTIBLE)
			__schedstat_set(tsk->stats.sleep_start,
					rq_clock(rq_of(cfs_rq)));
		if (state & TASK_UNINTERRUPTIBLE)
			__schedstat_set(tsk->stats.block_start,
					rq_clock(rq_of(cfs_rq)));
	}
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_clock_task(rq_of(cfs_rq));
}

/**************************************************
 * Scheduling class queueing methods:
 */

static inline bool is_core_idle(int cpu)
{
#ifdef CONFIG_SCHED_SMT
	int sibling;

	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
		if (cpu == sibling)
			continue;

		if (!idle_cpu(sibling))
			return false;
	}
#endif

	return true;
}

#ifdef CONFIG_NUMA
#define NUMA_IMBALANCE_MIN 2

static inline long
adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
{
	/*
	 * Allow a NUMA imbalance if busy CPUs is less than the maximum
	 * threshold. Above this threshold, individual tasks may be contending
	 * for both memory bandwidth and any shared HT resources. This is an
	 * approximation as the number of running tasks may not be related to
	 * the number of busy CPUs due to sched_setaffinity.
	 */
	if (dst_running > imb_numa_nr)
		return imbalance;

	/*
	 * Allow a small imbalance based on a simple pair of communicating
	 * tasks that remain local when the destination is lightly loaded.
	 */
	if (imbalance <= NUMA_IMBALANCE_MIN)
		return 0;

	return imbalance;
}
#endif /* CONFIG_NUMA */

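/*
 * For illustration, with a hypothetical imb_numa_nr of 4: a destination node
 * running only 2 tasks may absorb an imbalance of up to NUMA_IMBALANCE_MIN
 * (2) tasks, so adjust_numa_imbalance(2, 2, 4) returns 0 and the pair is
 * left alone; once dst_running exceeds 4, or the imbalance grows to 3 or
 * more, the raw imbalance is reported and normal balancing applies.
 */
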
1436 */ 1437 unsigned int sysctl_numa_balancing_scan_period_min = 1000; 1438 unsigned int sysctl_numa_balancing_scan_period_max = 60000; 1439 1440 /* Portion of address space to scan in MB */ 1441 unsigned int sysctl_numa_balancing_scan_size = 256; 1442 1443 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 1444 unsigned int sysctl_numa_balancing_scan_delay = 1000; 1445 1446 /* The page with hint page fault latency < threshold in ms is considered hot */ 1447 unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC; 1448 1449 struct numa_group { 1450 refcount_t refcount; 1451 1452 spinlock_t lock; /* nr_tasks, tasks */ 1453 int nr_tasks; 1454 pid_t gid; 1455 int active_nodes; 1456 1457 struct rcu_head rcu; 1458 unsigned long total_faults; 1459 unsigned long max_faults_cpu; 1460 /* 1461 * faults[] array is split into two regions: faults_mem and faults_cpu. 1462 * 1463 * Faults_cpu is used to decide whether memory should move 1464 * towards the CPU. As a consequence, these stats are weighted 1465 * more by CPU use than by memory faults. 1466 */ 1467 unsigned long faults[]; 1468 }; 1469 1470 /* 1471 * For functions that can be called in multiple contexts that permit reading 1472 * ->numa_group (see struct task_struct for locking rules). 1473 */ 1474 static struct numa_group *deref_task_numa_group(struct task_struct *p) 1475 { 1476 return rcu_dereference_check(p->numa_group, p == current || 1477 (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu))); 1478 } 1479 1480 static struct numa_group *deref_curr_numa_group(struct task_struct *p) 1481 { 1482 return rcu_dereference_protected(p->numa_group, p == current); 1483 } 1484 1485 static inline unsigned long group_faults_priv(struct numa_group *ng); 1486 static inline unsigned long group_faults_shared(struct numa_group *ng); 1487 1488 static unsigned int task_nr_scan_windows(struct task_struct *p) 1489 { 1490 unsigned long rss = 0; 1491 unsigned long nr_scan_pages; 1492 1493 /* 1494 * Calculations based on RSS as non-present and empty pages are skipped 1495 * by the PTE scanner and NUMA hinting faults should be trapped based 1496 * on resident pages 1497 */ 1498 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); 1499 rss = get_mm_rss(p->mm); 1500 if (!rss) 1501 rss = nr_scan_pages; 1502 1503 rss = round_up(rss, nr_scan_pages); 1504 return rss / nr_scan_pages; 1505 } 1506 1507 /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ 1508 #define MAX_SCAN_WINDOW 2560 1509 1510 static unsigned int task_scan_min(struct task_struct *p) 1511 { 1512 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size); 1513 unsigned int scan, floor; 1514 unsigned int windows = 1; 1515 1516 if (scan_size < MAX_SCAN_WINDOW) 1517 windows = MAX_SCAN_WINDOW / scan_size; 1518 floor = 1000 / windows; 1519 1520 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); 1521 return max_t(unsigned int, floor, scan); 1522 } 1523 1524 static unsigned int task_scan_start(struct task_struct *p) 1525 { 1526 unsigned long smin = task_scan_min(p); 1527 unsigned long period = smin; 1528 struct numa_group *ng; 1529 1530 /* Scale the maximum scan period with the amount of shared memory. 
static unsigned int task_scan_start(struct task_struct *p)
{
	unsigned long smin = task_scan_min(p);
	unsigned long period = smin;
	struct numa_group *ng;

	/* Scale the maximum scan period with the amount of shared memory. */
	rcu_read_lock();
	ng = rcu_dereference(p->numa_group);
	if (ng) {
		unsigned long shared = group_faults_shared(ng);
		unsigned long private = group_faults_priv(ng);

		period *= refcount_read(&ng->refcount);
		period *= shared + 1;
		period /= private + shared + 1;
	}
	rcu_read_unlock();

	return max(smin, period);
}

static unsigned int task_scan_max(struct task_struct *p)
{
	unsigned long smin = task_scan_min(p);
	unsigned long smax;
	struct numa_group *ng;

	/* Watch for min being lower than max due to floor calculations */
	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);

	/* Scale the maximum scan period with the amount of shared memory. */
	ng = deref_curr_numa_group(p);
	if (ng) {
		unsigned long shared = group_faults_shared(ng);
		unsigned long private = group_faults_priv(ng);
		unsigned long period = smax;

		period *= refcount_read(&ng->refcount);
		period *= shared + 1;
		period /= private + shared + 1;

		smax = max(smax, period);
	}

	return max(smin, smax);
}

static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}

static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}

/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2

/* Memory and CPU locality */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

/* Averaged statistics, and temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)

pid_t task_numa_group_id(struct task_struct *p)
{
	struct numa_group *ng;
	pid_t gid = 0;

	rcu_read_lock();
	ng = rcu_dereference(p->numa_group);
	if (ng)
		gid = ng->gid;
	rcu_read_unlock();

	return gid;
}

/*
 * The averaged statistics, shared & private, memory & CPU,
 * occupy the first half of the array. The second half of the
 * array is for current counters, which are averaged into the
 * first set by task_numa_placement.
1612 */ 1613 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv) 1614 { 1615 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv; 1616 } 1617 1618 static inline unsigned long task_faults(struct task_struct *p, int nid) 1619 { 1620 if (!p->numa_faults) 1621 return 0; 1622 1623 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] + 1624 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)]; 1625 } 1626 1627 static inline unsigned long group_faults(struct task_struct *p, int nid) 1628 { 1629 struct numa_group *ng = deref_task_numa_group(p); 1630 1631 if (!ng) 1632 return 0; 1633 1634 return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] + 1635 ng->faults[task_faults_idx(NUMA_MEM, nid, 1)]; 1636 } 1637 1638 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) 1639 { 1640 return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] + 1641 group->faults[task_faults_idx(NUMA_CPU, nid, 1)]; 1642 } 1643 1644 static inline unsigned long group_faults_priv(struct numa_group *ng) 1645 { 1646 unsigned long faults = 0; 1647 int node; 1648 1649 for_each_online_node(node) { 1650 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; 1651 } 1652 1653 return faults; 1654 } 1655 1656 static inline unsigned long group_faults_shared(struct numa_group *ng) 1657 { 1658 unsigned long faults = 0; 1659 int node; 1660 1661 for_each_online_node(node) { 1662 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)]; 1663 } 1664 1665 return faults; 1666 } 1667 1668 /* 1669 * A node triggering more than 1/3 as many NUMA faults as the maximum is 1670 * considered part of a numa group's pseudo-interleaving set. Migrations 1671 * between these nodes are slowed down, to allow things to settle down. 1672 */ 1673 #define ACTIVE_NODE_FRACTION 3 1674 1675 static bool numa_is_active_node(int nid, struct numa_group *ng) 1676 { 1677 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; 1678 } 1679 1680 /* Handle placement on systems where not all nodes are directly connected. */ 1681 static unsigned long score_nearby_nodes(struct task_struct *p, int nid, 1682 int lim_dist, bool task) 1683 { 1684 unsigned long score = 0; 1685 int node, max_dist; 1686 1687 /* 1688 * All nodes are directly connected, and the same distance 1689 * from each other. No need for fancy placement algorithms. 1690 */ 1691 if (sched_numa_topology_type == NUMA_DIRECT) 1692 return 0; 1693 1694 /* sched_max_numa_distance may be changed in parallel. */ 1695 max_dist = READ_ONCE(sched_max_numa_distance); 1696 /* 1697 * This code is called for each node, introducing N^2 complexity, 1698 * which should be OK given the number of nodes rarely exceeds 8. 1699 */ 1700 for_each_online_node(node) { 1701 unsigned long faults; 1702 int dist = node_distance(nid, node); 1703 1704 /* 1705 * The furthest away nodes in the system are not interesting 1706 * for placement; nid was already counted. 1707 */ 1708 if (dist >= max_dist || node == nid) 1709 continue; 1710 1711 /* 1712 * On systems with a backplane NUMA topology, compare groups 1713 * of nodes, and move tasks towards the group with the most 1714 * memory accesses. When comparing two nodes at distance 1715 * "hoplimit", only nodes closer by than "hoplimit" are part 1716 * of each group. Skip other nodes. 1717 */ 1718 if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist) 1719 continue; 1720 1721 /* Add up the faults from nearby nodes. 
		if (task)
			faults = task_faults(p, node);
		else
			faults = group_faults(p, node);

		/*
		 * On systems with a glueless mesh NUMA topology, there are
		 * no fixed "groups of nodes". Instead, nodes that are not
		 * directly connected bounce traffic through intermediate
		 * nodes; a numa_group can occupy any set of nodes.
		 * The further away a node is, the less the faults count.
		 * This seems to result in good task placement.
		 */
		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
			faults *= (max_dist - dist);
			faults /= (max_dist - LOCAL_DISTANCE);
		}

		score += faults;
	}

	return score;
}

/*
 * These return the fraction of accesses done by a particular task, or
 * task group, on a particular numa node. The group weight is given a
 * larger multiplier, in order to group tasks together that are almost
 * evenly spread out between numa nodes.
 */
static inline unsigned long task_weight(struct task_struct *p, int nid,
					int dist)
{
	unsigned long faults, total_faults;

	if (!p->numa_faults)
		return 0;

	total_faults = p->total_numa_faults;

	if (!total_faults)
		return 0;

	faults = task_faults(p, nid);
	faults += score_nearby_nodes(p, nid, dist, true);

	return 1000 * faults / total_faults;
}

static inline unsigned long group_weight(struct task_struct *p, int nid,
					 int dist)
{
	struct numa_group *ng = deref_task_numa_group(p);
	unsigned long faults, total_faults;

	if (!ng)
		return 0;

	total_faults = ng->total_faults;

	if (!total_faults)
		return 0;

	faults = group_faults(p, nid);
	faults += score_nearby_nodes(p, nid, dist, false);

	return 1000 * faults / total_faults;
}

/*
 * If memory tiering mode is enabled, cpupid of slow memory page is
 * used to record scan time instead of CPU and PID. When tiering mode
 * is disabled at run time, the scan time (in cpupid) will be
 * interpreted as CPU and PID. So the CPU needs to be checked to avoid
 * out-of-bounds array access.
 */
static inline bool cpupid_valid(int cpupid)
{
	return cpupid_to_cpu(cpupid) < nr_cpu_ids;
}

/*
 * For memory tiering mode, if there are enough free pages (more than
 * enough watermark defined here) in fast memory node, to take full
 * advantage of fast memory capacity, all recently accessed slow
 * memory pages will be migrated to fast memory node without
 * considering hot threshold.
 */
static bool pgdat_free_space_enough(struct pglist_data *pgdat)
{
	int z;
	unsigned long enough_wmark;

	enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
			   pgdat->node_present_pages >> 4);
	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
		struct zone *zone = pgdat->node_zones + z;

		if (!populated_zone(zone))
			continue;

		if (zone_watermark_ok(zone, 0,
				      promo_wmark_pages(zone) + enough_wmark,
				      ZONE_MOVABLE, 0))
			return true;
	}
	return false;
}

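/*
 * For illustration of the "enough" watermark above: on a node with 64GB
 * present, enough_wmark is max(1GB, 64GB/16) = 4GB worth of pages, so
 * promotion bypasses the hot threshold only while the fast node still has
 * roughly 4GB free above its promotion watermark.
 */
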
/*
 * For memory tiering mode, when page tables are scanned, the scan
 * time will be recorded in struct page in addition to making the page
 * PROT_NONE for slow memory pages. So when the page is accessed, in
 * the hint page fault handler, the hint page fault latency is calculated
 * via,
 *
 *	hint page fault latency = hint page fault time - scan time
 *
 * The smaller the hint page fault latency, the higher the possibility
 * for the page to be hot.
 */
static int numa_hint_fault_latency(struct folio *folio)
{
	int last_time, time;

	time = jiffies_to_msecs(jiffies);
	last_time = folio_xchg_access_time(folio, time);

	return (time - last_time) & PAGE_ACCESS_TIME_MASK;
}

/*
 * For memory tiering mode, too high promotion/demotion throughput may
 * hurt application latency. So we provide a mechanism to rate limit
 * the number of pages that we try to promote.
 */
static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
				      unsigned long rate_limit, int nr)
{
	unsigned long nr_cand;
	unsigned int now, start;

	now = jiffies_to_msecs(jiffies);
	mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
	nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
	start = pgdat->nbp_rl_start;
	if (now - start > MSEC_PER_SEC &&
	    cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
		pgdat->nbp_rl_nr_cand = nr_cand;
	if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
		return true;
	return false;
}

#define NUMA_MIGRATION_ADJUST_STEPS	16

static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
					    unsigned long rate_limit,
					    unsigned int ref_th)
{
	unsigned int now, start, th_period, unit_th, th;
	unsigned long nr_cand, ref_cand, diff_cand;

	now = jiffies_to_msecs(jiffies);
	th_period = sysctl_numa_balancing_scan_period_max;
	start = pgdat->nbp_th_start;
	if (now - start > th_period &&
	    cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
		ref_cand = rate_limit *
			sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
		nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
		diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
		unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
		th = pgdat->nbp_threshold ? : ref_th;
		if (diff_cand > ref_cand * 11 / 10)
			th = max(th - unit_th, unit_th);
		else if (diff_cand < ref_cand * 9 / 10)
			th = min(th + unit_th, ref_th * 2);
		pgdat->nbp_th_nr_cand = nr_cand;
		pgdat->nbp_threshold = th;
	}
}

bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
				int src_nid, int dst_cpu)
{
	struct numa_group *ng = deref_curr_numa_group(p);
	int dst_nid = cpu_to_node(dst_cpu);
	int last_cpupid, this_cpupid;

	/*
	 * Cannot migrate to memoryless nodes.
	 */
	if (!node_state(dst_nid, N_MEMORY))
		return false;

	/*
	 * The pages in slow memory node should be migrated according
	 * to hot/cold instead of private/shared.
1921 */ 1922 if (folio_use_access_time(folio)) { 1923 struct pglist_data *pgdat; 1924 unsigned long rate_limit; 1925 unsigned int latency, th, def_th; 1926 1927 pgdat = NODE_DATA(dst_nid); 1928 if (pgdat_free_space_enough(pgdat)) { 1929 /* workload changed, reset hot threshold */ 1930 pgdat->nbp_threshold = 0; 1931 return true; 1932 } 1933 1934 def_th = sysctl_numa_balancing_hot_threshold; 1935 rate_limit = sysctl_numa_balancing_promote_rate_limit << \ 1936 (20 - PAGE_SHIFT); 1937 numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); 1938 1939 th = pgdat->nbp_threshold ? : def_th; 1940 latency = numa_hint_fault_latency(folio); 1941 if (latency >= th) 1942 return false; 1943 1944 return !numa_promotion_rate_limit(pgdat, rate_limit, 1945 folio_nr_pages(folio)); 1946 } 1947 1948 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); 1949 last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid); 1950 1951 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && 1952 !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) 1953 return false; 1954 1955 /* 1956 * Allow first faults or private faults to migrate immediately early in 1957 * the lifetime of a task. The magic number 4 is based on waiting for 1958 * two full passes of the "multi-stage node selection" test that is 1959 * executed below. 1960 */ 1961 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) && 1962 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) 1963 return true; 1964 1965 /* 1966 * Multi-stage node selection is used in conjunction with a periodic 1967 * migration fault to build a temporal task<->page relation. By using 1968 * a two-stage filter we remove short/unlikely relations. 1969 * 1970 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate 1971 * a task's usage of a particular page (n_p) per total usage of this 1972 * page (n_t) (in a given time-span) to a probability. 1973 * 1974 * Our periodic faults will sample this probability and getting the 1975 * same result twice in a row, given these samples are fully 1976 * independent, is then given by P(n)^2, provided our sample period 1977 * is sufficiently short compared to the usage pattern. 1978 * 1979 * This quadric squishes small probabilities, making it less likely we 1980 * act on an unlikely task<->page relation. 1981 */ 1982 if (!cpupid_pid_unset(last_cpupid) && 1983 cpupid_to_nid(last_cpupid) != dst_nid) 1984 return false; 1985 1986 /* Always allow migrate on private faults */ 1987 if (cpupid_match_pid(p, last_cpupid)) 1988 return true; 1989 1990 /* A shared fault, but p->numa_group has not been set up yet. */ 1991 if (!ng) 1992 return true; 1993 1994 /* 1995 * Destination node is much more heavily used than the source 1996 * node? Allow migration. 1997 */ 1998 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * 1999 ACTIVE_NODE_FRACTION) 2000 return true; 2001 2002 /* 2003 * Distribute memory according to CPU & memory use on each node, 2004 * with 3/4 hysteresis to avoid unnecessary memory migrations: 2005 * 2006 * faults_cpu(dst) 3 faults_cpu(src) 2007 * --------------- * - > --------------- 2008 * faults_mem(dst) 4 faults_mem(src) 2009 */ 2010 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > 2011 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; 2012 } 2013 2014 /* 2015 * 'numa_type' describes the node at the moment of load balancing. 2016 */ 2017 enum numa_type { 2018 /* The node has spare capacity that can be used to run more tasks. 
*/ 2019 node_has_spare = 0, 2020 /* 2021 * The node is fully used and the tasks don't compete for more CPU 2022 * cycles. Nevertheless, some tasks might wait before running. 2023 */ 2024 node_fully_busy, 2025 /* 2026 * The node is overloaded and can't provide expected CPU cycles to all 2027 * tasks. 2028 */ 2029 node_overloaded 2030 }; 2031 2032 /* Cached statistics for all CPUs within a node */ 2033 struct numa_stats { 2034 unsigned long load; 2035 unsigned long runnable; 2036 unsigned long util; 2037 /* Total compute capacity of CPUs on a node */ 2038 unsigned long compute_capacity; 2039 unsigned int nr_running; 2040 unsigned int weight; 2041 enum numa_type node_type; 2042 int idle_cpu; 2043 }; 2044 2045 struct task_numa_env { 2046 struct task_struct *p; 2047 2048 int src_cpu, src_nid; 2049 int dst_cpu, dst_nid; 2050 int imb_numa_nr; 2051 2052 struct numa_stats src_stats, dst_stats; 2053 2054 int imbalance_pct; 2055 int dist; 2056 2057 struct task_struct *best_task; 2058 long best_imp; 2059 int best_cpu; 2060 }; 2061 2062 static unsigned long cpu_load(struct rq *rq); 2063 static unsigned long cpu_runnable(struct rq *rq); 2064 2065 static inline enum 2066 numa_type numa_classify(unsigned int imbalance_pct, 2067 struct numa_stats *ns) 2068 { 2069 if ((ns->nr_running > ns->weight) && 2070 (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) || 2071 ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100)))) 2072 return node_overloaded; 2073 2074 if ((ns->nr_running < ns->weight) || 2075 (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) && 2076 ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100)))) 2077 return node_has_spare; 2078 2079 return node_fully_busy; 2080 } 2081 2082 #ifdef CONFIG_SCHED_SMT 2083 /* Forward declarations of select_idle_sibling helpers */ 2084 static inline bool test_idle_cores(int cpu); 2085 static inline int numa_idle_core(int idle_core, int cpu) 2086 { 2087 if (!static_branch_likely(&sched_smt_present) || 2088 idle_core >= 0 || !test_idle_cores(cpu)) 2089 return idle_core; 2090 2091 /* 2092 * Prefer cores instead of packing HT siblings 2093 * and triggering future load balancing. 2094 */ 2095 if (is_core_idle(cpu)) 2096 idle_core = cpu; 2097 2098 return idle_core; 2099 } 2100 #else 2101 static inline int numa_idle_core(int idle_core, int cpu) 2102 { 2103 return idle_core; 2104 } 2105 #endif 2106 2107 /* 2108 * Gather all necessary information to make NUMA balancing placement 2109 * decisions that are compatible with standard load balancer. This 2110 * borrows code and logic from update_sg_lb_stats but sharing a 2111 * common implementation is impractical. 
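 *
 * (Concretely, the loop below sums cpu_load(), cpu_runnable(),
 * cpu_util_cfs() and capacity_of() over the node's CPUs, counts the
 * runnable CFS tasks, classifies the node via numa_classify() and, when
 * asked, remembers a usable idle CPU, preferring a fully idle core.)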
2112 */ 2113 static void update_numa_stats(struct task_numa_env *env, 2114 struct numa_stats *ns, int nid, 2115 bool find_idle) 2116 { 2117 int cpu, idle_core = -1; 2118 2119 memset(ns, 0, sizeof(*ns)); 2120 ns->idle_cpu = -1; 2121 2122 rcu_read_lock(); 2123 for_each_cpu(cpu, cpumask_of_node(nid)) { 2124 struct rq *rq = cpu_rq(cpu); 2125 2126 ns->load += cpu_load(rq); 2127 ns->runnable += cpu_runnable(rq); 2128 ns->util += cpu_util_cfs(cpu); 2129 ns->nr_running += rq->cfs.h_nr_running; 2130 ns->compute_capacity += capacity_of(cpu); 2131 2132 if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { 2133 if (READ_ONCE(rq->numa_migrate_on) || 2134 !cpumask_test_cpu(cpu, env->p->cpus_ptr)) 2135 continue; 2136 2137 if (ns->idle_cpu == -1) 2138 ns->idle_cpu = cpu; 2139 2140 idle_core = numa_idle_core(idle_core, cpu); 2141 } 2142 } 2143 rcu_read_unlock(); 2144 2145 ns->weight = cpumask_weight(cpumask_of_node(nid)); 2146 2147 ns->node_type = numa_classify(env->imbalance_pct, ns); 2148 2149 if (idle_core >= 0) 2150 ns->idle_cpu = idle_core; 2151 } 2152 2153 static void task_numa_assign(struct task_numa_env *env, 2154 struct task_struct *p, long imp) 2155 { 2156 struct rq *rq = cpu_rq(env->dst_cpu); 2157 2158 /* Check if run-queue part of active NUMA balance. */ 2159 if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) { 2160 int cpu; 2161 int start = env->dst_cpu; 2162 2163 /* Find alternative idle CPU. */ 2164 for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) { 2165 if (cpu == env->best_cpu || !idle_cpu(cpu) || 2166 !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { 2167 continue; 2168 } 2169 2170 env->dst_cpu = cpu; 2171 rq = cpu_rq(env->dst_cpu); 2172 if (!xchg(&rq->numa_migrate_on, 1)) 2173 goto assign; 2174 } 2175 2176 /* Failed to find an alternative idle CPU */ 2177 return; 2178 } 2179 2180 assign: 2181 /* 2182 * Clear previous best_cpu/rq numa-migrate flag, since task now 2183 * found a better CPU to move/swap. 2184 */ 2185 if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) { 2186 rq = cpu_rq(env->best_cpu); 2187 WRITE_ONCE(rq->numa_migrate_on, 0); 2188 } 2189 2190 if (env->best_task) 2191 put_task_struct(env->best_task); 2192 if (p) 2193 get_task_struct(p); 2194 2195 env->best_task = p; 2196 env->best_imp = imp; 2197 env->best_cpu = env->dst_cpu; 2198 } 2199 2200 static bool load_too_imbalanced(long src_load, long dst_load, 2201 struct task_numa_env *env) 2202 { 2203 long imb, old_imb; 2204 long orig_src_load, orig_dst_load; 2205 long src_capacity, dst_capacity; 2206 2207 /* 2208 * The load is corrected for the CPU capacity available on each node. 2209 * 2210 * src_load dst_load 2211 * ------------ vs --------- 2212 * src_capacity dst_capacity 2213 */ 2214 src_capacity = env->src_stats.compute_capacity; 2215 dst_capacity = env->dst_stats.compute_capacity; 2216 2217 imb = abs(dst_load * src_capacity - src_load * dst_capacity); 2218 2219 orig_src_load = env->src_stats.load; 2220 orig_dst_load = env->dst_stats.load; 2221 2222 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity); 2223 2224 /* Would this change make things worse? */ 2225 return (imb > old_imb); 2226 } 2227 2228 /* 2229 * Maximum NUMA importance can be 1998 (2*999); 2230 * SMALLIMP @ 30 would be close to 1998/64. 2231 * Used to deter task migration. 
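 *
 * (1998 / 64 is roughly 31, so requiring the importance to reach
 * SMALLIMP = 30, and to beat the current best by more than SMALLIMP / 2,
 * filters out differentials smaller than about 1/64th of the maximum.)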
2232 */ 2233 #define SMALLIMP 30 2234 2235 /* 2236 * This checks if the overall compute and NUMA accesses of the system would 2237 * be improved if the source task was migrated to the target dst_cpu, taking 2238 * into account that it might be best to exchange the task running on the 2239 * dst_cpu with the source task. 2240 */ 2241 static bool task_numa_compare(struct task_numa_env *env, 2242 long taskimp, long groupimp, bool maymove) 2243 { 2244 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); 2245 struct rq *dst_rq = cpu_rq(env->dst_cpu); 2246 long imp = p_ng ? groupimp : taskimp; 2247 struct task_struct *cur; 2248 long src_load, dst_load; 2249 int dist = env->dist; 2250 long moveimp = imp; 2251 long load; 2252 bool stopsearch = false; 2253 2254 if (READ_ONCE(dst_rq->numa_migrate_on)) 2255 return false; 2256 2257 rcu_read_lock(); 2258 cur = rcu_dereference(dst_rq->curr); 2259 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) 2260 cur = NULL; 2261 2262 /* 2263 * Because we have preemption enabled we can get migrated around and 2264 * end up trying to select ourselves (current == env->p) as a swap candidate. 2265 */ 2266 if (cur == env->p) { 2267 stopsearch = true; 2268 goto unlock; 2269 } 2270 2271 if (!cur) { 2272 if (maymove && moveimp >= env->best_imp) 2273 goto assign; 2274 else 2275 goto unlock; 2276 } 2277 2278 /* Skip this swap candidate if it cannot move to the source cpu. */ 2279 if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) 2280 goto unlock; 2281 2282 /* 2283 * Skip this swap candidate if it is not moving to its preferred 2284 * node and the best task is. 2285 */ 2286 if (env->best_task && 2287 env->best_task->numa_preferred_nid == env->src_nid && 2288 cur->numa_preferred_nid != env->src_nid) { 2289 goto unlock; 2290 } 2291 2292 /* 2293 * "imp" is the fault differential for the source task between the 2294 * source and destination node. Calculate the total differential for 2295 * the source task and potential destination task. The more negative 2296 * the value is, the more remote accesses would be expected to 2297 * be incurred if the tasks were swapped. 2298 * 2299 * If dst and source tasks are in the same NUMA group, or not 2300 * in any group, then look only at task weights. 2301 */ 2302 cur_ng = rcu_dereference(cur->numa_group); 2303 if (cur_ng == p_ng) { 2304 /* 2305 * Do not swap within a group or between tasks that have 2306 * no group if there is spare capacity. Swapping does 2307 * not address the load imbalance and helps one task at 2308 * the cost of punishing another. 2309 */ 2310 if (env->dst_stats.node_type == node_has_spare) 2311 goto unlock; 2312 2313 imp = taskimp + task_weight(cur, env->src_nid, dist) - 2314 task_weight(cur, env->dst_nid, dist); 2315 /* 2316 * Add some hysteresis to prevent swapping the 2317 * tasks within a group over tiny differences. 2318 */ 2319 if (cur_ng) 2320 imp -= imp / 16; 2321 } else { 2322 /* 2323 * Compare the group weights. If a task is all by itself 2324 * (not part of a group), use the task weight instead. 2325 */ 2326 if (cur_ng && p_ng) 2327 imp += group_weight(cur, env->src_nid, dist) - 2328 group_weight(cur, env->dst_nid, dist); 2329 else 2330 imp += task_weight(cur, env->src_nid, dist) - 2331 task_weight(cur, env->dst_nid, dist); 2332 } 2333 2334 /* Discourage picking a task already on its preferred node */ 2335 if (cur->numa_preferred_nid == env->dst_nid) 2336 imp -= imp / 16; 2337 2338 /* 2339 * Encourage picking a task that moves to its preferred node.
2340 * This potentially makes imp larger than its maximum of 2341 * 1998 (see SMALLIMP and task_weight for why) but in this 2342 * case, it does not matter. 2343 */ 2344 if (cur->numa_preferred_nid == env->src_nid) 2345 imp += imp / 8; 2346 2347 if (maymove && moveimp > imp && moveimp > env->best_imp) { 2348 imp = moveimp; 2349 cur = NULL; 2350 goto assign; 2351 } 2352 2353 /* 2354 * Prefer swapping with a task moving to its preferred node over a 2355 * task that is not. 2356 */ 2357 if (env->best_task && cur->numa_preferred_nid == env->src_nid && 2358 env->best_task->numa_preferred_nid != env->src_nid) { 2359 goto assign; 2360 } 2361 2362 /* 2363 * If the NUMA importance is less than SMALLIMP, 2364 * task migration might only result in ping pong 2365 * of tasks and also hurt performance due to cache 2366 * misses. 2367 */ 2368 if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2) 2369 goto unlock; 2370 2371 /* 2372 * In the overloaded case, try and keep the load balanced. 2373 */ 2374 load = task_h_load(env->p) - task_h_load(cur); 2375 if (!load) 2376 goto assign; 2377 2378 dst_load = env->dst_stats.load + load; 2379 src_load = env->src_stats.load - load; 2380 2381 if (load_too_imbalanced(src_load, dst_load, env)) 2382 goto unlock; 2383 2384 assign: 2385 /* Evaluate an idle CPU for a task numa move. */ 2386 if (!cur) { 2387 int cpu = env->dst_stats.idle_cpu; 2388 2389 /* Nothing cached so current CPU went idle since the search. */ 2390 if (cpu < 0) 2391 cpu = env->dst_cpu; 2392 2393 /* 2394 * If the CPU is no longer truly idle and the previous best CPU 2395 * is, keep using it. 2396 */ 2397 if (!idle_cpu(cpu) && env->best_cpu >= 0 && 2398 idle_cpu(env->best_cpu)) { 2399 cpu = env->best_cpu; 2400 } 2401 2402 env->dst_cpu = cpu; 2403 } 2404 2405 task_numa_assign(env, cur, imp); 2406 2407 /* 2408 * If a move to idle is allowed because there is capacity or load 2409 * balance improves then stop the search. While a better swap 2410 * candidate may exist, a search is not free. 2411 */ 2412 if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu)) 2413 stopsearch = true; 2414 2415 /* 2416 * If a swap candidate must be identified and the current best task 2417 * moves to its preferred node then stop the search. 2418 */ 2419 if (!maymove && env->best_task && 2420 env->best_task->numa_preferred_nid == env->src_nid) { 2421 stopsearch = true; 2422 } 2423 unlock: 2424 rcu_read_unlock(); 2425 2426 return stopsearch; 2427 } 2428 2429 static void task_numa_find_cpu(struct task_numa_env *env, 2430 long taskimp, long groupimp) 2431 { 2432 bool maymove = false; 2433 int cpu; 2434 2435 /* 2436 * If dst node has spare capacity, then check if there is an 2437 * imbalance that would be overruled by the load balancer. 2438 */ 2439 if (env->dst_stats.node_type == node_has_spare) { 2440 unsigned int imbalance; 2441 int src_running, dst_running; 2442 2443 /* 2444 * Would movement cause an imbalance? Note that if src has 2445 * more running tasks then the imbalance is ignored as the 2446 * move improves the imbalance from the perspective of the 2447 * CPU load balancer.
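 *
 * For example (illustrative numbers): with 4 tasks running on the
 * source node and 2 on the destination, src_running below evaluates
 * to 3 and dst_running to 3, so the raw imbalance is
 * max(0, 3 - 3) = 0 and a plain task move (maymove) is considered
 * acceptable.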
2448 * */ 2449 src_running = env->src_stats.nr_running - 1; 2450 dst_running = env->dst_stats.nr_running + 1; 2451 imbalance = max(0, dst_running - src_running); 2452 imbalance = adjust_numa_imbalance(imbalance, dst_running, 2453 env->imb_numa_nr); 2454 2455 /* Use idle CPU if there is no imbalance */ 2456 if (!imbalance) { 2457 maymove = true; 2458 if (env->dst_stats.idle_cpu >= 0) { 2459 env->dst_cpu = env->dst_stats.idle_cpu; 2460 task_numa_assign(env, NULL, 0); 2461 return; 2462 } 2463 } 2464 } else { 2465 long src_load, dst_load, load; 2466 /* 2467 * If the improvement from just moving env->p direction is better 2468 * than swapping tasks around, check if a move is possible. 2469 */ 2470 load = task_h_load(env->p); 2471 dst_load = env->dst_stats.load + load; 2472 src_load = env->src_stats.load - load; 2473 maymove = !load_too_imbalanced(src_load, dst_load, env); 2474 } 2475 2476 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { 2477 /* Skip this CPU if the source task cannot migrate */ 2478 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) 2479 continue; 2480 2481 env->dst_cpu = cpu; 2482 if (task_numa_compare(env, taskimp, groupimp, maymove)) 2483 break; 2484 } 2485 } 2486 2487 static int task_numa_migrate(struct task_struct *p) 2488 { 2489 struct task_numa_env env = { 2490 .p = p, 2491 2492 .src_cpu = task_cpu(p), 2493 .src_nid = task_node(p), 2494 2495 .imbalance_pct = 112, 2496 2497 .best_task = NULL, 2498 .best_imp = 0, 2499 .best_cpu = -1, 2500 }; 2501 unsigned long taskweight, groupweight; 2502 struct sched_domain *sd; 2503 long taskimp, groupimp; 2504 struct numa_group *ng; 2505 struct rq *best_rq; 2506 int nid, ret, dist; 2507 2508 /* 2509 * Pick the lowest SD_NUMA domain, as that would have the smallest 2510 * imbalance and would be the first to start moving tasks about. 2511 * 2512 * And we want to avoid any moving of tasks about, as that would create 2513 * random movement of tasks -- counter the numa conditions we're trying 2514 * to satisfy here. 2515 */ 2516 rcu_read_lock(); 2517 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); 2518 if (sd) { 2519 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; 2520 env.imb_numa_nr = sd->imb_numa_nr; 2521 } 2522 rcu_read_unlock(); 2523 2524 /* 2525 * Cpusets can break the scheduler domain tree into smaller 2526 * balance domains, some of which do not cross NUMA boundaries. 2527 * Tasks that are "trapped" in such domains cannot be migrated 2528 * elsewhere, so there is no point in (re)trying. 2529 */ 2530 if (unlikely(!sd)) { 2531 sched_setnuma(p, task_node(p)); 2532 return -EINVAL; 2533 } 2534 2535 env.dst_nid = p->numa_preferred_nid; 2536 dist = env.dist = node_distance(env.src_nid, env.dst_nid); 2537 taskweight = task_weight(p, env.src_nid, dist); 2538 groupweight = group_weight(p, env.src_nid, dist); 2539 update_numa_stats(&env, &env.src_stats, env.src_nid, false); 2540 taskimp = task_weight(p, env.dst_nid, dist) - taskweight; 2541 groupimp = group_weight(p, env.dst_nid, dist) - groupweight; 2542 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); 2543 2544 /* Try to find a spot on the preferred nid. */ 2545 task_numa_find_cpu(&env, taskimp, groupimp); 2546 2547 /* 2548 * Look at other nodes in these cases: 2549 * - there is no space available on the preferred_nid 2550 * - the task is part of a numa_group that is interleaved across 2551 * multiple NUMA nodes; in order to better consolidate the group, 2552 * we need to check other locations. 
2553 */ 2554 ng = deref_curr_numa_group(p); 2555 if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { 2556 for_each_node_state(nid, N_CPU) { 2557 if (nid == env.src_nid || nid == p->numa_preferred_nid) 2558 continue; 2559 2560 dist = node_distance(env.src_nid, env.dst_nid); 2561 if (sched_numa_topology_type == NUMA_BACKPLANE && 2562 dist != env.dist) { 2563 taskweight = task_weight(p, env.src_nid, dist); 2564 groupweight = group_weight(p, env.src_nid, dist); 2565 } 2566 2567 /* Only consider nodes where both task and groups benefit */ 2568 taskimp = task_weight(p, nid, dist) - taskweight; 2569 groupimp = group_weight(p, nid, dist) - groupweight; 2570 if (taskimp < 0 && groupimp < 0) 2571 continue; 2572 2573 env.dist = dist; 2574 env.dst_nid = nid; 2575 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); 2576 task_numa_find_cpu(&env, taskimp, groupimp); 2577 } 2578 } 2579 2580 /* 2581 * If the task is part of a workload that spans multiple NUMA nodes, 2582 * and is migrating into one of the workload's active nodes, remember 2583 * this node as the task's preferred numa node, so the workload can 2584 * settle down. 2585 * A task that migrated to a second choice node will be better off 2586 * trying for a better one later. Do not set the preferred node here. 2587 */ 2588 if (ng) { 2589 if (env.best_cpu == -1) 2590 nid = env.src_nid; 2591 else 2592 nid = cpu_to_node(env.best_cpu); 2593 2594 if (nid != p->numa_preferred_nid) 2595 sched_setnuma(p, nid); 2596 } 2597 2598 /* No better CPU than the current one was found. */ 2599 if (env.best_cpu == -1) { 2600 trace_sched_stick_numa(p, env.src_cpu, NULL, -1); 2601 return -EAGAIN; 2602 } 2603 2604 best_rq = cpu_rq(env.best_cpu); 2605 if (env.best_task == NULL) { 2606 ret = migrate_task_to(p, env.best_cpu); 2607 WRITE_ONCE(best_rq->numa_migrate_on, 0); 2608 if (ret != 0) 2609 trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu); 2610 return ret; 2611 } 2612 2613 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); 2614 WRITE_ONCE(best_rq->numa_migrate_on, 0); 2615 2616 if (ret != 0) 2617 trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu); 2618 put_task_struct(env.best_task); 2619 return ret; 2620 } 2621 2622 /* Attempt to migrate a task to a CPU on the preferred node. */ 2623 static void numa_migrate_preferred(struct task_struct *p) 2624 { 2625 unsigned long interval = HZ; 2626 2627 /* This task has no NUMA fault statistics yet */ 2628 if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults)) 2629 return; 2630 2631 /* Periodically retry migrating the task to the preferred node */ 2632 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); 2633 p->numa_migrate_retry = jiffies + interval; 2634 2635 /* Success if task is already running on preferred CPU */ 2636 if (task_node(p) == p->numa_preferred_nid) 2637 return; 2638 2639 /* Otherwise, try migrate to a CPU on the preferred node */ 2640 task_numa_migrate(p); 2641 } 2642 2643 /* 2644 * Find out how many nodes the workload is actively running on. Do this by 2645 * tracking the nodes from which NUMA hinting faults are triggered. This can 2646 * be different from the set of nodes where the workload's memory is currently 2647 * located. 
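 *
 * For example, a node is counted as active when its hinting faults,
 * multiplied by ACTIVE_NODE_FRACTION, exceed those of the group's
 * busiest node; with a fraction of 3 that means recording more than a
 * third of the maximum.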
2648 */ 2649 static void numa_group_count_active_nodes(struct numa_group *numa_group) 2650 { 2651 unsigned long faults, max_faults = 0; 2652 int nid, active_nodes = 0; 2653 2654 for_each_node_state(nid, N_CPU) { 2655 faults = group_faults_cpu(numa_group, nid); 2656 if (faults > max_faults) 2657 max_faults = faults; 2658 } 2659 2660 for_each_node_state(nid, N_CPU) { 2661 faults = group_faults_cpu(numa_group, nid); 2662 if (faults * ACTIVE_NODE_FRACTION > max_faults) 2663 active_nodes++; 2664 } 2665 2666 numa_group->max_faults_cpu = max_faults; 2667 numa_group->active_nodes = active_nodes; 2668 } 2669 2670 /* 2671 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 2672 * increments. The more local the fault statistics are, the higher the scan 2673 * period will be for the next scan window. If local/(local+remote) ratio is 2674 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) 2675 * the scan period will decrease. Aim for 70% local accesses. 2676 */ 2677 #define NUMA_PERIOD_SLOTS 10 2678 #define NUMA_PERIOD_THRESHOLD 7 2679 2680 /* 2681 * Increase the scan period (slow down scanning) if the majority of 2682 * our memory is already on our local node, or if the majority of 2683 * the page accesses are shared with other processes. 2684 * Otherwise, decrease the scan period. 2685 */ 2686 static void update_task_scan_period(struct task_struct *p, 2687 unsigned long shared, unsigned long private) 2688 { 2689 unsigned int period_slot; 2690 int lr_ratio, ps_ratio; 2691 int diff; 2692 2693 unsigned long remote = p->numa_faults_locality[0]; 2694 unsigned long local = p->numa_faults_locality[1]; 2695 2696 /* 2697 * If there were no recorded hinting faults then either the task is 2698 * completely idle or all activity is in areas that are not of interest 2699 * to automatic numa balancing. Related to that, if there were failed 2700 * migrations then it implies we are migrating too quickly or the local 2701 * node is overloaded. In either case, scan slower. 2702 */ 2703 if (local + shared == 0 || p->numa_faults_locality[2]) { 2704 p->numa_scan_period = min(p->numa_scan_period_max, 2705 p->numa_scan_period << 1); 2706 2707 p->mm->numa_next_scan = jiffies + 2708 msecs_to_jiffies(p->numa_scan_period); 2709 2710 return; 2711 } 2712 2713 /* 2714 * Prepare to scale scan period relative to the current period. 2715 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) 2716 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) 2717 * A ratio exactly at the threshold is treated as the increase case. 2718 */ 2719 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); 2720 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); 2721 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared); 2722 2723 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) { 2724 /* 2725 * Most memory accesses are local. There is no need to 2726 * do fast NUMA scanning, since memory is already local. 2727 */ 2728 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD; 2729 if (!slot) 2730 slot = 1; 2731 diff = slot * period_slot; 2732 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) { 2733 /* 2734 * Most memory accesses are shared with other tasks. 2735 * There is no point in continuing fast NUMA scanning, 2736 * since other tasks may just move the memory elsewhere.
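 *
 * Worked example with illustrative values: if numa_scan_period is
 * 1000 msec, period_slot is DIV_ROUND_UP(1000, 10) = 100. An lr_ratio
 * of 8 against NUMA_PERIOD_THRESHOLD = 7 gives slot = 1, so this
 * branch lengthens the scan period by 100 msec (subject to the clamp
 * below).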
2737 */ 2738 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD; 2739 if (!slot) 2740 slot = 1; 2741 diff = slot * period_slot; 2742 } else { 2743 /* 2744 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS, 2745 * yet they are not on the local NUMA node. Speed up 2746 * NUMA scanning to get the memory moved over. 2747 */ 2748 int ratio = max(lr_ratio, ps_ratio); 2749 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; 2750 } 2751 2752 p->numa_scan_period = clamp(p->numa_scan_period + diff, 2753 task_scan_min(p), task_scan_max(p)); 2754 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 2755 } 2756 2757 /* 2758 * Get the fraction of time the task has been running since the last 2759 * NUMA placement cycle. The scheduler keeps similar statistics, but 2760 * decays those on a 32ms period, which is orders of magnitude off 2761 * from the dozens-of-seconds NUMA balancing period. Use the scheduler 2762 * stats only if the task is so new there are no NUMA statistics yet. 2763 */ 2764 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) 2765 { 2766 u64 runtime, delta, now; 2767 /* Use the start of this time slice to avoid calculations. */ 2768 now = p->se.exec_start; 2769 runtime = p->se.sum_exec_runtime; 2770 2771 if (p->last_task_numa_placement) { 2772 delta = runtime - p->last_sum_exec_runtime; 2773 *period = now - p->last_task_numa_placement; 2774 2775 /* Avoid time going backwards, prevent potential divide error: */ 2776 if (unlikely((s64)*period < 0)) 2777 *period = 0; 2778 } else { 2779 delta = p->se.avg.load_sum; 2780 *period = LOAD_AVG_MAX; 2781 } 2782 2783 p->last_sum_exec_runtime = runtime; 2784 p->last_task_numa_placement = now; 2785 2786 return delta; 2787 } 2788 2789 /* 2790 * Determine the preferred nid for a task in a numa_group. This needs to 2791 * be done in a way that produces consistent results with group_weight, 2792 * otherwise workloads might not converge. 2793 */ 2794 static int preferred_group_nid(struct task_struct *p, int nid) 2795 { 2796 nodemask_t nodes; 2797 int dist; 2798 2799 /* Direct connections between all NUMA nodes. */ 2800 if (sched_numa_topology_type == NUMA_DIRECT) 2801 return nid; 2802 2803 /* 2804 * On a system with glueless mesh NUMA topology, group_weight 2805 * scores nodes according to the number of NUMA hinting faults on 2806 * both the node itself, and on nearby nodes. 2807 */ 2808 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { 2809 unsigned long score, max_score = 0; 2810 int node, max_node = nid; 2811 2812 dist = sched_max_numa_distance; 2813 2814 for_each_node_state(node, N_CPU) { 2815 score = group_weight(p, node, dist); 2816 if (score > max_score) { 2817 max_score = score; 2818 max_node = node; 2819 } 2820 } 2821 return max_node; 2822 } 2823 2824 /* 2825 * Finding the preferred nid in a system with NUMA backplane 2826 * interconnect topology is more involved. The goal is to locate 2827 * tasks from numa_groups near each other in the system, and 2828 * untangle workloads from different sides of the system. This requires 2829 * searching down the hierarchy of node groups, recursively searching 2830 * inside the highest scoring group of nodes. The nodemask tricks 2831 * keep the complexity of the search down. 2832 */ 2833 nodes = node_states[N_CPU]; 2834 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { 2835 unsigned long max_faults = 0; 2836 nodemask_t max_group = NODE_MASK_NONE; 2837 int a, b; 2838 2839 /* Are there nodes at this distance from each other? 
*/ 2840 if (!find_numa_distance(dist)) 2841 continue; 2842 2843 for_each_node_mask(a, nodes) { 2844 unsigned long faults = 0; 2845 nodemask_t this_group; 2846 nodes_clear(this_group); 2847 2848 /* Sum group's NUMA faults; includes a==b case. */ 2849 for_each_node_mask(b, nodes) { 2850 if (node_distance(a, b) < dist) { 2851 faults += group_faults(p, b); 2852 node_set(b, this_group); 2853 node_clear(b, nodes); 2854 } 2855 } 2856 2857 /* Remember the top group. */ 2858 if (faults > max_faults) { 2859 max_faults = faults; 2860 max_group = this_group; 2861 /* 2862 * subtle: at the smallest distance there is 2863 * just one node left in each "group", the 2864 * winner is the preferred nid. 2865 */ 2866 nid = a; 2867 } 2868 } 2869 /* Next round, evaluate the nodes within max_group. */ 2870 if (!max_faults) 2871 break; 2872 nodes = max_group; 2873 } 2874 return nid; 2875 } 2876 2877 static void task_numa_placement(struct task_struct *p) 2878 { 2879 int seq, nid, max_nid = NUMA_NO_NODE; 2880 unsigned long max_faults = 0; 2881 unsigned long fault_types[2] = { 0, 0 }; 2882 unsigned long total_faults; 2883 u64 runtime, period; 2884 spinlock_t *group_lock = NULL; 2885 struct numa_group *ng; 2886 2887 /* 2888 * The p->mm->numa_scan_seq field gets updated without 2889 * exclusive access. Use READ_ONCE() here to ensure 2890 * that the field is read in a single access: 2891 */ 2892 seq = READ_ONCE(p->mm->numa_scan_seq); 2893 if (p->numa_scan_seq == seq) 2894 return; 2895 p->numa_scan_seq = seq; 2896 p->numa_scan_period_max = task_scan_max(p); 2897 2898 total_faults = p->numa_faults_locality[0] + 2899 p->numa_faults_locality[1]; 2900 runtime = numa_get_avg_runtime(p, &period); 2901 2902 /* If the task is part of a group prevent parallel updates to group stats */ 2903 ng = deref_curr_numa_group(p); 2904 if (ng) { 2905 group_lock = &ng->lock; 2906 spin_lock_irq(group_lock); 2907 } 2908 2909 /* Find the node with the highest number of faults */ 2910 for_each_online_node(nid) { 2911 /* Keep track of the offsets in numa_faults array */ 2912 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx; 2913 unsigned long faults = 0, group_faults = 0; 2914 int priv; 2915 2916 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { 2917 long diff, f_diff, f_weight; 2918 2919 mem_idx = task_faults_idx(NUMA_MEM, nid, priv); 2920 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv); 2921 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); 2922 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv); 2923 2924 /* Decay existing window, copy faults since last scan */ 2925 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2; 2926 fault_types[priv] += p->numa_faults[membuf_idx]; 2927 p->numa_faults[membuf_idx] = 0; 2928 2929 /* 2930 * Normalize the faults_from, so all tasks in a group 2931 * count according to CPU use, instead of by the raw 2932 * number of faults. Tasks with little runtime have 2933 * little over-all impact on throughput, and thus their 2934 * faults are less important. 
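 *
 * (runtime << 16 / period expresses the task's CPU use over the
 * placement period in 16.16 fixed point: a task that ran for half of
 * the period yields roughly 32768, i.e. 0.5, and its buffered CPU
 * faults are weighted by that factor, in addition to the normalisation
 * by total_faults, before being folded into the decaying window below.)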
2935 */ 2936 f_weight = div64_u64(runtime << 16, period + 1); 2937 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) / 2938 (total_faults + 1); 2939 f_diff = f_weight - p->numa_faults[cpu_idx] / 2; 2940 p->numa_faults[cpubuf_idx] = 0; 2941 2942 p->numa_faults[mem_idx] += diff; 2943 p->numa_faults[cpu_idx] += f_diff; 2944 faults += p->numa_faults[mem_idx]; 2945 p->total_numa_faults += diff; 2946 if (ng) { 2947 /* 2948 * safe because we can only change our own group 2949 * 2950 * mem_idx represents the offset for a given 2951 * nid and priv in a specific region because it 2952 * is at the beginning of the numa_faults array. 2953 */ 2954 ng->faults[mem_idx] += diff; 2955 ng->faults[cpu_idx] += f_diff; 2956 ng->total_faults += diff; 2957 group_faults += ng->faults[mem_idx]; 2958 } 2959 } 2960 2961 if (!ng) { 2962 if (faults > max_faults) { 2963 max_faults = faults; 2964 max_nid = nid; 2965 } 2966 } else if (group_faults > max_faults) { 2967 max_faults = group_faults; 2968 max_nid = nid; 2969 } 2970 } 2971 2972 /* Cannot migrate task to CPU-less node */ 2973 max_nid = numa_nearest_node(max_nid, N_CPU); 2974 2975 if (ng) { 2976 numa_group_count_active_nodes(ng); 2977 spin_unlock_irq(group_lock); 2978 max_nid = preferred_group_nid(p, max_nid); 2979 } 2980 2981 if (max_faults) { 2982 /* Set the new preferred node */ 2983 if (max_nid != p->numa_preferred_nid) 2984 sched_setnuma(p, max_nid); 2985 } 2986 2987 update_task_scan_period(p, fault_types[0], fault_types[1]); 2988 } 2989 2990 static inline int get_numa_group(struct numa_group *grp) 2991 { 2992 return refcount_inc_not_zero(&grp->refcount); 2993 } 2994 2995 static inline void put_numa_group(struct numa_group *grp) 2996 { 2997 if (refcount_dec_and_test(&grp->refcount)) 2998 kfree_rcu(grp, rcu); 2999 } 3000 3001 static void task_numa_group(struct task_struct *p, int cpupid, int flags, 3002 int *priv) 3003 { 3004 struct numa_group *grp, *my_grp; 3005 struct task_struct *tsk; 3006 bool join = false; 3007 int cpu = cpupid_to_cpu(cpupid); 3008 int i; 3009 3010 if (unlikely(!deref_curr_numa_group(p))) { 3011 unsigned int size = sizeof(struct numa_group) + 3012 NR_NUMA_HINT_FAULT_STATS * 3013 nr_node_ids * sizeof(unsigned long); 3014 3015 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); 3016 if (!grp) 3017 return; 3018 3019 refcount_set(&grp->refcount, 1); 3020 grp->active_nodes = 1; 3021 grp->max_faults_cpu = 0; 3022 spin_lock_init(&grp->lock); 3023 grp->gid = p->pid; 3024 3025 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 3026 grp->faults[i] = p->numa_faults[i]; 3027 3028 grp->total_faults = p->total_numa_faults; 3029 3030 grp->nr_tasks++; 3031 rcu_assign_pointer(p->numa_group, grp); 3032 } 3033 3034 rcu_read_lock(); 3035 tsk = READ_ONCE(cpu_rq(cpu)->curr); 3036 3037 if (!cpupid_match_pid(tsk, cpupid)) 3038 goto no_join; 3039 3040 grp = rcu_dereference(tsk->numa_group); 3041 if (!grp) 3042 goto no_join; 3043 3044 my_grp = deref_curr_numa_group(p); 3045 if (grp == my_grp) 3046 goto no_join; 3047 3048 /* 3049 * Only join the other group if its bigger; if we're the bigger group, 3050 * the other task will join us. 3051 */ 3052 if (my_grp->nr_tasks > grp->nr_tasks) 3053 goto no_join; 3054 3055 /* 3056 * Tie-break on the grp address. 3057 */ 3058 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) 3059 goto no_join; 3060 3061 /* Always join threads in the same process. 
*/ 3062 if (tsk->mm == current->mm) 3063 join = true; 3064 3065 /* Simple filter to avoid false positives due to PID collisions */ 3066 if (flags & TNF_SHARED) 3067 join = true; 3068 3069 /* Update priv based on whether false sharing was detected */ 3070 *priv = !join; 3071 3072 if (join && !get_numa_group(grp)) 3073 goto no_join; 3074 3075 rcu_read_unlock(); 3076 3077 if (!join) 3078 return; 3079 3080 WARN_ON_ONCE(irqs_disabled()); 3081 double_lock_irq(&my_grp->lock, &grp->lock); 3082 3083 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { 3084 my_grp->faults[i] -= p->numa_faults[i]; 3085 grp->faults[i] += p->numa_faults[i]; 3086 } 3087 my_grp->total_faults -= p->total_numa_faults; 3088 grp->total_faults += p->total_numa_faults; 3089 3090 my_grp->nr_tasks--; 3091 grp->nr_tasks++; 3092 3093 spin_unlock(&my_grp->lock); 3094 spin_unlock_irq(&grp->lock); 3095 3096 rcu_assign_pointer(p->numa_group, grp); 3097 3098 put_numa_group(my_grp); 3099 return; 3100 3101 no_join: 3102 rcu_read_unlock(); 3103 return; 3104 } 3105 3106 /* 3107 * Get rid of NUMA statistics associated with a task (either current or dead). 3108 * If @final is set, the task is dead and has reached refcount zero, so we can 3109 * safely free all relevant data structures. Otherwise, there might be 3110 * concurrent reads from places like load balancing and procfs, and we should 3111 * reset the data back to default state without freeing ->numa_faults. 3112 */ 3113 void task_numa_free(struct task_struct *p, bool final) 3114 { 3115 /* safe: p either is current or is being freed by current */ 3116 struct numa_group *grp = rcu_dereference_raw(p->numa_group); 3117 unsigned long *numa_faults = p->numa_faults; 3118 unsigned long flags; 3119 int i; 3120 3121 if (!numa_faults) 3122 return; 3123 3124 if (grp) { 3125 spin_lock_irqsave(&grp->lock, flags); 3126 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 3127 grp->faults[i] -= p->numa_faults[i]; 3128 grp->total_faults -= p->total_numa_faults; 3129 3130 grp->nr_tasks--; 3131 spin_unlock_irqrestore(&grp->lock, flags); 3132 RCU_INIT_POINTER(p->numa_group, NULL); 3133 put_numa_group(grp); 3134 } 3135 3136 if (final) { 3137 p->numa_faults = NULL; 3138 kfree(numa_faults); 3139 } else { 3140 p->total_numa_faults = 0; 3141 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 3142 numa_faults[i] = 0; 3143 } 3144 } 3145 3146 /* 3147 * Got a PROT_NONE fault for a page on @node. 3148 */ 3149 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) 3150 { 3151 struct task_struct *p = current; 3152 bool migrated = flags & TNF_MIGRATED; 3153 int cpu_node = task_node(current); 3154 int local = !!(flags & TNF_FAULT_LOCAL); 3155 struct numa_group *ng; 3156 int priv; 3157 3158 if (!static_branch_likely(&sched_numa_balancing)) 3159 return; 3160 3161 /* for example, ksmd faulting in a user's mm */ 3162 if (!p->mm) 3163 return; 3164 3165 /* 3166 * NUMA faults statistics are unnecessary for the slow memory 3167 * node for memory tiering mode. 
3168 */ 3169 if (!node_is_toptier(mem_node) && 3170 (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING || 3171 !cpupid_valid(last_cpupid))) 3172 return; 3173 3174 /* Allocate buffer to track faults on a per-node basis */ 3175 if (unlikely(!p->numa_faults)) { 3176 int size = sizeof(*p->numa_faults) * 3177 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; 3178 3179 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); 3180 if (!p->numa_faults) 3181 return; 3182 3183 p->total_numa_faults = 0; 3184 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 3185 } 3186 3187 /* 3188 * First accesses are treated as private, otherwise consider accesses 3189 * to be private if the accessing pid has not changed 3190 */ 3191 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { 3192 priv = 1; 3193 } else { 3194 priv = cpupid_match_pid(p, last_cpupid); 3195 if (!priv && !(flags & TNF_NO_GROUP)) 3196 task_numa_group(p, last_cpupid, flags, &priv); 3197 } 3198 3199 /* 3200 * If a workload spans multiple NUMA nodes, a shared fault that 3201 * occurs wholly within the set of nodes that the workload is 3202 * actively using should be counted as local. This allows the 3203 * scan rate to slow down when a workload has settled down. 3204 */ 3205 ng = deref_curr_numa_group(p); 3206 if (!priv && !local && ng && ng->active_nodes > 1 && 3207 numa_is_active_node(cpu_node, ng) && 3208 numa_is_active_node(mem_node, ng)) 3209 local = 1; 3210 3211 /* 3212 * Retry to migrate task to preferred node periodically, in case it 3213 * previously failed, or the scheduler moved us. 3214 */ 3215 if (time_after(jiffies, p->numa_migrate_retry)) { 3216 task_numa_placement(p); 3217 numa_migrate_preferred(p); 3218 } 3219 3220 if (migrated) 3221 p->numa_pages_migrated += pages; 3222 if (flags & TNF_MIGRATE_FAIL) 3223 p->numa_faults_locality[2] += pages; 3224 3225 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; 3226 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; 3227 p->numa_faults_locality[local] += pages; 3228 } 3229 3230 static void reset_ptenuma_scan(struct task_struct *p) 3231 { 3232 /* 3233 * We only did a read acquisition of the mmap sem, so 3234 * p->mm->numa_scan_seq is written to without exclusive access 3235 * and the update is not guaranteed to be atomic. That's not 3236 * much of an issue though, since this is just used for 3237 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not 3238 * expensive, to avoid any form of compiler optimizations: 3239 */ 3240 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1); 3241 p->mm->numa_scan_offset = 0; 3242 } 3243 3244 static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) 3245 { 3246 unsigned long pids; 3247 /* 3248 * Allow unconditional access first two times, so that all the (pages) 3249 * of VMAs get prot_none fault introduced irrespective of accesses. 3250 * This is also done to avoid any side effect of task scanning 3251 * amplifying the unfairness of disjoint set of VMAs' access. 
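 *
 * (The per-VMA access filter below keys on
 * hash_32(current->pid, ilog2(BITS_PER_LONG)), i.e. a 6-bit bucket on
 * 64-bit kernels, so unrelated PIDs can collide into the same bit.
 * That is acceptable here: the worst case is scanning a VMA this task
 * did not actually touch, since the filter is only a heuristic.)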
3252 */ 3253 if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2) 3254 return true; 3255 3256 pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; 3257 if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids)) 3258 return true; 3259 3260 /* 3261 * Complete a scan that has already started regardless of PID access, or 3262 * some VMAs may never be scanned in multi-threaded applications: 3263 */ 3264 if (mm->numa_scan_offset > vma->vm_start) { 3265 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID); 3266 return true; 3267 } 3268 3269 /* 3270 * This vma has not been accessed for a while; if the number of 3271 * threads in the same process is low, no other thread can help 3272 * scan this vma, so force a vma scan. 3273 */ 3274 if (READ_ONCE(mm->numa_scan_seq) > 3275 (vma->numab_state->prev_scan_seq + get_nr_threads(current))) 3276 return true; 3277 3278 return false; 3279 } 3280 3281 #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay) 3282 3283 /* 3284 * The expensive part of numa migration is done from task_work context. 3285 * Triggered from task_tick_numa(). 3286 */ 3287 static void task_numa_work(struct callback_head *work) 3288 { 3289 unsigned long migrate, next_scan, now = jiffies; 3290 struct task_struct *p = current; 3291 struct mm_struct *mm = p->mm; 3292 u64 runtime = p->se.sum_exec_runtime; 3293 struct vm_area_struct *vma; 3294 unsigned long start, end; 3295 unsigned long nr_pte_updates = 0; 3296 long pages, virtpages; 3297 struct vma_iterator vmi; 3298 bool vma_pids_skipped; 3299 bool vma_pids_forced = false; 3300 3301 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); 3302 3303 work->next = work; 3304 /* 3305 * Who cares about NUMA placement when they're dying. 3306 * 3307 * NOTE: make sure not to dereference p->mm before this check, 3308 * exit_task_work() happens _after_ exit_mm() so we could be called 3309 * without p->mm even though we still had it when we enqueued this 3310 * work. 3311 */ 3312 if (p->flags & PF_EXITING) 3313 return; 3314 3315 if (!mm->numa_next_scan) { 3316 mm->numa_next_scan = now + 3317 msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 3318 } 3319 3320 /* 3321 * Enforce maximal scan/migration frequency.. 3322 */ 3323 migrate = mm->numa_next_scan; 3324 if (time_before(now, migrate)) 3325 return; 3326 3327 if (p->numa_scan_period == 0) { 3328 p->numa_scan_period_max = task_scan_max(p); 3329 p->numa_scan_period = task_scan_start(p); 3330 } 3331 3332 next_scan = now + msecs_to_jiffies(p->numa_scan_period); 3333 if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan)) 3334 return; 3335 3336 /* 3337 * Delay this task enough that another task of this mm will likely win 3338 * the next time around. 3339 */ 3340 p->node_stamp += 2 * TICK_NSEC; 3341 3342 pages = sysctl_numa_balancing_scan_size; 3343 pages <<= 20 - PAGE_SHIFT; /* MB in pages */ 3344 virtpages = pages * 8; /* Scan up to this much virtual space */ 3345 if (!pages) 3346 return; 3347 3348 3349 if (!mmap_read_trylock(mm)) 3350 return; 3351 3352 /* 3353 * VMAs are skipped if the current PID has not trapped a fault within 3354 * the VMA recently. Allow scanning to be forced if there is no 3355 * suitable VMA remaining.
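 *
 * (Concretely: if every candidate VMA in a pass is rejected by the PID
 * filter, vma_pids_skipped is set and the walk is retried once with
 * vma_pids_forced = true, which scans a single VMA before breaking
 * out, guaranteeing forward progress.)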
3356 */ 3357 vma_pids_skipped = false; 3358 3359 retry_pids: 3360 start = mm->numa_scan_offset; 3361 vma_iter_init(&vmi, mm, start); 3362 vma = vma_next(&vmi); 3363 if (!vma) { 3364 reset_ptenuma_scan(p); 3365 start = 0; 3366 vma_iter_set(&vmi, start); 3367 vma = vma_next(&vmi); 3368 } 3369 3370 for (; vma; vma = vma_next(&vmi)) { 3371 if (!vma_migratable(vma) || !vma_policy_mof(vma) || 3372 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { 3373 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); 3374 continue; 3375 } 3376 3377 /* 3378 * Shared library pages mapped by multiple processes are not 3379 * migrated as it is expected they are cache replicated. Avoid 3380 * hinting faults in read-only file-backed mappings or the vDSO 3381 * as migrating the pages will be of marginal benefit. 3382 */ 3383 if (!vma->vm_mm || 3384 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) { 3385 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO); 3386 continue; 3387 } 3388 3389 /* 3390 * Skip inaccessible VMAs to avoid any confusion between 3391 * PROT_NONE and NUMA hinting PTEs 3392 */ 3393 if (!vma_is_accessible(vma)) { 3394 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); 3395 continue; 3396 } 3397 3398 /* Initialise new per-VMA NUMAB state. */ 3399 if (!vma->numab_state) { 3400 struct vma_numab_state *ptr; 3401 3402 ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); 3403 if (!ptr) 3404 continue; 3405 3406 if (cmpxchg(&vma->numab_state, NULL, ptr)) { 3407 kfree(ptr); 3408 continue; 3409 } 3410 3411 vma->numab_state->start_scan_seq = mm->numa_scan_seq; 3412 3413 vma->numab_state->next_scan = now + 3414 msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 3415 3416 /* The first reset happens 4 scan delays after the VMA's next_scan time */ 3417 vma->numab_state->pids_active_reset = vma->numab_state->next_scan + 3418 msecs_to_jiffies(VMA_PID_RESET_PERIOD); 3419 3420 /* 3421 * Ensure prev_scan_seq does not match numa_scan_seq, 3422 * to prevent VMAs being skipped prematurely on the 3423 * first scan: 3424 */ 3425 vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1; 3426 } 3427 3428 /* 3429 * Scanning the VMAs of short-lived tasks adds more overhead. So 3430 * delay the scan for new VMAs. 3431 */ 3432 if (mm->numa_scan_seq && time_before(jiffies, 3433 vma->numab_state->next_scan)) { 3434 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY); 3435 continue; 3436 } 3437 3438 /* Reset access PIDs regularly for old VMAs. */ 3439 if (mm->numa_scan_seq && 3440 time_after(jiffies, vma->numab_state->pids_active_reset)) { 3441 vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset + 3442 msecs_to_jiffies(VMA_PID_RESET_PERIOD); 3443 vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]); 3444 vma->numab_state->pids_active[1] = 0; 3445 } 3446 3447 /* Do not rescan VMAs twice within the same sequence. */ 3448 if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) { 3449 mm->numa_scan_offset = vma->vm_end; 3450 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED); 3451 continue; 3452 } 3453 3454 /* 3455 * Do not scan the VMA if the task has not accessed it, unless no other 3456 * VMA candidate exists.
3457 */ 3458 if (!vma_pids_forced && !vma_is_accessed(mm, vma)) { 3459 vma_pids_skipped = true; 3460 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); 3461 continue; 3462 } 3463 3464 do { 3465 start = max(start, vma->vm_start); 3466 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 3467 end = min(end, vma->vm_end); 3468 nr_pte_updates = change_prot_numa(vma, start, end); 3469 3470 /* 3471 * Try to scan sysctl_numa_balancing_scan_size worth of 3472 * hpages that have at least one present PTE that 3473 * is not already PTE-numa. If the VMA contains 3474 * areas that are unused or already full of prot_numa 3475 * PTEs, scan up to virtpages, to skip through those 3476 * areas faster. 3477 */ 3478 if (nr_pte_updates) 3479 pages -= (end - start) >> PAGE_SHIFT; 3480 virtpages -= (end - start) >> PAGE_SHIFT; 3481 3482 start = end; 3483 if (pages <= 0 || virtpages <= 0) 3484 goto out; 3485 3486 cond_resched(); 3487 } while (end != vma->vm_end); 3488 3489 /* VMA scan is complete, do not scan until next sequence. */ 3490 vma->numab_state->prev_scan_seq = mm->numa_scan_seq; 3491 3492 /* 3493 * Only force scan within one VMA at a time, to limit the 3494 * cost of scanning a potentially uninteresting VMA. 3495 */ 3496 if (vma_pids_forced) 3497 break; 3498 } 3499 3500 /* 3501 * If no VMAs are remaining and VMAs were skipped due to the PID 3502 * not accessing the VMA previously, then force a scan to ensure 3503 * forward progress: 3504 */ 3505 if (!vma && !vma_pids_forced && vma_pids_skipped) { 3506 vma_pids_forced = true; 3507 goto retry_pids; 3508 } 3509 3510 out: 3511 /* 3512 * It is possible to reach the end of the VMA list but the last few 3513 * VMAs are not guaranteed to be vma_migratable. If they are not, we 3514 * would find the !migratable VMA on the next scan but not reset the 3515 * scanner to the start, so check it now. 3516 */ 3517 if (vma) 3518 mm->numa_scan_offset = start; 3519 else 3520 reset_ptenuma_scan(p); 3521 mmap_read_unlock(mm); 3522 3523 /* 3524 * Make sure tasks use at least 32x as much time to run other code 3525 * than they used here, to limit NUMA PTE scanning overhead to 3% max. 3526 * Usually update_task_scan_period slows down scanning enough; on an 3527 * overloaded system we need to limit overhead on a per task basis. 3528 */ 3529 if (unlikely(p->se.sum_exec_runtime != runtime)) { 3530 u64 diff = p->se.sum_exec_runtime - runtime; 3531 p->node_stamp += 32 * diff; 3532 } 3533 } 3534 3535 void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) 3536 { 3537 int mm_users = 0; 3538 struct mm_struct *mm = p->mm; 3539 3540 if (mm) { 3541 mm_users = atomic_read(&mm->mm_users); 3542 if (mm_users == 1) { 3543 mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 3544 mm->numa_scan_seq = 0; 3545 } 3546 } 3547 p->node_stamp = 0; 3548 p->numa_scan_seq = mm ?
mm->numa_scan_seq : 0; 3549 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 3550 p->numa_migrate_retry = 0; 3551 /* Protect against double add, see task_tick_numa and task_numa_work */ 3552 p->numa_work.next = &p->numa_work; 3553 p->numa_faults = NULL; 3554 p->numa_pages_migrated = 0; 3555 p->total_numa_faults = 0; 3556 RCU_INIT_POINTER(p->numa_group, NULL); 3557 p->last_task_numa_placement = 0; 3558 p->last_sum_exec_runtime = 0; 3559 3560 init_task_work(&p->numa_work, task_numa_work); 3561 3562 /* New address space, reset the preferred nid */ 3563 if (!(clone_flags & CLONE_VM)) { 3564 p->numa_preferred_nid = NUMA_NO_NODE; 3565 return; 3566 } 3567 3568 /* 3569 * New thread, keep existing numa_preferred_nid which should be copied 3570 * already by arch_dup_task_struct but stagger when scans start. 3571 */ 3572 if (mm) { 3573 unsigned int delay; 3574 3575 delay = min_t(unsigned int, task_scan_max(current), 3576 current->numa_scan_period * mm_users * NSEC_PER_MSEC); 3577 delay += 2 * TICK_NSEC; 3578 p->node_stamp = delay; 3579 } 3580 } 3581 3582 /* 3583 * Drive the periodic memory faults.. 3584 */ 3585 static void task_tick_numa(struct rq *rq, struct task_struct *curr) 3586 { 3587 struct callback_head *work = &curr->numa_work; 3588 u64 period, now; 3589 3590 /* 3591 * We don't care about NUMA placement if we don't have memory. 3592 */ 3593 if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) 3594 return; 3595 3596 /* 3597 * Using runtime rather than walltime has the dual advantage that 3598 * we (mostly) drive the selection from busy threads and that the 3599 * task needs to have done some actual work before we bother with 3600 * NUMA placement. 3601 */ 3602 now = curr->se.sum_exec_runtime; 3603 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; 3604 3605 if (now > curr->node_stamp + period) { 3606 if (!curr->node_stamp) 3607 curr->numa_scan_period = task_scan_start(curr); 3608 curr->node_stamp += period; 3609 3610 if (!time_before(jiffies, curr->mm->numa_next_scan)) 3611 task_work_add(curr, work, TWA_RESUME); 3612 } 3613 } 3614 3615 static void update_scan_period(struct task_struct *p, int new_cpu) 3616 { 3617 int src_nid = cpu_to_node(task_cpu(p)); 3618 int dst_nid = cpu_to_node(new_cpu); 3619 3620 if (!static_branch_likely(&sched_numa_balancing)) 3621 return; 3622 3623 if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) 3624 return; 3625 3626 if (src_nid == dst_nid) 3627 return; 3628 3629 /* 3630 * Allow resets if faults have been trapped before one scan 3631 * has completed. This is most likely due to a new task that 3632 * is pulled cross-node due to wakeups or load balancing. 3633 */ 3634 if (p->numa_scan_seq) { 3635 /* 3636 * Avoid scan adjustments if moving to the preferred 3637 * node or if the task was not previously running on 3638 * the preferred node. 
3639 */ 3640 if (dst_nid == p->numa_preferred_nid || 3641 (p->numa_preferred_nid != NUMA_NO_NODE && 3642 src_nid != p->numa_preferred_nid)) 3643 return; 3644 } 3645 3646 p->numa_scan_period = task_scan_start(p); 3647 } 3648 3649 #else 3650 static void task_tick_numa(struct rq *rq, struct task_struct *curr) 3651 { 3652 } 3653 3654 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) 3655 { 3656 } 3657 3658 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) 3659 { 3660 } 3661 3662 static inline void update_scan_period(struct task_struct *p, int new_cpu) 3663 { 3664 } 3665 3666 #endif /* CONFIG_NUMA_BALANCING */ 3667 3668 static void 3669 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 3670 { 3671 update_load_add(&cfs_rq->load, se->load.weight); 3672 #ifdef CONFIG_SMP 3673 if (entity_is_task(se)) { 3674 struct rq *rq = rq_of(cfs_rq); 3675 3676 account_numa_enqueue(rq, task_of(se)); 3677 list_add(&se->group_node, &rq->cfs_tasks); 3678 } 3679 #endif 3680 cfs_rq->nr_running++; 3681 if (se_is_idle(se)) 3682 cfs_rq->idle_nr_running++; 3683 } 3684 3685 static void 3686 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 3687 { 3688 update_load_sub(&cfs_rq->load, se->load.weight); 3689 #ifdef CONFIG_SMP 3690 if (entity_is_task(se)) { 3691 account_numa_dequeue(rq_of(cfs_rq), task_of(se)); 3692 list_del_init(&se->group_node); 3693 } 3694 #endif 3695 cfs_rq->nr_running--; 3696 if (se_is_idle(se)) 3697 cfs_rq->idle_nr_running--; 3698 } 3699 3700 /* 3701 * Signed add and clamp on underflow. 3702 * 3703 * Explicitly do a load-store to ensure the intermediate value never hits 3704 * memory. This allows lockless observations without ever seeing the negative 3705 * values. 3706 */ 3707 #define add_positive(_ptr, _val) do { \ 3708 typeof(_ptr) ptr = (_ptr); \ 3709 typeof(_val) val = (_val); \ 3710 typeof(*ptr) res, var = READ_ONCE(*ptr); \ 3711 \ 3712 res = var + val; \ 3713 \ 3714 if (val < 0 && res > var) \ 3715 res = 0; \ 3716 \ 3717 WRITE_ONCE(*ptr, res); \ 3718 } while (0) 3719 3720 /* 3721 * Unsigned subtract and clamp on underflow. 3722 * 3723 * Explicitly do a load-store to ensure the intermediate value never hits 3724 * memory. This allows lockless observations without ever seeing the negative 3725 * values. 3726 */ 3727 #define sub_positive(_ptr, _val) do { \ 3728 typeof(_ptr) ptr = (_ptr); \ 3729 typeof(*ptr) val = (_val); \ 3730 typeof(*ptr) res, var = READ_ONCE(*ptr); \ 3731 res = var - val; \ 3732 if (res > var) \ 3733 res = 0; \ 3734 WRITE_ONCE(*ptr, res); \ 3735 } while (0) 3736 3737 /* 3738 * Remove and clamp on negative, from a local variable. 3739 * 3740 * A variant of sub_positive(), which does not use explicit load-store 3741 * and is thus optimized for local variable updates. 
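 *
 * Illustrative usage on a local variable (hypothetical values, not
 * from this file):
 *
 *	unsigned long left = 100;
 *
 *	lsub_positive(&left, 250);
 *
 * leaves 'left' at 0 instead of letting the subtraction wrap around.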
3742 */ 3743 #define lsub_positive(_ptr, _val) do { \ 3744 typeof(_ptr) ptr = (_ptr); \ 3745 *ptr -= min_t(typeof(*ptr), *ptr, _val); \ 3746 } while (0) 3747 3748 #ifdef CONFIG_SMP 3749 static inline void 3750 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3751 { 3752 cfs_rq->avg.load_avg += se->avg.load_avg; 3753 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; 3754 } 3755 3756 static inline void 3757 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3758 { 3759 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); 3760 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); 3761 /* See update_cfs_rq_load_avg() */ 3762 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, 3763 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); 3764 } 3765 #else 3766 static inline void 3767 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 3768 static inline void 3769 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 3770 #endif 3771 3772 static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); 3773 3774 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 3775 unsigned long weight) 3776 { 3777 bool curr = cfs_rq->curr == se; 3778 3779 if (se->on_rq) { 3780 /* commit outstanding execution time */ 3781 update_curr(cfs_rq); 3782 update_entity_lag(cfs_rq, se); 3783 se->deadline -= se->vruntime; 3784 se->rel_deadline = 1; 3785 if (!curr) 3786 __dequeue_entity(cfs_rq, se); 3787 update_load_sub(&cfs_rq->load, se->load.weight); 3788 } 3789 dequeue_load_avg(cfs_rq, se); 3790 3791 /* 3792 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), 3793 * we need to scale se->vlag when w_i changes. 3794 */ 3795 se->vlag = div_s64(se->vlag * se->load.weight, weight); 3796 if (se->rel_deadline) 3797 se->deadline = div_s64(se->deadline * se->load.weight, weight); 3798 3799 update_load_set(&se->load, weight); 3800 3801 #ifdef CONFIG_SMP 3802 do { 3803 u32 divider = get_pelt_divider(&se->avg); 3804 3805 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); 3806 } while (0); 3807 #endif 3808 3809 enqueue_load_avg(cfs_rq, se); 3810 if (se->on_rq) { 3811 update_load_add(&cfs_rq->load, se->load.weight); 3812 place_entity(cfs_rq, se, 0); 3813 if (!curr) 3814 __enqueue_entity(cfs_rq, se); 3815 3816 /* 3817 * The entity's vruntime has been adjusted, so let's check 3818 * whether the rq-wide min_vruntime needs updated too. Since 3819 * the calculations above require stable min_vruntime rather 3820 * than up-to-date one, we do the update at the end of the 3821 * reweight process. 3822 */ 3823 update_min_vruntime(cfs_rq); 3824 } 3825 } 3826 3827 static void reweight_task_fair(struct rq *rq, struct task_struct *p, 3828 const struct load_weight *lw) 3829 { 3830 struct sched_entity *se = &p->se; 3831 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3832 struct load_weight *load = &se->load; 3833 3834 reweight_entity(cfs_rq, se, lw->weight); 3835 load->inv_weight = lw->inv_weight; 3836 } 3837 3838 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); 3839 3840 #ifdef CONFIG_FAIR_GROUP_SCHED 3841 #ifdef CONFIG_SMP 3842 /* 3843 * All this does is approximate the hierarchical proportion which includes that 3844 * global sum we all love to hate. 3845 * 3846 * That is, the weight of a group entity, is the proportional share of the 3847 * group weight based on the group runqueue weights. 
That is: 3848 * 3849 * tg->weight * grq->load.weight 3850 * ge->load.weight = ----------------------------- (1) 3851 * \Sum grq->load.weight 3852 * 3853 * Now, because computing that sum is prohibitively expensive to compute (been 3854 * there, done that) we approximate it with this average stuff. The average 3855 * moves slower and therefore the approximation is cheaper and more stable. 3856 * 3857 * So instead of the above, we substitute: 3858 * 3859 * grq->load.weight -> grq->avg.load_avg (2) 3860 * 3861 * which yields the following: 3862 * 3863 * tg->weight * grq->avg.load_avg 3864 * ge->load.weight = ------------------------------ (3) 3865 * tg->load_avg 3866 * 3867 * Where: tg->load_avg ~= \Sum grq->avg.load_avg 3868 * 3869 * That is shares_avg, and it is right (given the approximation (2)). 3870 * 3871 * The problem with it is that because the average is slow -- it was designed 3872 * to be exactly that of course -- this leads to transients in boundary 3873 * conditions. In specific, the case where the group was idle and we start the 3874 * one task. It takes time for our CPU's grq->avg.load_avg to build up, 3875 * yielding bad latency etc.. 3876 * 3877 * Now, in that special case (1) reduces to: 3878 * 3879 * tg->weight * grq->load.weight 3880 * ge->load.weight = ----------------------------- = tg->weight (4) 3881 * grp->load.weight 3882 * 3883 * That is, the sum collapses because all other CPUs are idle; the UP scenario. 3884 * 3885 * So what we do is modify our approximation (3) to approach (4) in the (near) 3886 * UP case, like: 3887 * 3888 * ge->load.weight = 3889 * 3890 * tg->weight * grq->load.weight 3891 * --------------------------------------------------- (5) 3892 * tg->load_avg - grq->avg.load_avg + grq->load.weight 3893 * 3894 * But because grq->load.weight can drop to 0, resulting in a divide by zero, 3895 * we need to use grq->avg.load_avg as its lower bound, which then gives: 3896 * 3897 * 3898 * tg->weight * grq->load.weight 3899 * ge->load.weight = ----------------------------- (6) 3900 * tg_load_avg' 3901 * 3902 * Where: 3903 * 3904 * tg_load_avg' = tg->load_avg - grq->avg.load_avg + 3905 * max(grq->load.weight, grq->avg.load_avg) 3906 * 3907 * And that is shares_weight and is icky. In the (near) UP case it approaches 3908 * (4) while in the normal case it approaches (3). It consistently 3909 * overestimates the ge->load.weight and therefore: 3910 * 3911 * \Sum ge->load.weight >= tg->weight 3912 * 3913 * hence icky! 3914 */ 3915 static long calc_group_shares(struct cfs_rq *cfs_rq) 3916 { 3917 long tg_weight, tg_shares, load, shares; 3918 struct task_group *tg = cfs_rq->tg; 3919 3920 tg_shares = READ_ONCE(tg->shares); 3921 3922 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg); 3923 3924 tg_weight = atomic_long_read(&tg->load_avg); 3925 3926 /* Ensure tg_weight >= load */ 3927 tg_weight -= cfs_rq->tg_load_avg_contrib; 3928 tg_weight += load; 3929 3930 shares = (tg_shares * load); 3931 if (tg_weight) 3932 shares /= tg_weight; 3933 3934 /* 3935 * MIN_SHARES has to be unscaled here to support per-CPU partitioning 3936 * of a group with small tg->shares value. It is a floor value which is 3937 * assigned as a minimum load.weight to the sched_entity representing 3938 * the group on a CPU. 3939 * 3940 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024 3941 * on an 8-core system with 8 tasks each runnable on one CPU shares has 3942 * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. 
In 3943 * case no task is runnable on a CPU MIN_SHARES=2 should be returned 3944 * instead of 0. 3945 */ 3946 return clamp_t(long, shares, MIN_SHARES, tg_shares); 3947 } 3948 #endif /* CONFIG_SMP */ 3949 3950 /* 3951 * Recomputes the group entity based on the current state of its group 3952 * runqueue. 3953 */ 3954 static void update_cfs_group(struct sched_entity *se) 3955 { 3956 struct cfs_rq *gcfs_rq = group_cfs_rq(se); 3957 long shares; 3958 3959 /* 3960 * When a group becomes empty, preserve its weight. This matters for 3961 * DELAY_DEQUEUE. 3962 */ 3963 if (!gcfs_rq || !gcfs_rq->load.weight) 3964 return; 3965 3966 if (throttled_hierarchy(gcfs_rq)) 3967 return; 3968 3969 #ifndef CONFIG_SMP 3970 shares = READ_ONCE(gcfs_rq->tg->shares); 3971 #else 3972 shares = calc_group_shares(gcfs_rq); 3973 #endif 3974 if (unlikely(se->load.weight != shares)) 3975 reweight_entity(cfs_rq_of(se), se, shares); 3976 } 3977 3978 #else /* CONFIG_FAIR_GROUP_SCHED */ 3979 static inline void update_cfs_group(struct sched_entity *se) 3980 { 3981 } 3982 #endif /* CONFIG_FAIR_GROUP_SCHED */ 3983 3984 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) 3985 { 3986 struct rq *rq = rq_of(cfs_rq); 3987 3988 if (&rq->cfs == cfs_rq) { 3989 /* 3990 * There are a few boundary cases this might miss but it should 3991 * get called often enough that that should (hopefully) not be 3992 * a real problem. 3993 * 3994 * It will not get called when we go idle, because the idle 3995 * thread is a different class (!fair), nor will the utilization 3996 * number include things like RT tasks. 3997 * 3998 * As is, the util number is not freq-invariant (we'd have to 3999 * implement arch_scale_freq_capacity() for that). 4000 * 4001 * See cpu_util_cfs(). 4002 */ 4003 cpufreq_update_util(rq, flags); 4004 } 4005 } 4006 4007 #ifdef CONFIG_SMP 4008 static inline bool load_avg_is_decayed(struct sched_avg *sa) 4009 { 4010 if (sa->load_sum) 4011 return false; 4012 4013 if (sa->util_sum) 4014 return false; 4015 4016 if (sa->runnable_sum) 4017 return false; 4018 4019 /* 4020 * _avg must be null when _sum are null because _avg = _sum / divider 4021 * Make sure that rounding and/or propagation of PELT values never 4022 * break this. 4023 */ 4024 SCHED_WARN_ON(sa->load_avg || 4025 sa->util_avg || 4026 sa->runnable_avg); 4027 4028 return true; 4029 } 4030 4031 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) 4032 { 4033 return u64_u32_load_copy(cfs_rq->avg.last_update_time, 4034 cfs_rq->last_update_time_copy); 4035 } 4036 #ifdef CONFIG_FAIR_GROUP_SCHED 4037 /* 4038 * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list 4039 * immediately before a parent cfs_rq, and cfs_rqs are removed from the list 4040 * bottom-up, we only have to test whether the cfs_rq before us on the list 4041 * is our child. 4042 * If cfs_rq is not on the list, test whether a child needs its to be added to 4043 * connect a branch to the tree * (see list_add_leaf_cfs_rq() for details). 
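 * Concretely, the check below compares the tg parent of the previous
 * list element (or of rq->tmp_alone_branch when we are not on the list
 * ourselves) against our own tg; a match means a child of ours is on
 * the list (or on the branch about to be connected).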
4044 */ 4045 static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq) 4046 { 4047 struct cfs_rq *prev_cfs_rq; 4048 struct list_head *prev; 4049 4050 if (cfs_rq->on_list) { 4051 prev = cfs_rq->leaf_cfs_rq_list.prev; 4052 } else { 4053 struct rq *rq = rq_of(cfs_rq); 4054 4055 prev = rq->tmp_alone_branch; 4056 } 4057 4058 prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list); 4059 4060 return (prev_cfs_rq->tg->parent == cfs_rq->tg); 4061 } 4062 4063 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) 4064 { 4065 if (cfs_rq->load.weight) 4066 return false; 4067 4068 if (!load_avg_is_decayed(&cfs_rq->avg)) 4069 return false; 4070 4071 if (child_cfs_rq_on_list(cfs_rq)) 4072 return false; 4073 4074 return true; 4075 } 4076 4077 /** 4078 * update_tg_load_avg - update the tg's load avg 4079 * @cfs_rq: the cfs_rq whose avg changed 4080 * 4081 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. 4082 * However, because tg->load_avg is a global value there are performance 4083 * considerations. 4084 * 4085 * In order to avoid having to look at the other cfs_rq's, we use a 4086 * differential update where we store the last value we propagated. This in 4087 * turn allows skipping updates if the differential is 'small'. 4088 * 4089 * Updating tg's load_avg is necessary before update_cfs_share(). 4090 */ 4091 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) 4092 { 4093 long delta; 4094 u64 now; 4095 4096 /* 4097 * No need to update load_avg for root_task_group as it is not used. 4098 */ 4099 if (cfs_rq->tg == &root_task_group) 4100 return; 4101 4102 /* rq has been offline and doesn't contribute to the share anymore: */ 4103 if (!cpu_active(cpu_of(rq_of(cfs_rq)))) 4104 return; 4105 4106 /* 4107 * For migration heavy workloads, access to tg->load_avg can be 4108 * unbound. Limit the update rate to at most once per ms. 4109 */ 4110 now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); 4111 if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC) 4112 return; 4113 4114 delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; 4115 if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { 4116 atomic_long_add(delta, &cfs_rq->tg->load_avg); 4117 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; 4118 cfs_rq->last_update_tg_load_avg = now; 4119 } 4120 } 4121 4122 static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq) 4123 { 4124 long delta; 4125 u64 now; 4126 4127 /* 4128 * No need to update load_avg for root_task_group, as it is not used. 4129 */ 4130 if (cfs_rq->tg == &root_task_group) 4131 return; 4132 4133 now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); 4134 delta = 0 - cfs_rq->tg_load_avg_contrib; 4135 atomic_long_add(delta, &cfs_rq->tg->load_avg); 4136 cfs_rq->tg_load_avg_contrib = 0; 4137 cfs_rq->last_update_tg_load_avg = now; 4138 } 4139 4140 /* CPU offline callback: */ 4141 static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq) 4142 { 4143 struct task_group *tg; 4144 4145 lockdep_assert_rq_held(rq); 4146 4147 /* 4148 * The rq clock has already been updated in 4149 * set_rq_offline(), so we should skip updating 4150 * the rq clock again in unthrottle_cfs_rq(). 4151 */ 4152 rq_clock_start_loop_update(rq); 4153 4154 rcu_read_lock(); 4155 list_for_each_entry_rcu(tg, &task_groups, list) { 4156 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 4157 4158 clear_tg_load_avg(cfs_rq); 4159 } 4160 rcu_read_unlock(); 4161 4162 rq_clock_stop_loop_update(rq); 4163 } 4164 4165 /* 4166 * Called within set_task_rq() right before setting a task's CPU. 
The 4167 * caller only guarantees p->pi_lock is held; no other assumptions, 4168 * including the state of rq->lock, should be made. 4169 */ 4170 void set_task_rq_fair(struct sched_entity *se, 4171 struct cfs_rq *prev, struct cfs_rq *next) 4172 { 4173 u64 p_last_update_time; 4174 u64 n_last_update_time; 4175 4176 if (!sched_feat(ATTACH_AGE_LOAD)) 4177 return; 4178 4179 /* 4180 * We are supposed to update the task to "current" time, then its up to 4181 * date and ready to go to new CPU/cfs_rq. But we have difficulty in 4182 * getting what current time is, so simply throw away the out-of-date 4183 * time. This will result in the wakee task is less decayed, but giving 4184 * the wakee more load sounds not bad. 4185 */ 4186 if (!(se->avg.last_update_time && prev)) 4187 return; 4188 4189 p_last_update_time = cfs_rq_last_update_time(prev); 4190 n_last_update_time = cfs_rq_last_update_time(next); 4191 4192 __update_load_avg_blocked_se(p_last_update_time, se); 4193 se->avg.last_update_time = n_last_update_time; 4194 } 4195 4196 /* 4197 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to 4198 * propagate its contribution. The key to this propagation is the invariant 4199 * that for each group: 4200 * 4201 * ge->avg == grq->avg (1) 4202 * 4203 * _IFF_ we look at the pure running and runnable sums. Because they 4204 * represent the very same entity, just at different points in the hierarchy. 4205 * 4206 * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial 4207 * and simply copies the running/runnable sum over (but still wrong, because 4208 * the group entity and group rq do not have their PELT windows aligned). 4209 * 4210 * However, update_tg_cfs_load() is more complex. So we have: 4211 * 4212 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) 4213 * 4214 * And since, like util, the runnable part should be directly transferable, 4215 * the following would _appear_ to be the straight forward approach: 4216 * 4217 * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3) 4218 * 4219 * And per (1) we have: 4220 * 4221 * ge->avg.runnable_avg == grq->avg.runnable_avg 4222 * 4223 * Which gives: 4224 * 4225 * ge->load.weight * grq->avg.load_avg 4226 * ge->avg.load_avg = ----------------------------------- (4) 4227 * grq->load.weight 4228 * 4229 * Except that is wrong! 4230 * 4231 * Because while for entities historical weight is not important and we 4232 * really only care about our future and therefore can consider a pure 4233 * runnable sum, runqueues can NOT do this. 4234 * 4235 * We specifically want runqueues to have a load_avg that includes 4236 * historical weights. Those represent the blocked load, the load we expect 4237 * to (shortly) return to us. This only works by keeping the weights as 4238 * integral part of the sum. We therefore cannot decompose as per (3). 4239 * 4240 * Another reason this doesn't work is that runnable isn't a 0-sum entity. 4241 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the 4242 * rq itself is runnable anywhere between 2/3 and 1 depending on how the 4243 * runnable section of these tasks overlap (or not). If they were to perfectly 4244 * align the rq as a whole would be runnable 2/3 of the time. If however we 4245 * always have at least 1 runnable task, the rq as a whole is always runnable. 4246 * 4247 * So we'll have to approximate.. 
:/ 4248 * 4249 * Given the constraint: 4250 * 4251 * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX 4252 * 4253 * We can construct a rule that adds runnable to a rq by assuming minimal 4254 * overlap. 4255 * 4256 * On removal, we'll assume each task is equally runnable; which yields: 4257 * 4258 * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight 4259 * 4260 * XXX: only do this for the part of runnable > running ? 4261 * 4262 */ 4263 static inline void 4264 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 4265 { 4266 long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg; 4267 u32 new_sum, divider; 4268 4269 /* Nothing to update */ 4270 if (!delta_avg) 4271 return; 4272 4273 /* 4274 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. 4275 * See ___update_load_avg() for details. 4276 */ 4277 divider = get_pelt_divider(&cfs_rq->avg); 4278 4279 4280 /* Set new sched_entity's utilization */ 4281 se->avg.util_avg = gcfs_rq->avg.util_avg; 4282 new_sum = se->avg.util_avg * divider; 4283 delta_sum = (long)new_sum - (long)se->avg.util_sum; 4284 se->avg.util_sum = new_sum; 4285 4286 /* Update parent cfs_rq utilization */ 4287 add_positive(&cfs_rq->avg.util_avg, delta_avg); 4288 add_positive(&cfs_rq->avg.util_sum, delta_sum); 4289 4290 /* See update_cfs_rq_load_avg() */ 4291 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, 4292 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); 4293 } 4294 4295 static inline void 4296 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 4297 { 4298 long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; 4299 u32 new_sum, divider; 4300 4301 /* Nothing to update */ 4302 if (!delta_avg) 4303 return; 4304 4305 /* 4306 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. 4307 * See ___update_load_avg() for details. 4308 */ 4309 divider = get_pelt_divider(&cfs_rq->avg); 4310 4311 /* Set new sched_entity's runnable */ 4312 se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; 4313 new_sum = se->avg.runnable_avg * divider; 4314 delta_sum = (long)new_sum - (long)se->avg.runnable_sum; 4315 se->avg.runnable_sum = new_sum; 4316 4317 /* Update parent cfs_rq runnable */ 4318 add_positive(&cfs_rq->avg.runnable_avg, delta_avg); 4319 add_positive(&cfs_rq->avg.runnable_sum, delta_sum); 4320 /* See update_cfs_rq_load_avg() */ 4321 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, 4322 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); 4323 } 4324 4325 static inline void 4326 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 4327 { 4328 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; 4329 unsigned long load_avg; 4330 u64 load_sum = 0; 4331 s64 delta_sum; 4332 u32 divider; 4333 4334 if (!runnable_sum) 4335 return; 4336 4337 gcfs_rq->prop_runnable_sum = 0; 4338 4339 /* 4340 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. 4341 * See ___update_load_avg() for details. 4342 */ 4343 divider = get_pelt_divider(&cfs_rq->avg); 4344 4345 if (runnable_sum >= 0) { 4346 /* 4347 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until 4348 * the CPU is saturated running == runnable. 4349 */ 4350 runnable_sum += se->avg.load_sum; 4351 runnable_sum = min_t(long, runnable_sum, divider); 4352 } else { 4353 /* 4354 * Estimate the new unweighted runnable_sum of the gcfs_rq by 4355 * assuming all tasks are equally runnable. 
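 *
 * (Made-up numbers: a gcfs_rq load_sum of 4096 with a scaled-down
 * load.weight of 2 gives an unweighted estimate of 2048, which is then
 * clamped by se->avg.load_sum below so the se's runnable is not
 * inflated.)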
4356 */ 4357 if (scale_load_down(gcfs_rq->load.weight)) { 4358 load_sum = div_u64(gcfs_rq->avg.load_sum, 4359 scale_load_down(gcfs_rq->load.weight)); 4360 } 4361 4362 /* But make sure to not inflate se's runnable */ 4363 runnable_sum = min(se->avg.load_sum, load_sum); 4364 } 4365 4366 /* 4367 * runnable_sum can't be lower than running_sum 4368 * Rescale running sum to be in the same range as runnable sum 4369 * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT] 4370 * runnable_sum is in [0 : LOAD_AVG_MAX] 4371 */ 4372 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; 4373 runnable_sum = max(runnable_sum, running_sum); 4374 4375 load_sum = se_weight(se) * runnable_sum; 4376 load_avg = div_u64(load_sum, divider); 4377 4378 delta_avg = load_avg - se->avg.load_avg; 4379 if (!delta_avg) 4380 return; 4381 4382 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; 4383 4384 se->avg.load_sum = runnable_sum; 4385 se->avg.load_avg = load_avg; 4386 add_positive(&cfs_rq->avg.load_avg, delta_avg); 4387 add_positive(&cfs_rq->avg.load_sum, delta_sum); 4388 /* See update_cfs_rq_load_avg() */ 4389 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, 4390 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); 4391 } 4392 4393 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) 4394 { 4395 cfs_rq->propagate = 1; 4396 cfs_rq->prop_runnable_sum += runnable_sum; 4397 } 4398 4399 /* Update task and its cfs_rq load average */ 4400 static inline int propagate_entity_load_avg(struct sched_entity *se) 4401 { 4402 struct cfs_rq *cfs_rq, *gcfs_rq; 4403 4404 if (entity_is_task(se)) 4405 return 0; 4406 4407 gcfs_rq = group_cfs_rq(se); 4408 if (!gcfs_rq->propagate) 4409 return 0; 4410 4411 gcfs_rq->propagate = 0; 4412 4413 cfs_rq = cfs_rq_of(se); 4414 4415 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum); 4416 4417 update_tg_cfs_util(cfs_rq, se, gcfs_rq); 4418 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); 4419 update_tg_cfs_load(cfs_rq, se, gcfs_rq); 4420 4421 trace_pelt_cfs_tp(cfs_rq); 4422 trace_pelt_se_tp(se); 4423 4424 return 1; 4425 } 4426 4427 /* 4428 * Check if we need to update the load and the utilization of a blocked 4429 * group_entity: 4430 */ 4431 static inline bool skip_blocked_update(struct sched_entity *se) 4432 { 4433 struct cfs_rq *gcfs_rq = group_cfs_rq(se); 4434 4435 /* 4436 * If sched_entity still have not zero load or utilization, we have to 4437 * decay it: 4438 */ 4439 if (se->avg.load_avg || se->avg.util_avg) 4440 return false; 4441 4442 /* 4443 * If there is a pending propagation, we have to update the load and 4444 * the utilization of the sched_entity: 4445 */ 4446 if (gcfs_rq->propagate) 4447 return false; 4448 4449 /* 4450 * Otherwise, the load and the utilization of the sched_entity is 4451 * already zero and there is no pending propagation, so it will be a 4452 * waste of time to try to decay it: 4453 */ 4454 return true; 4455 } 4456 4457 #else /* CONFIG_FAIR_GROUP_SCHED */ 4458 4459 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} 4460 4461 static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {} 4462 4463 static inline int propagate_entity_load_avg(struct sched_entity *se) 4464 { 4465 return 0; 4466 } 4467 4468 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} 4469 4470 #endif /* CONFIG_FAIR_GROUP_SCHED */ 4471 4472 #ifdef CONFIG_NO_HZ_COMMON 4473 static inline void migrate_se_pelt_lag(struct sched_entity *se) 4474 { 4475 u64 throttled = 0, now, lut; 4476 struct cfs_rq *cfs_rq; 4477 
struct rq *rq; 4478 bool is_idle; 4479 4480 if (load_avg_is_decayed(&se->avg)) 4481 return; 4482 4483 cfs_rq = cfs_rq_of(se); 4484 rq = rq_of(cfs_rq); 4485 4486 rcu_read_lock(); 4487 is_idle = is_idle_task(rcu_dereference(rq->curr)); 4488 rcu_read_unlock(); 4489 4490 /* 4491 * The lag estimation comes with a cost we don't want to pay all the 4492 * time. Hence, limiting to the case where the source CPU is idle and 4493 * we know we are at the greatest risk to have an outdated clock. 4494 */ 4495 if (!is_idle) 4496 return; 4497 4498 /* 4499 * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where: 4500 * 4501 * last_update_time (the cfs_rq's last_update_time) 4502 * = cfs_rq_clock_pelt()@cfs_rq_idle 4503 * = rq_clock_pelt()@cfs_rq_idle 4504 * - cfs->throttled_clock_pelt_time@cfs_rq_idle 4505 * 4506 * cfs_idle_lag (delta between rq's update and cfs_rq's update) 4507 * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle 4508 * 4509 * rq_idle_lag (delta between now and rq's update) 4510 * = sched_clock_cpu() - rq_clock()@rq_idle 4511 * 4512 * We can then write: 4513 * 4514 * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time + 4515 * sched_clock_cpu() - rq_clock()@rq_idle 4516 * Where: 4517 * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle 4518 * rq_clock()@rq_idle is rq->clock_idle 4519 * cfs->throttled_clock_pelt_time@cfs_rq_idle 4520 * is cfs_rq->throttled_pelt_idle 4521 */ 4522 4523 #ifdef CONFIG_CFS_BANDWIDTH 4524 throttled = u64_u32_load(cfs_rq->throttled_pelt_idle); 4525 /* The clock has been stopped for throttling */ 4526 if (throttled == U64_MAX) 4527 return; 4528 #endif 4529 now = u64_u32_load(rq->clock_pelt_idle); 4530 /* 4531 * Paired with _update_idle_rq_clock_pelt(). It ensures at the worst case 4532 * is observed the old clock_pelt_idle value and the new clock_idle, 4533 * which lead to an underestimation. The opposite would lead to an 4534 * overestimation. 4535 */ 4536 smp_rmb(); 4537 lut = cfs_rq_last_update_time(cfs_rq); 4538 4539 now -= throttled; 4540 if (now < lut) 4541 /* 4542 * cfs_rq->avg.last_update_time is more recent than our 4543 * estimation, let's use it. 4544 */ 4545 now = lut; 4546 else 4547 now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle); 4548 4549 __update_load_avg_blocked_se(now, se); 4550 } 4551 #else 4552 static void migrate_se_pelt_lag(struct sched_entity *se) {} 4553 #endif 4554 4555 /** 4556 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages 4557 * @now: current time, as per cfs_rq_clock_pelt() 4558 * @cfs_rq: cfs_rq to update 4559 * 4560 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) 4561 * avg. The immediate corollary is that all (fair) tasks must be attached. 4562 * 4563 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. 4564 * 4565 * Return: true if the load decayed or we removed load. 4566 * 4567 * Since both these conditions indicate a changed cfs_rq->avg.load we should 4568 * call update_tg_load_avg() when this function returns true. 
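 *
 * (See update_load_avg() for a typical caller acting on this return
 * value.)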
4569 */ 4570 static inline int 4571 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) 4572 { 4573 unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0; 4574 struct sched_avg *sa = &cfs_rq->avg; 4575 int decayed = 0; 4576 4577 if (cfs_rq->removed.nr) { 4578 unsigned long r; 4579 u32 divider = get_pelt_divider(&cfs_rq->avg); 4580 4581 raw_spin_lock(&cfs_rq->removed.lock); 4582 swap(cfs_rq->removed.util_avg, removed_util); 4583 swap(cfs_rq->removed.load_avg, removed_load); 4584 swap(cfs_rq->removed.runnable_avg, removed_runnable); 4585 cfs_rq->removed.nr = 0; 4586 raw_spin_unlock(&cfs_rq->removed.lock); 4587 4588 r = removed_load; 4589 sub_positive(&sa->load_avg, r); 4590 sub_positive(&sa->load_sum, r * divider); 4591 /* See sa->util_sum below */ 4592 sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); 4593 4594 r = removed_util; 4595 sub_positive(&sa->util_avg, r); 4596 sub_positive(&sa->util_sum, r * divider); 4597 /* 4598 * Because of rounding, se->util_sum might ends up being +1 more than 4599 * cfs->util_sum. Although this is not a problem by itself, detaching 4600 * a lot of tasks with the rounding problem between 2 updates of 4601 * util_avg (~1ms) can make cfs->util_sum becoming null whereas 4602 * cfs_util_avg is not. 4603 * Check that util_sum is still above its lower bound for the new 4604 * util_avg. Given that period_contrib might have moved since the last 4605 * sync, we are only sure that util_sum must be above or equal to 4606 * util_avg * minimum possible divider 4607 */ 4608 sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); 4609 4610 r = removed_runnable; 4611 sub_positive(&sa->runnable_avg, r); 4612 sub_positive(&sa->runnable_sum, r * divider); 4613 /* See sa->util_sum above */ 4614 sa->runnable_sum = max_t(u32, sa->runnable_sum, 4615 sa->runnable_avg * PELT_MIN_DIVIDER); 4616 4617 /* 4618 * removed_runnable is the unweighted version of removed_load so we 4619 * can use it to estimate removed_load_sum. 4620 */ 4621 add_tg_cfs_propagate(cfs_rq, 4622 -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT); 4623 4624 decayed = 1; 4625 } 4626 4627 decayed |= __update_load_avg_cfs_rq(now, cfs_rq); 4628 u64_u32_store_copy(sa->last_update_time, 4629 cfs_rq->last_update_time_copy, 4630 sa->last_update_time); 4631 return decayed; 4632 } 4633 4634 /** 4635 * attach_entity_load_avg - attach this entity to its cfs_rq load avg 4636 * @cfs_rq: cfs_rq to attach to 4637 * @se: sched_entity to attach 4638 * 4639 * Must call update_cfs_rq_load_avg() before this, since we rely on 4640 * cfs_rq->avg.last_update_time being current. 4641 */ 4642 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 4643 { 4644 /* 4645 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. 4646 * See ___update_load_avg() for details. 4647 */ 4648 u32 divider = get_pelt_divider(&cfs_rq->avg); 4649 4650 /* 4651 * When we attach the @se to the @cfs_rq, we must align the decay 4652 * window because without that, really weird and wonderful things can 4653 * happen. 4654 * 4655 * XXX illustrate 4656 */ 4657 se->avg.last_update_time = cfs_rq->avg.last_update_time; 4658 se->avg.period_contrib = cfs_rq->avg.period_contrib; 4659 4660 /* 4661 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new 4662 * period_contrib. This isn't strictly correct, but since we're 4663 * entirely outside of the PELT hierarchy, nobody cares if we truncate 4664 * _sum a little. 
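 *
 * Note that se->avg.load_sum is kept unweighted, hence the extra
 * division by se_weight() below; util_sum and runnable_sum are simply
 * re-seeded as _avg * divider.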
4665 */ 4666 se->avg.util_sum = se->avg.util_avg * divider; 4667 4668 se->avg.runnable_sum = se->avg.runnable_avg * divider; 4669 4670 se->avg.load_sum = se->avg.load_avg * divider; 4671 if (se_weight(se) < se->avg.load_sum) 4672 se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se)); 4673 else 4674 se->avg.load_sum = 1; 4675 4676 enqueue_load_avg(cfs_rq, se); 4677 cfs_rq->avg.util_avg += se->avg.util_avg; 4678 cfs_rq->avg.util_sum += se->avg.util_sum; 4679 cfs_rq->avg.runnable_avg += se->avg.runnable_avg; 4680 cfs_rq->avg.runnable_sum += se->avg.runnable_sum; 4681 4682 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); 4683 4684 cfs_rq_util_change(cfs_rq, 0); 4685 4686 trace_pelt_cfs_tp(cfs_rq); 4687 } 4688 4689 /** 4690 * detach_entity_load_avg - detach this entity from its cfs_rq load avg 4691 * @cfs_rq: cfs_rq to detach from 4692 * @se: sched_entity to detach 4693 * 4694 * Must call update_cfs_rq_load_avg() before this, since we rely on 4695 * cfs_rq->avg.last_update_time being current. 4696 */ 4697 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 4698 { 4699 dequeue_load_avg(cfs_rq, se); 4700 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); 4701 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); 4702 /* See update_cfs_rq_load_avg() */ 4703 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, 4704 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); 4705 4706 sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); 4707 sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); 4708 /* See update_cfs_rq_load_avg() */ 4709 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, 4710 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); 4711 4712 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); 4713 4714 cfs_rq_util_change(cfs_rq, 0); 4715 4716 trace_pelt_cfs_tp(cfs_rq); 4717 } 4718 4719 /* 4720 * Optional action to be done while updating the load average 4721 */ 4722 #define UPDATE_TG 0x1 4723 #define SKIP_AGE_LOAD 0x2 4724 #define DO_ATTACH 0x4 4725 #define DO_DETACH 0x8 4726 4727 /* Update task and its cfs_rq load average */ 4728 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 4729 { 4730 u64 now = cfs_rq_clock_pelt(cfs_rq); 4731 int decayed; 4732 4733 /* 4734 * Track task load average for carrying it to new CPU after migrated, and 4735 * track group sched_entity load average for task_h_load calculation in migration 4736 */ 4737 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) 4738 __update_load_avg_se(now, cfs_rq, se); 4739 4740 decayed = update_cfs_rq_load_avg(now, cfs_rq); 4741 decayed |= propagate_entity_load_avg(se); 4742 4743 if (!se->avg.last_update_time && (flags & DO_ATTACH)) { 4744 4745 /* 4746 * DO_ATTACH means we're here from enqueue_entity(). 4747 * !last_update_time means we've passed through 4748 * migrate_task_rq_fair() indicating we migrated. 4749 * 4750 * IOW we're enqueueing a task on a new CPU. 4751 */ 4752 attach_entity_load_avg(cfs_rq, se); 4753 update_tg_load_avg(cfs_rq); 4754 4755 } else if (flags & DO_DETACH) { 4756 /* 4757 * DO_DETACH means we're here from dequeue_entity() 4758 * and we are migrating task out of the CPU. 4759 */ 4760 detach_entity_load_avg(cfs_rq, se); 4761 update_tg_load_avg(cfs_rq); 4762 } else if (decayed) { 4763 cfs_rq_util_change(cfs_rq, 0); 4764 4765 if (flags & UPDATE_TG) 4766 update_tg_load_avg(cfs_rq); 4767 } 4768 } 4769 4770 /* 4771 * Synchronize entity load avg of dequeued entity without locking 4772 * the previous rq. 
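 *
 * Only the (blocked) se itself is aged here, up to the time returned by
 * cfs_rq_last_update_time(); no cfs_rq state is written.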
4773 */ 4774 static void sync_entity_load_avg(struct sched_entity *se) 4775 { 4776 struct cfs_rq *cfs_rq = cfs_rq_of(se); 4777 u64 last_update_time; 4778 4779 last_update_time = cfs_rq_last_update_time(cfs_rq); 4780 __update_load_avg_blocked_se(last_update_time, se); 4781 } 4782 4783 /* 4784 * Task first catches up with cfs_rq, and then subtract 4785 * itself from the cfs_rq (task must be off the queue now). 4786 */ 4787 static void remove_entity_load_avg(struct sched_entity *se) 4788 { 4789 struct cfs_rq *cfs_rq = cfs_rq_of(se); 4790 unsigned long flags; 4791 4792 /* 4793 * tasks cannot exit without having gone through wake_up_new_task() -> 4794 * enqueue_task_fair() which will have added things to the cfs_rq, 4795 * so we can remove unconditionally. 4796 */ 4797 4798 sync_entity_load_avg(se); 4799 4800 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); 4801 ++cfs_rq->removed.nr; 4802 cfs_rq->removed.util_avg += se->avg.util_avg; 4803 cfs_rq->removed.load_avg += se->avg.load_avg; 4804 cfs_rq->removed.runnable_avg += se->avg.runnable_avg; 4805 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); 4806 } 4807 4808 static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) 4809 { 4810 return cfs_rq->avg.runnable_avg; 4811 } 4812 4813 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) 4814 { 4815 return cfs_rq->avg.load_avg; 4816 } 4817 4818 static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf); 4819 4820 static inline unsigned long task_util(struct task_struct *p) 4821 { 4822 return READ_ONCE(p->se.avg.util_avg); 4823 } 4824 4825 static inline unsigned long task_runnable(struct task_struct *p) 4826 { 4827 return READ_ONCE(p->se.avg.runnable_avg); 4828 } 4829 4830 static inline unsigned long _task_util_est(struct task_struct *p) 4831 { 4832 return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED; 4833 } 4834 4835 static inline unsigned long task_util_est(struct task_struct *p) 4836 { 4837 return max(task_util(p), _task_util_est(p)); 4838 } 4839 4840 static inline void util_est_enqueue(struct cfs_rq *cfs_rq, 4841 struct task_struct *p) 4842 { 4843 unsigned int enqueued; 4844 4845 if (!sched_feat(UTIL_EST)) 4846 return; 4847 4848 /* Update root cfs_rq's estimated utilization */ 4849 enqueued = cfs_rq->avg.util_est; 4850 enqueued += _task_util_est(p); 4851 WRITE_ONCE(cfs_rq->avg.util_est, enqueued); 4852 4853 trace_sched_util_est_cfs_tp(cfs_rq); 4854 } 4855 4856 static inline void util_est_dequeue(struct cfs_rq *cfs_rq, 4857 struct task_struct *p) 4858 { 4859 unsigned int enqueued; 4860 4861 if (!sched_feat(UTIL_EST)) 4862 return; 4863 4864 /* Update root cfs_rq's estimated utilization */ 4865 enqueued = cfs_rq->avg.util_est; 4866 enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); 4867 WRITE_ONCE(cfs_rq->avg.util_est, enqueued); 4868 4869 trace_sched_util_est_cfs_tp(cfs_rq); 4870 } 4871 4872 #define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) 4873 4874 static inline void util_est_update(struct cfs_rq *cfs_rq, 4875 struct task_struct *p, 4876 bool task_sleep) 4877 { 4878 unsigned int ewma, dequeued, last_ewma_diff; 4879 4880 if (!sched_feat(UTIL_EST)) 4881 return; 4882 4883 /* 4884 * Skip update of task's estimated utilization when the task has not 4885 * yet completed an activation, e.g. being migrated. 
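 * (task_sleep is false for such dequeues, hence the early return
 * below.)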
4886 */ 4887 if (!task_sleep) 4888 return; 4889 4890 /* Get current estimate of utilization */ 4891 ewma = READ_ONCE(p->se.avg.util_est); 4892 4893 /* 4894 * If the PELT values haven't changed since enqueue time, 4895 * skip the util_est update. 4896 */ 4897 if (ewma & UTIL_AVG_UNCHANGED) 4898 return; 4899 4900 /* Get utilization at dequeue */ 4901 dequeued = task_util(p); 4902 4903 /* 4904 * Reset EWMA on utilization increases, the moving average is used only 4905 * to smooth utilization decreases. 4906 */ 4907 if (ewma <= dequeued) { 4908 ewma = dequeued; 4909 goto done; 4910 } 4911 4912 /* 4913 * Skip update of task's estimated utilization when its members are 4914 * already ~1% close to its last activation value. 4915 */ 4916 last_ewma_diff = ewma - dequeued; 4917 if (last_ewma_diff < UTIL_EST_MARGIN) 4918 goto done; 4919 4920 /* 4921 * To avoid overestimation of actual task utilization, skip updates if 4922 * we cannot grant there is idle time in this CPU. 4923 */ 4924 if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) 4925 return; 4926 4927 /* 4928 * To avoid underestimate of task utilization, skip updates of EWMA if 4929 * we cannot grant that thread got all CPU time it wanted. 4930 */ 4931 if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p)) 4932 goto done; 4933 4934 4935 /* 4936 * Update Task's estimated utilization 4937 * 4938 * When *p completes an activation we can consolidate another sample 4939 * of the task size. This is done by using this value to update the 4940 * Exponential Weighted Moving Average (EWMA): 4941 * 4942 * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) 4943 * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) 4944 * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) 4945 * = w * ( -last_ewma_diff ) + ewma(t-1) 4946 * = w * (-last_ewma_diff + ewma(t-1) / w) 4947 * 4948 * Where 'w' is the weight of new samples, which is configured to be 4949 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) 4950 */ 4951 ewma <<= UTIL_EST_WEIGHT_SHIFT; 4952 ewma -= last_ewma_diff; 4953 ewma >>= UTIL_EST_WEIGHT_SHIFT; 4954 done: 4955 ewma |= UTIL_AVG_UNCHANGED; 4956 WRITE_ONCE(p->se.avg.util_est, ewma); 4957 4958 trace_sched_util_est_se_tp(&p->se); 4959 } 4960 4961 static inline unsigned long get_actual_cpu_capacity(int cpu) 4962 { 4963 unsigned long capacity = arch_scale_cpu_capacity(cpu); 4964 4965 capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); 4966 4967 return capacity; 4968 } 4969 4970 static inline int util_fits_cpu(unsigned long util, 4971 unsigned long uclamp_min, 4972 unsigned long uclamp_max, 4973 int cpu) 4974 { 4975 unsigned long capacity = capacity_of(cpu); 4976 unsigned long capacity_orig; 4977 bool fits, uclamp_max_fits; 4978 4979 /* 4980 * Check if the real util fits without any uclamp boost/cap applied. 4981 */ 4982 fits = fits_capacity(util, capacity); 4983 4984 if (!uclamp_is_used()) 4985 return fits; 4986 4987 /* 4988 * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and 4989 * uclamp_max. We only care about capacity pressure (by using 4990 * capacity_of()) for comparing against the real util. 4991 * 4992 * If a task is boosted to 1024 for example, we don't want a tiny 4993 * pressure to skew the check whether it fits a CPU or not. 4994 * 4995 * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it 4996 * should fit a little cpu even if there's some pressure. 4997 * 4998 * Only exception is for HW or cpufreq pressure since it has a direct impact 4999 * on available OPP of the system. 
5000 * 5001 * We honour it for uclamp_min only as a drop in performance level 5002 * could result in not getting the requested minimum performance level. 5003 * 5004 * For uclamp_max, we can tolerate a drop in performance level as the 5005 * goal is to cap the task. So it's okay if it's getting less. 5006 */ 5007 capacity_orig = arch_scale_cpu_capacity(cpu); 5008 5009 /* 5010 * We want to force a task to fit a cpu as implied by uclamp_max. 5011 * But we do have some corner cases to cater for.. 5012 * 5013 * 5014 * C=z 5015 * | ___ 5016 * | C=y | | 5017 * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max 5018 * | C=x | | | | 5019 * | ___ | | | | 5020 * | | | | | | | (util somewhere in this region) 5021 * | | | | | | | 5022 * | | | | | | | 5023 * +---------------------------------------- 5024 * CPU0 CPU1 CPU2 5025 * 5026 * In the above example if a task is capped to a specific performance 5027 * point, y, then when: 5028 * 5029 * * util = 80% of x then it does not fit on CPU0 and should migrate 5030 * to CPU1 5031 * * util = 80% of y then it is forced to fit on CPU1 to honour 5032 * uclamp_max request. 5033 * 5034 * which is what we're enforcing here. A task always fits if 5035 * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig, 5036 * the normal upmigration rules should withhold still. 5037 * 5038 * Only exception is when we are on max capacity, then we need to be 5039 * careful not to block overutilized state. This is so because: 5040 * 5041 * 1. There's no concept of capping at max_capacity! We can't go 5042 * beyond this performance level anyway. 5043 * 2. The system is being saturated when we're operating near 5044 * max capacity, it doesn't make sense to block overutilized. 5045 */ 5046 uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE); 5047 uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig); 5048 fits = fits || uclamp_max_fits; 5049 5050 /* 5051 * 5052 * C=z 5053 * | ___ (region a, capped, util >= uclamp_max) 5054 * | C=y | | 5055 * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max 5056 * | C=x | | | | 5057 * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max) 5058 * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min 5059 * | | | | | | | 5060 * | | | | | | | (region c, boosted, util < uclamp_min) 5061 * +---------------------------------------- 5062 * CPU0 CPU1 CPU2 5063 * 5064 * a) If util > uclamp_max, then we're capped, we don't care about 5065 * actual fitness value here. We only care if uclamp_max fits 5066 * capacity without taking margin/pressure into account. 5067 * See comment above. 5068 * 5069 * b) If uclamp_min <= util <= uclamp_max, then the normal 5070 * fits_capacity() rules apply. Except we need to ensure that we 5071 * enforce we remain within uclamp_max, see comment above. 5072 * 5073 * c) If util < uclamp_min, then we are boosted. Same as (b) but we 5074 * need to take into account the boosted value fits the CPU without 5075 * taking margin/pressure into account. 5076 * 5077 * Cases (a) and (b) are handled in the 'fits' variable already. We 5078 * just need to consider an extra check for case (c) after ensuring we 5079 * handle the case uclamp_min > uclamp_max. 
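 *
 * (Made-up numbers: util == 200 fits a CPU, but a uclamp_min of 700 on
 * a CPU whose actual, pressure-reduced capacity is only 600 cannot be
 * honoured; the check below then returns -1.)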
5080 */ 5081 uclamp_min = min(uclamp_min, uclamp_max); 5082 if (fits && (util < uclamp_min) && 5083 (uclamp_min > get_actual_cpu_capacity(cpu))) 5084 return -1; 5085 5086 return fits; 5087 } 5088 5089 static inline int task_fits_cpu(struct task_struct *p, int cpu) 5090 { 5091 unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); 5092 unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); 5093 unsigned long util = task_util_est(p); 5094 /* 5095 * Return true only if the cpu fully fits the task requirements, which 5096 * include the utilization but also the performance hints. 5097 */ 5098 return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); 5099 } 5100 5101 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) 5102 { 5103 int cpu = cpu_of(rq); 5104 5105 if (!sched_asym_cpucap_active()) 5106 return; 5107 5108 /* 5109 * Affinity allows us to go somewhere higher? Or are we on biggest 5110 * available CPU already? Or do we fit into this CPU ? 5111 */ 5112 if (!p || (p->nr_cpus_allowed == 1) || 5113 (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) || 5114 task_fits_cpu(p, cpu)) { 5115 5116 rq->misfit_task_load = 0; 5117 return; 5118 } 5119 5120 /* 5121 * Make sure that misfit_task_load will not be null even if 5122 * task_h_load() returns 0. 5123 */ 5124 rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); 5125 } 5126 5127 #else /* CONFIG_SMP */ 5128 5129 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) 5130 { 5131 return !cfs_rq->nr_running; 5132 } 5133 5134 #define UPDATE_TG 0x0 5135 #define SKIP_AGE_LOAD 0x0 5136 #define DO_ATTACH 0x0 5137 #define DO_DETACH 0x0 5138 5139 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) 5140 { 5141 cfs_rq_util_change(cfs_rq, 0); 5142 } 5143 5144 static inline void remove_entity_load_avg(struct sched_entity *se) {} 5145 5146 static inline void 5147 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 5148 static inline void 5149 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 5150 5151 static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) 5152 { 5153 return 0; 5154 } 5155 5156 static inline void 5157 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} 5158 5159 static inline void 5160 util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} 5161 5162 static inline void 5163 util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, 5164 bool task_sleep) {} 5165 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} 5166 5167 #endif /* CONFIG_SMP */ 5168 5169 static void 5170 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 5171 { 5172 u64 vslice, vruntime = avg_vruntime(cfs_rq); 5173 s64 lag = 0; 5174 5175 if (!se->custom_slice) 5176 se->slice = sysctl_sched_base_slice; 5177 vslice = calc_delta_fair(se->slice, se); 5178 5179 /* 5180 * Due to how V is constructed as the weighted average of entities, 5181 * adding tasks with positive lag, or removing tasks with negative lag 5182 * will move 'time' backwards, this can screw around with the lag of 5183 * other tasks. 
5184 * 5185 * EEVDF: placement strategy #1 / #2 5186 */ 5187 if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) { 5188 struct sched_entity *curr = cfs_rq->curr; 5189 unsigned long load; 5190 5191 lag = se->vlag; 5192 5193 /* 5194 * If we want to place a task and preserve lag, we have to 5195 * consider the effect of the new entity on the weighted 5196 * average and compensate for this, otherwise lag can quickly 5197 * evaporate. 5198 * 5199 * Lag is defined as: 5200 * 5201 * lag_i = S - s_i = w_i * (V - v_i) 5202 * 5203 * To avoid the 'w_i' term all over the place, we only track 5204 * the virtual lag: 5205 * 5206 * vl_i = V - v_i <=> v_i = V - vl_i 5207 * 5208 * And we take V to be the weighted average of all v: 5209 * 5210 * V = (\Sum w_j*v_j) / W 5211 * 5212 * Where W is: \Sum w_j 5213 * 5214 * Then, the weighted average after adding an entity with lag 5215 * vl_i is given by: 5216 * 5217 * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) 5218 * = (W*V + w_i*(V - vl_i)) / (W + w_i) 5219 * = (W*V + w_i*V - w_i*vl_i) / (W + w_i) 5220 * = (V*(W + w_i) - w_i*l) / (W + w_i) 5221 * = V - w_i*vl_i / (W + w_i) 5222 * 5223 * And the actual lag after adding an entity with vl_i is: 5224 * 5225 * vl'_i = V' - v_i 5226 * = V - w_i*vl_i / (W + w_i) - (V - vl_i) 5227 * = vl_i - w_i*vl_i / (W + w_i) 5228 * 5229 * Which is strictly less than vl_i. So in order to preserve lag 5230 * we should inflate the lag before placement such that the 5231 * effective lag after placement comes out right. 5232 * 5233 * As such, invert the above relation for vl'_i to get the vl_i 5234 * we need to use such that the lag after placement is the lag 5235 * we computed before dequeue. 5236 * 5237 * vl'_i = vl_i - w_i*vl_i / (W + w_i) 5238 * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i) 5239 * 5240 * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i 5241 * = W*vl_i 5242 * 5243 * vl_i = (W + w_i)*vl'_i / W 5244 */ 5245 load = cfs_rq->avg_load; 5246 if (curr && curr->on_rq) 5247 load += scale_load_down(curr->load.weight); 5248 5249 lag *= load + scale_load_down(se->load.weight); 5250 if (WARN_ON_ONCE(!load)) 5251 load = 1; 5252 lag = div_s64(lag, load); 5253 } 5254 5255 se->vruntime = vruntime - lag; 5256 5257 if (se->rel_deadline) { 5258 se->deadline += se->vruntime; 5259 se->rel_deadline = 0; 5260 return; 5261 } 5262 5263 /* 5264 * When joining the competition; the existing tasks will be, 5265 * on average, halfway through their slice, as such start tasks 5266 * off with half a slice to ease into the competition. 5267 */ 5268 if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) 5269 vslice /= 2; 5270 5271 /* 5272 * EEVDF: vd_i = ve_i + r_i/w_i 5273 */ 5274 se->deadline = se->vruntime + vslice; 5275 } 5276 5277 static void check_enqueue_throttle(struct cfs_rq *cfs_rq); 5278 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); 5279 5280 static inline bool cfs_bandwidth_used(void); 5281 5282 static void 5283 requeue_delayed_entity(struct sched_entity *se); 5284 5285 static void 5286 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 5287 { 5288 bool curr = cfs_rq->curr == se; 5289 5290 /* 5291 * If we're the current task, we must renormalise before calling 5292 * update_curr(). 5293 */ 5294 if (curr) 5295 place_entity(cfs_rq, se, flags); 5296 5297 update_curr(cfs_rq); 5298 5299 /* 5300 * When enqueuing a sched_entity, we must: 5301 * - Update loads to have both entity and cfs_rq synced with now. 
5302 * - For group_entity, update its runnable_weight to reflect the new 5303 * h_nr_running of its group cfs_rq. 5304 * - For group_entity, update its weight to reflect the new share of 5305 * its group cfs_rq 5306 * - Add its new weight to cfs_rq->load.weight 5307 */ 5308 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); 5309 se_update_runnable(se); 5310 /* 5311 * XXX update_load_avg() above will have attached us to the pelt sum; 5312 * but update_cfs_group() here will re-adjust the weight and have to 5313 * undo/redo all that. Seems wasteful. 5314 */ 5315 update_cfs_group(se); 5316 5317 /* 5318 * XXX now that the entity has been re-weighted, and it's lag adjusted, 5319 * we can place the entity. 5320 */ 5321 if (!curr) 5322 place_entity(cfs_rq, se, flags); 5323 5324 account_entity_enqueue(cfs_rq, se); 5325 5326 /* Entity has migrated, no longer consider this task hot */ 5327 if (flags & ENQUEUE_MIGRATED) 5328 se->exec_start = 0; 5329 5330 check_schedstat_required(); 5331 update_stats_enqueue_fair(cfs_rq, se, flags); 5332 if (!curr) 5333 __enqueue_entity(cfs_rq, se); 5334 se->on_rq = 1; 5335 5336 if (cfs_rq->nr_running == 1) { 5337 check_enqueue_throttle(cfs_rq); 5338 if (!throttled_hierarchy(cfs_rq)) { 5339 list_add_leaf_cfs_rq(cfs_rq); 5340 } else { 5341 #ifdef CONFIG_CFS_BANDWIDTH 5342 struct rq *rq = rq_of(cfs_rq); 5343 5344 if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) 5345 cfs_rq->throttled_clock = rq_clock(rq); 5346 if (!cfs_rq->throttled_clock_self) 5347 cfs_rq->throttled_clock_self = rq_clock(rq); 5348 #endif 5349 } 5350 } 5351 } 5352 5353 static void __clear_buddies_next(struct sched_entity *se) 5354 { 5355 for_each_sched_entity(se) { 5356 struct cfs_rq *cfs_rq = cfs_rq_of(se); 5357 if (cfs_rq->next != se) 5358 break; 5359 5360 cfs_rq->next = NULL; 5361 } 5362 } 5363 5364 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 5365 { 5366 if (cfs_rq->next == se) 5367 __clear_buddies_next(se); 5368 } 5369 5370 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); 5371 5372 static void set_delayed(struct sched_entity *se) 5373 { 5374 se->sched_delayed = 1; 5375 for_each_sched_entity(se) { 5376 struct cfs_rq *cfs_rq = cfs_rq_of(se); 5377 5378 cfs_rq->h_nr_delayed++; 5379 if (cfs_rq_throttled(cfs_rq)) 5380 break; 5381 } 5382 } 5383 5384 static void clear_delayed(struct sched_entity *se) 5385 { 5386 se->sched_delayed = 0; 5387 for_each_sched_entity(se) { 5388 struct cfs_rq *cfs_rq = cfs_rq_of(se); 5389 5390 cfs_rq->h_nr_delayed--; 5391 if (cfs_rq_throttled(cfs_rq)) 5392 break; 5393 } 5394 } 5395 5396 static inline void finish_delayed_dequeue_entity(struct sched_entity *se) 5397 { 5398 clear_delayed(se); 5399 if (sched_feat(DELAY_ZERO) && se->vlag > 0) 5400 se->vlag = 0; 5401 } 5402 5403 static bool 5404 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 5405 { 5406 bool sleep = flags & DEQUEUE_SLEEP; 5407 5408 update_curr(cfs_rq); 5409 clear_buddies(cfs_rq, se); 5410 5411 if (flags & DEQUEUE_DELAYED) { 5412 SCHED_WARN_ON(!se->sched_delayed); 5413 } else { 5414 bool delay = sleep; 5415 /* 5416 * DELAY_DEQUEUE relies on spurious wakeups, special task 5417 * states must not suffer spurious wakeups, excempt them. 
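 * The DEQUEUE_SPECIAL check right below implements that exemption.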
5418 */ 5419 if (flags & DEQUEUE_SPECIAL) 5420 delay = false; 5421 5422 SCHED_WARN_ON(delay && se->sched_delayed); 5423 5424 if (sched_feat(DELAY_DEQUEUE) && delay && 5425 !entity_eligible(cfs_rq, se)) { 5426 update_load_avg(cfs_rq, se, 0); 5427 set_delayed(se); 5428 return false; 5429 } 5430 } 5431 5432 int action = UPDATE_TG; 5433 if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) 5434 action |= DO_DETACH; 5435 5436 /* 5437 * When dequeuing a sched_entity, we must: 5438 * - Update loads to have both entity and cfs_rq synced with now. 5439 * - For group_entity, update its runnable_weight to reflect the new 5440 * h_nr_running of its group cfs_rq. 5441 * - Subtract its previous weight from cfs_rq->load.weight. 5442 * - For group entity, update its weight to reflect the new share 5443 * of its group cfs_rq. 5444 */ 5445 update_load_avg(cfs_rq, se, action); 5446 se_update_runnable(se); 5447 5448 update_stats_dequeue_fair(cfs_rq, se, flags); 5449 5450 update_entity_lag(cfs_rq, se); 5451 if (sched_feat(PLACE_REL_DEADLINE) && !sleep) { 5452 se->deadline -= se->vruntime; 5453 se->rel_deadline = 1; 5454 } 5455 5456 if (se != cfs_rq->curr) 5457 __dequeue_entity(cfs_rq, se); 5458 se->on_rq = 0; 5459 account_entity_dequeue(cfs_rq, se); 5460 5461 /* return excess runtime on last dequeue */ 5462 return_cfs_rq_runtime(cfs_rq); 5463 5464 update_cfs_group(se); 5465 5466 /* 5467 * Now advance min_vruntime if @se was the entity holding it back, 5468 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be 5469 * put back on, and if we advance min_vruntime, we'll be placed back 5470 * further than we started -- i.e. we'll be penalized. 5471 */ 5472 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) 5473 update_min_vruntime(cfs_rq); 5474 5475 if (flags & DEQUEUE_DELAYED) 5476 finish_delayed_dequeue_entity(se); 5477 5478 if (cfs_rq->nr_running == 0) 5479 update_idle_cfs_rq_clock_pelt(cfs_rq); 5480 5481 return true; 5482 } 5483 5484 static void 5485 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 5486 { 5487 clear_buddies(cfs_rq, se); 5488 5489 /* 'current' is not kept within the tree. */ 5490 if (se->on_rq) { 5491 /* 5492 * Any task has to be enqueued before it get to execute on 5493 * a CPU. So account for the time it spent waiting on the 5494 * runqueue. 5495 */ 5496 update_stats_wait_end_fair(cfs_rq, se); 5497 __dequeue_entity(cfs_rq, se); 5498 update_load_avg(cfs_rq, se, UPDATE_TG); 5499 /* 5500 * HACK, stash a copy of deadline at the point of pick in vlag, 5501 * which isn't used until dequeue. 5502 */ 5503 se->vlag = se->deadline; 5504 } 5505 5506 update_stats_curr_start(cfs_rq, se); 5507 SCHED_WARN_ON(cfs_rq->curr); 5508 cfs_rq->curr = se; 5509 5510 /* 5511 * Track our maximum slice length, if the CPU's load is at 5512 * least twice that of our own weight (i.e. 
don't track it 5513 * when there are only lesser-weight tasks around): 5514 */ 5515 if (schedstat_enabled() && 5516 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { 5517 struct sched_statistics *stats; 5518 5519 stats = __schedstats_from_se(se); 5520 __schedstat_set(stats->slice_max, 5521 max((u64)stats->slice_max, 5522 se->sum_exec_runtime - se->prev_sum_exec_runtime)); 5523 } 5524 5525 se->prev_sum_exec_runtime = se->sum_exec_runtime; 5526 } 5527 5528 static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); 5529 5530 /* 5531 * Pick the next process, keeping these things in mind, in this order: 5532 * 1) keep things fair between processes/task groups 5533 * 2) pick the "next" process, since someone really wants that to run 5534 * 3) pick the "last" process, for cache locality 5535 * 4) do not run the "skip" process, if something else is available 5536 */ 5537 static struct sched_entity * 5538 pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) 5539 { 5540 /* 5541 * Enabling NEXT_BUDDY will affect latency but not fairness. 5542 */ 5543 if (sched_feat(NEXT_BUDDY) && 5544 cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { 5545 /* ->next will never be delayed */ 5546 SCHED_WARN_ON(cfs_rq->next->sched_delayed); 5547 return cfs_rq->next; 5548 } 5549 5550 struct sched_entity *se = pick_eevdf(cfs_rq); 5551 if (se->sched_delayed) { 5552 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 5553 /* 5554 * Must not reference @se again, see __block_task(). 5555 */ 5556 return NULL; 5557 } 5558 return se; 5559 } 5560 5561 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); 5562 5563 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 5564 { 5565 /* 5566 * If still on the runqueue then deactivate_task() 5567 * was not called and update_curr() has to be done: 5568 */ 5569 if (prev->on_rq) 5570 update_curr(cfs_rq); 5571 5572 /* throttle cfs_rqs exceeding runtime */ 5573 check_cfs_rq_runtime(cfs_rq); 5574 5575 if (prev->on_rq) { 5576 update_stats_wait_start_fair(cfs_rq, prev); 5577 /* Put 'current' back into the tree. */ 5578 __enqueue_entity(cfs_rq, prev); 5579 /* in !on_rq case, update occurred at dequeue */ 5580 update_load_avg(cfs_rq, prev, 0); 5581 } 5582 SCHED_WARN_ON(cfs_rq->curr != prev); 5583 cfs_rq->curr = NULL; 5584 } 5585 5586 static void 5587 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) 5588 { 5589 /* 5590 * Update run-time statistics of the 'current'. 5591 */ 5592 update_curr(cfs_rq); 5593 5594 /* 5595 * Ensure that runnable average is periodically updated. 5596 */ 5597 update_load_avg(cfs_rq, curr, UPDATE_TG); 5598 update_cfs_group(curr); 5599 5600 #ifdef CONFIG_SCHED_HRTICK 5601 /* 5602 * queued ticks are scheduled to match the slice, so don't bother 5603 * validating it and just reschedule. 
5604 */ 5605 if (queued) { 5606 resched_curr_lazy(rq_of(cfs_rq)); 5607 return; 5608 } 5609 #endif 5610 } 5611 5612 5613 /************************************************** 5614 * CFS bandwidth control machinery 5615 */ 5616 5617 #ifdef CONFIG_CFS_BANDWIDTH 5618 5619 #ifdef CONFIG_JUMP_LABEL 5620 static struct static_key __cfs_bandwidth_used; 5621 5622 static inline bool cfs_bandwidth_used(void) 5623 { 5624 return static_key_false(&__cfs_bandwidth_used); 5625 } 5626 5627 void cfs_bandwidth_usage_inc(void) 5628 { 5629 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used); 5630 } 5631 5632 void cfs_bandwidth_usage_dec(void) 5633 { 5634 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); 5635 } 5636 #else /* CONFIG_JUMP_LABEL */ 5637 static bool cfs_bandwidth_used(void) 5638 { 5639 return true; 5640 } 5641 5642 void cfs_bandwidth_usage_inc(void) {} 5643 void cfs_bandwidth_usage_dec(void) {} 5644 #endif /* CONFIG_JUMP_LABEL */ 5645 5646 /* 5647 * default period for cfs group bandwidth. 5648 * default: 0.1s, units: nanoseconds 5649 */ 5650 static inline u64 default_cfs_period(void) 5651 { 5652 return 100000000ULL; 5653 } 5654 5655 static inline u64 sched_cfs_bandwidth_slice(void) 5656 { 5657 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; 5658 } 5659 5660 /* 5661 * Replenish runtime according to assigned quota. We use sched_clock_cpu 5662 * directly instead of rq->clock to avoid adding additional synchronization 5663 * around rq->lock. 5664 * 5665 * requires cfs_b->lock 5666 */ 5667 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) 5668 { 5669 s64 runtime; 5670 5671 if (unlikely(cfs_b->quota == RUNTIME_INF)) 5672 return; 5673 5674 cfs_b->runtime += cfs_b->quota; 5675 runtime = cfs_b->runtime_snap - cfs_b->runtime; 5676 if (runtime > 0) { 5677 cfs_b->burst_time += runtime; 5678 cfs_b->nr_burst++; 5679 } 5680 5681 cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst); 5682 cfs_b->runtime_snap = cfs_b->runtime; 5683 } 5684 5685 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) 5686 { 5687 return &tg->cfs_bandwidth; 5688 } 5689 5690 /* returns 0 on failure to allocate runtime */ 5691 static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, 5692 struct cfs_rq *cfs_rq, u64 target_runtime) 5693 { 5694 u64 min_amount, amount = 0; 5695 5696 lockdep_assert_held(&cfs_b->lock); 5697 5698 /* note: this is a positive sum as runtime_remaining <= 0 */ 5699 min_amount = target_runtime - cfs_rq->runtime_remaining; 5700 5701 if (cfs_b->quota == RUNTIME_INF) 5702 amount = min_amount; 5703 else { 5704 start_cfs_bandwidth(cfs_b); 5705 5706 if (cfs_b->runtime > 0) { 5707 amount = min(cfs_b->runtime, min_amount); 5708 cfs_b->runtime -= amount; 5709 cfs_b->idle = 0; 5710 } 5711 } 5712 5713 cfs_rq->runtime_remaining += amount; 5714 5715 return cfs_rq->runtime_remaining > 0; 5716 } 5717 5718 /* returns 0 on failure to allocate runtime */ 5719 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 5720 { 5721 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 5722 int ret; 5723 5724 raw_spin_lock(&cfs_b->lock); 5725 ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); 5726 raw_spin_unlock(&cfs_b->lock); 5727 5728 return ret; 5729 } 5730 5731 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) 5732 { 5733 /* dock delta_exec before expiring quota (as it could span periods) */ 5734 cfs_rq->runtime_remaining -= delta_exec; 5735 5736 if (likely(cfs_rq->runtime_remaining > 0)) 5737 return; 5738 
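	/* Already throttled: nothing more to do until the cfs_rq is unthrottled. */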
5739 if (cfs_rq->throttled) 5740 return; 5741 /* 5742 * if we're unable to extend our runtime we resched so that the active 5743 * hierarchy can be throttled 5744 */ 5745 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) 5746 resched_curr(rq_of(cfs_rq)); 5747 } 5748 5749 static __always_inline 5750 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) 5751 { 5752 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) 5753 return; 5754 5755 __account_cfs_rq_runtime(cfs_rq, delta_exec); 5756 } 5757 5758 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 5759 { 5760 return cfs_bandwidth_used() && cfs_rq->throttled; 5761 } 5762 5763 /* check whether cfs_rq, or any parent, is throttled */ 5764 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) 5765 { 5766 return cfs_bandwidth_used() && cfs_rq->throttle_count; 5767 } 5768 5769 /* 5770 * Ensure that neither of the group entities corresponding to src_cpu or 5771 * dest_cpu are members of a throttled hierarchy when performing group 5772 * load-balance operations. 5773 */ 5774 static inline int throttled_lb_pair(struct task_group *tg, 5775 int src_cpu, int dest_cpu) 5776 { 5777 struct cfs_rq *src_cfs_rq, *dest_cfs_rq; 5778 5779 src_cfs_rq = tg->cfs_rq[src_cpu]; 5780 dest_cfs_rq = tg->cfs_rq[dest_cpu]; 5781 5782 return throttled_hierarchy(src_cfs_rq) || 5783 throttled_hierarchy(dest_cfs_rq); 5784 } 5785 5786 static int tg_unthrottle_up(struct task_group *tg, void *data) 5787 { 5788 struct rq *rq = data; 5789 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 5790 5791 cfs_rq->throttle_count--; 5792 if (!cfs_rq->throttle_count) { 5793 cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - 5794 cfs_rq->throttled_clock_pelt; 5795 5796 /* Add cfs_rq with load or one or more already running entities to the list */ 5797 if (!cfs_rq_is_decayed(cfs_rq)) 5798 list_add_leaf_cfs_rq(cfs_rq); 5799 5800 if (cfs_rq->throttled_clock_self) { 5801 u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; 5802 5803 cfs_rq->throttled_clock_self = 0; 5804 5805 if (SCHED_WARN_ON((s64)delta < 0)) 5806 delta = 0; 5807 5808 cfs_rq->throttled_clock_self_time += delta; 5809 } 5810 } 5811 5812 return 0; 5813 } 5814 5815 static int tg_throttle_down(struct task_group *tg, void *data) 5816 { 5817 struct rq *rq = data; 5818 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 5819 5820 /* group is entering throttled state, stop time */ 5821 if (!cfs_rq->throttle_count) { 5822 cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); 5823 list_del_leaf_cfs_rq(cfs_rq); 5824 5825 SCHED_WARN_ON(cfs_rq->throttled_clock_self); 5826 if (cfs_rq->nr_running) 5827 cfs_rq->throttled_clock_self = rq_clock(rq); 5828 } 5829 cfs_rq->throttle_count++; 5830 5831 return 0; 5832 } 5833 5834 static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) 5835 { 5836 struct rq *rq = rq_of(cfs_rq); 5837 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 5838 struct sched_entity *se; 5839 long task_delta, idle_task_delta, delayed_delta, dequeue = 1; 5840 long rq_h_nr_running = rq->cfs.h_nr_running; 5841 5842 raw_spin_lock(&cfs_b->lock); 5843 /* This will start the period timer if necessary */ 5844 if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) { 5845 /* 5846 * We have raced with bandwidth becoming available, and if we 5847 * actually throttled the timer might not unthrottle us for an 5848 * entire period. 
We additionally needed to make sure that any 5849 * subsequent check_cfs_rq_runtime calls agree not to throttle 5850 * us, as we may commit to do cfs put_prev+pick_next, so we ask 5851 * for 1ns of runtime rather than just check cfs_b. 5852 */ 5853 dequeue = 0; 5854 } else { 5855 list_add_tail_rcu(&cfs_rq->throttled_list, 5856 &cfs_b->throttled_cfs_rq); 5857 } 5858 raw_spin_unlock(&cfs_b->lock); 5859 5860 if (!dequeue) 5861 return false; /* Throttle no longer required. */ 5862 5863 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; 5864 5865 /* freeze hierarchy runnable averages while throttled */ 5866 rcu_read_lock(); 5867 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); 5868 rcu_read_unlock(); 5869 5870 task_delta = cfs_rq->h_nr_running; 5871 idle_task_delta = cfs_rq->idle_h_nr_running; 5872 delayed_delta = cfs_rq->h_nr_delayed; 5873 for_each_sched_entity(se) { 5874 struct cfs_rq *qcfs_rq = cfs_rq_of(se); 5875 int flags; 5876 5877 /* throttled entity or throttle-on-deactivate */ 5878 if (!se->on_rq) 5879 goto done; 5880 5881 /* 5882 * Abuse SPECIAL to avoid delayed dequeue in this instance. 5883 * This avoids teaching dequeue_entities() about throttled 5884 * entities and keeps things relatively simple. 5885 */ 5886 flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; 5887 if (se->sched_delayed) 5888 flags |= DEQUEUE_DELAYED; 5889 dequeue_entity(qcfs_rq, se, flags); 5890 5891 if (cfs_rq_is_idle(group_cfs_rq(se))) 5892 idle_task_delta = cfs_rq->h_nr_running; 5893 5894 qcfs_rq->h_nr_running -= task_delta; 5895 qcfs_rq->idle_h_nr_running -= idle_task_delta; 5896 qcfs_rq->h_nr_delayed -= delayed_delta; 5897 5898 if (qcfs_rq->load.weight) { 5899 /* Avoid re-evaluating load for this entity: */ 5900 se = parent_entity(se); 5901 break; 5902 } 5903 } 5904 5905 for_each_sched_entity(se) { 5906 struct cfs_rq *qcfs_rq = cfs_rq_of(se); 5907 /* throttled entity or throttle-on-deactivate */ 5908 if (!se->on_rq) 5909 goto done; 5910 5911 update_load_avg(qcfs_rq, se, 0); 5912 se_update_runnable(se); 5913 5914 if (cfs_rq_is_idle(group_cfs_rq(se))) 5915 idle_task_delta = cfs_rq->h_nr_running; 5916 5917 qcfs_rq->h_nr_running -= task_delta; 5918 qcfs_rq->idle_h_nr_running -= idle_task_delta; 5919 qcfs_rq->h_nr_delayed -= delayed_delta; 5920 } 5921 5922 /* At this point se is NULL and we are at root level*/ 5923 sub_nr_running(rq, task_delta); 5924 5925 /* Stop the fair server if throttling resulted in no runnable tasks */ 5926 if (rq_h_nr_running && !rq->cfs.h_nr_running) 5927 dl_server_stop(&rq->fair_server); 5928 done: 5929 /* 5930 * Note: distribution will already see us throttled via the 5931 * throttled-list. rq->lock protects completion. 
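 * (For illustration, in the common case the loops above have dequeued
 * the group se from its parent cfs_rq, subtracted task_delta,
 * idle_task_delta and delayed_delta at every level, and removed
 * task_delta from the rq-wide nr_running, so the throttled subtree is
 * invisible to pick_next until unthrottle_cfs_rq() reverses all of this.)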
5932 */ 5933 cfs_rq->throttled = 1; 5934 SCHED_WARN_ON(cfs_rq->throttled_clock); 5935 if (cfs_rq->nr_running) 5936 cfs_rq->throttled_clock = rq_clock(rq); 5937 return true; 5938 } 5939 5940 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) 5941 { 5942 struct rq *rq = rq_of(cfs_rq); 5943 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 5944 struct sched_entity *se; 5945 long task_delta, idle_task_delta, delayed_delta; 5946 long rq_h_nr_running = rq->cfs.h_nr_running; 5947 5948 se = cfs_rq->tg->se[cpu_of(rq)]; 5949 5950 cfs_rq->throttled = 0; 5951 5952 update_rq_clock(rq); 5953 5954 raw_spin_lock(&cfs_b->lock); 5955 if (cfs_rq->throttled_clock) { 5956 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; 5957 cfs_rq->throttled_clock = 0; 5958 } 5959 list_del_rcu(&cfs_rq->throttled_list); 5960 raw_spin_unlock(&cfs_b->lock); 5961 5962 /* update hierarchical throttle state */ 5963 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); 5964 5965 if (!cfs_rq->load.weight) { 5966 if (!cfs_rq->on_list) 5967 return; 5968 /* 5969 * Nothing to run but something to decay (on_list)? 5970 * Complete the branch. 5971 */ 5972 for_each_sched_entity(se) { 5973 if (list_add_leaf_cfs_rq(cfs_rq_of(se))) 5974 break; 5975 } 5976 goto unthrottle_throttle; 5977 } 5978 5979 task_delta = cfs_rq->h_nr_running; 5980 idle_task_delta = cfs_rq->idle_h_nr_running; 5981 delayed_delta = cfs_rq->h_nr_delayed; 5982 for_each_sched_entity(se) { 5983 struct cfs_rq *qcfs_rq = cfs_rq_of(se); 5984 5985 /* Handle any unfinished DELAY_DEQUEUE business first. */ 5986 if (se->sched_delayed) { 5987 int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; 5988 5989 dequeue_entity(qcfs_rq, se, flags); 5990 } else if (se->on_rq) 5991 break; 5992 enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); 5993 5994 if (cfs_rq_is_idle(group_cfs_rq(se))) 5995 idle_task_delta = cfs_rq->h_nr_running; 5996 5997 qcfs_rq->h_nr_running += task_delta; 5998 qcfs_rq->idle_h_nr_running += idle_task_delta; 5999 qcfs_rq->h_nr_delayed += delayed_delta; 6000 6001 /* end evaluation on encountering a throttled cfs_rq */ 6002 if (cfs_rq_throttled(qcfs_rq)) 6003 goto unthrottle_throttle; 6004 } 6005 6006 for_each_sched_entity(se) { 6007 struct cfs_rq *qcfs_rq = cfs_rq_of(se); 6008 6009 update_load_avg(qcfs_rq, se, UPDATE_TG); 6010 se_update_runnable(se); 6011 6012 if (cfs_rq_is_idle(group_cfs_rq(se))) 6013 idle_task_delta = cfs_rq->h_nr_running; 6014 6015 qcfs_rq->h_nr_running += task_delta; 6016 qcfs_rq->idle_h_nr_running += idle_task_delta; 6017 qcfs_rq->h_nr_delayed += delayed_delta; 6018 6019 /* end evaluation on encountering a throttled cfs_rq */ 6020 if (cfs_rq_throttled(qcfs_rq)) 6021 goto unthrottle_throttle; 6022 } 6023 6024 /* Start the fair server if un-throttling resulted in new runnable tasks */ 6025 if (!rq_h_nr_running && rq->cfs.h_nr_running) 6026 dl_server_start(&rq->fair_server); 6027 6028 /* At this point se is NULL and we are at root level*/ 6029 add_nr_running(rq, task_delta); 6030 6031 unthrottle_throttle: 6032 assert_list_leaf_cfs_rq(rq); 6033 6034 /* Determine whether we need to wake up potentially idle CPU: */ 6035 if (rq->curr == rq->idle && rq->cfs.nr_running) 6036 resched_curr(rq); 6037 } 6038 6039 #ifdef CONFIG_SMP 6040 static void __cfsb_csd_unthrottle(void *arg) 6041 { 6042 struct cfs_rq *cursor, *tmp; 6043 struct rq *rq = arg; 6044 struct rq_flags rf; 6045 6046 rq_lock(rq, &rf); 6047 6048 /* 6049 * Iterating over the list can trigger several call to 6050 * update_rq_clock() in unthrottle_cfs_rq(). 
6051 * Do it once and skip the potential next ones. 6052 */ 6053 update_rq_clock(rq); 6054 rq_clock_start_loop_update(rq); 6055 6056 /* 6057 * Since we hold rq lock we're safe from concurrent manipulation of 6058 * the CSD list. However, this RCU critical section annotates the 6059 * fact that we pair with sched_free_group_rcu(), so that we cannot 6060 * race with group being freed in the window between removing it 6061 * from the list and advancing to the next entry in the list. 6062 */ 6063 rcu_read_lock(); 6064 6065 list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list, 6066 throttled_csd_list) { 6067 list_del_init(&cursor->throttled_csd_list); 6068 6069 if (cfs_rq_throttled(cursor)) 6070 unthrottle_cfs_rq(cursor); 6071 } 6072 6073 rcu_read_unlock(); 6074 6075 rq_clock_stop_loop_update(rq); 6076 rq_unlock(rq, &rf); 6077 } 6078 6079 static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) 6080 { 6081 struct rq *rq = rq_of(cfs_rq); 6082 bool first; 6083 6084 if (rq == this_rq()) { 6085 unthrottle_cfs_rq(cfs_rq); 6086 return; 6087 } 6088 6089 /* Already enqueued */ 6090 if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) 6091 return; 6092 6093 first = list_empty(&rq->cfsb_csd_list); 6094 list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list); 6095 if (first) 6096 smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); 6097 } 6098 #else 6099 static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) 6100 { 6101 unthrottle_cfs_rq(cfs_rq); 6102 } 6103 #endif 6104 6105 static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) 6106 { 6107 lockdep_assert_rq_held(rq_of(cfs_rq)); 6108 6109 if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || 6110 cfs_rq->runtime_remaining <= 0)) 6111 return; 6112 6113 __unthrottle_cfs_rq_async(cfs_rq); 6114 } 6115 6116 static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) 6117 { 6118 int this_cpu = smp_processor_id(); 6119 u64 runtime, remaining = 1; 6120 bool throttled = false; 6121 struct cfs_rq *cfs_rq, *tmp; 6122 struct rq_flags rf; 6123 struct rq *rq; 6124 LIST_HEAD(local_unthrottle); 6125 6126 rcu_read_lock(); 6127 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, 6128 throttled_list) { 6129 rq = rq_of(cfs_rq); 6130 6131 if (!remaining) { 6132 throttled = true; 6133 break; 6134 } 6135 6136 rq_lock_irqsave(rq, &rf); 6137 if (!cfs_rq_throttled(cfs_rq)) 6138 goto next; 6139 6140 /* Already queued for async unthrottle */ 6141 if (!list_empty(&cfs_rq->throttled_csd_list)) 6142 goto next; 6143 6144 /* By the above checks, this should never be true */ 6145 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); 6146 6147 raw_spin_lock(&cfs_b->lock); 6148 runtime = -cfs_rq->runtime_remaining + 1; 6149 if (runtime > cfs_b->runtime) 6150 runtime = cfs_b->runtime; 6151 cfs_b->runtime -= runtime; 6152 remaining = cfs_b->runtime; 6153 raw_spin_unlock(&cfs_b->lock); 6154 6155 cfs_rq->runtime_remaining += runtime; 6156 6157 /* we check whether we're throttled above */ 6158 if (cfs_rq->runtime_remaining > 0) { 6159 if (cpu_of(rq) != this_cpu) { 6160 unthrottle_cfs_rq_async(cfs_rq); 6161 } else { 6162 /* 6163 * We currently only expect to be unthrottling 6164 * a single cfs_rq locally. 
6165 */ 6166 SCHED_WARN_ON(!list_empty(&local_unthrottle)); 6167 list_add_tail(&cfs_rq->throttled_csd_list, 6168 &local_unthrottle); 6169 } 6170 } else { 6171 throttled = true; 6172 } 6173 6174 next: 6175 rq_unlock_irqrestore(rq, &rf); 6176 } 6177 6178 list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle, 6179 throttled_csd_list) { 6180 struct rq *rq = rq_of(cfs_rq); 6181 6182 rq_lock_irqsave(rq, &rf); 6183 6184 list_del_init(&cfs_rq->throttled_csd_list); 6185 6186 if (cfs_rq_throttled(cfs_rq)) 6187 unthrottle_cfs_rq(cfs_rq); 6188 6189 rq_unlock_irqrestore(rq, &rf); 6190 } 6191 SCHED_WARN_ON(!list_empty(&local_unthrottle)); 6192 6193 rcu_read_unlock(); 6194 6195 return throttled; 6196 } 6197 6198 /* 6199 * Responsible for refilling a task_group's bandwidth and unthrottling its 6200 * cfs_rqs as appropriate. If there has been no activity within the last 6201 * period the timer is deactivated until scheduling resumes; cfs_b->idle is 6202 * used to track this state. 6203 */ 6204 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) 6205 { 6206 int throttled; 6207 6208 /* no need to continue the timer with no bandwidth constraint */ 6209 if (cfs_b->quota == RUNTIME_INF) 6210 goto out_deactivate; 6211 6212 throttled = !list_empty(&cfs_b->throttled_cfs_rq); 6213 cfs_b->nr_periods += overrun; 6214 6215 /* Refill extra burst quota even if cfs_b->idle */ 6216 __refill_cfs_bandwidth_runtime(cfs_b); 6217 6218 /* 6219 * idle depends on !throttled (for the case of a large deficit), and if 6220 * we're going inactive then everything else can be deferred 6221 */ 6222 if (cfs_b->idle && !throttled) 6223 goto out_deactivate; 6224 6225 if (!throttled) { 6226 /* mark as potentially idle for the upcoming period */ 6227 cfs_b->idle = 1; 6228 return 0; 6229 } 6230 6231 /* account preceding periods in which throttling occurred */ 6232 cfs_b->nr_throttled += overrun; 6233 6234 /* 6235 * This check is repeated as we release cfs_b->lock while we unthrottle. 6236 */ 6237 while (throttled && cfs_b->runtime > 0) { 6238 raw_spin_unlock_irqrestore(&cfs_b->lock, flags); 6239 /* we can't nest cfs_b->lock while distributing bandwidth */ 6240 throttled = distribute_cfs_runtime(cfs_b); 6241 raw_spin_lock_irqsave(&cfs_b->lock, flags); 6242 } 6243 6244 /* 6245 * While we are ensured activity in the period following an 6246 * unthrottle, this also covers the case in which the new bandwidth is 6247 * insufficient to cover the existing bandwidth deficit. (Forcing the 6248 * timer to remain active while there are any throttled entities.) 6249 */ 6250 cfs_b->idle = 0; 6251 6252 return 0; 6253 6254 out_deactivate: 6255 return 1; 6256 } 6257 6258 /* a cfs_rq won't donate quota below this amount */ 6259 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; 6260 /* minimum remaining period time to redistribute slack quota */ 6261 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; 6262 /* how long we wait to gather additional slack before distributing */ 6263 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; 6264 6265 /* 6266 * Are we near the end of the current quota period? 6267 * 6268 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the 6269 * hrtimer base being cleared by hrtimer_start. In the case of 6270 * migrate_hrtimers, base is never cleared, so we are fine. 
6271 */ 6272 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) 6273 { 6274 struct hrtimer *refresh_timer = &cfs_b->period_timer; 6275 s64 remaining; 6276 6277 /* if the call-back is running a quota refresh is already occurring */ 6278 if (hrtimer_callback_running(refresh_timer)) 6279 return 1; 6280 6281 /* is a quota refresh about to occur? */ 6282 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); 6283 if (remaining < (s64)min_expire) 6284 return 1; 6285 6286 return 0; 6287 } 6288 6289 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) 6290 { 6291 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration; 6292 6293 /* if there's a quota refresh soon don't bother with slack */ 6294 if (runtime_refresh_within(cfs_b, min_left)) 6295 return; 6296 6297 /* don't push forwards an existing deferred unthrottle */ 6298 if (cfs_b->slack_started) 6299 return; 6300 cfs_b->slack_started = true; 6301 6302 hrtimer_start(&cfs_b->slack_timer, 6303 ns_to_ktime(cfs_bandwidth_slack_period), 6304 HRTIMER_MODE_REL); 6305 } 6306 6307 /* we know any runtime found here is valid as update_curr() precedes return */ 6308 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) 6309 { 6310 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 6311 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime; 6312 6313 if (slack_runtime <= 0) 6314 return; 6315 6316 raw_spin_lock(&cfs_b->lock); 6317 if (cfs_b->quota != RUNTIME_INF) { 6318 cfs_b->runtime += slack_runtime; 6319 6320 /* we are under rq->lock, defer unthrottling using a timer */ 6321 if (cfs_b->runtime > sched_cfs_bandwidth_slice() && 6322 !list_empty(&cfs_b->throttled_cfs_rq)) 6323 start_cfs_slack_bandwidth(cfs_b); 6324 } 6325 raw_spin_unlock(&cfs_b->lock); 6326 6327 /* even if it's not valid for return we don't want to try again */ 6328 cfs_rq->runtime_remaining -= slack_runtime; 6329 } 6330 6331 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) 6332 { 6333 if (!cfs_bandwidth_used()) 6334 return; 6335 6336 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) 6337 return; 6338 6339 __return_cfs_rq_runtime(cfs_rq); 6340 } 6341 6342 /* 6343 * This is done with a timer (instead of inline with bandwidth return) since 6344 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs. 6345 */ 6346 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) 6347 { 6348 u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); 6349 unsigned long flags; 6350 6351 /* confirm we're still not at a refresh boundary */ 6352 raw_spin_lock_irqsave(&cfs_b->lock, flags); 6353 cfs_b->slack_started = false; 6354 6355 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { 6356 raw_spin_unlock_irqrestore(&cfs_b->lock, flags); 6357 return; 6358 } 6359 6360 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) 6361 runtime = cfs_b->runtime; 6362 6363 raw_spin_unlock_irqrestore(&cfs_b->lock, flags); 6364 6365 if (!runtime) 6366 return; 6367 6368 distribute_cfs_runtime(cfs_b); 6369 } 6370 6371 /* 6372 * When a group wakes up we want to make sure that its quota is not already 6373 * expired/exceeded, otherwise it may be allowed to steal additional ticks of 6374 * runtime as update_curr() throttling can not trigger until it's on-rq. 
6375 */ 6376 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) 6377 { 6378 if (!cfs_bandwidth_used()) 6379 return; 6380 6381 /* an active group must be handled by the update_curr()->put() path */ 6382 if (!cfs_rq->runtime_enabled || cfs_rq->curr) 6383 return; 6384 6385 /* ensure the group is not already throttled */ 6386 if (cfs_rq_throttled(cfs_rq)) 6387 return; 6388 6389 /* update runtime allocation */ 6390 account_cfs_rq_runtime(cfs_rq, 0); 6391 if (cfs_rq->runtime_remaining <= 0) 6392 throttle_cfs_rq(cfs_rq); 6393 } 6394 6395 static void sync_throttle(struct task_group *tg, int cpu) 6396 { 6397 struct cfs_rq *pcfs_rq, *cfs_rq; 6398 6399 if (!cfs_bandwidth_used()) 6400 return; 6401 6402 if (!tg->parent) 6403 return; 6404 6405 cfs_rq = tg->cfs_rq[cpu]; 6406 pcfs_rq = tg->parent->cfs_rq[cpu]; 6407 6408 cfs_rq->throttle_count = pcfs_rq->throttle_count; 6409 cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); 6410 } 6411 6412 /* conditionally throttle active cfs_rq's from put_prev_entity() */ 6413 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 6414 { 6415 if (!cfs_bandwidth_used()) 6416 return false; 6417 6418 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 6419 return false; 6420 6421 /* 6422 * it's possible for a throttled entity to be forced into a running 6423 * state (e.g. set_curr_task), in this case we're finished. 6424 */ 6425 if (cfs_rq_throttled(cfs_rq)) 6426 return true; 6427 6428 return throttle_cfs_rq(cfs_rq); 6429 } 6430 6431 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 6432 { 6433 struct cfs_bandwidth *cfs_b = 6434 container_of(timer, struct cfs_bandwidth, slack_timer); 6435 6436 do_sched_cfs_slack_timer(cfs_b); 6437 6438 return HRTIMER_NORESTART; 6439 } 6440 6441 extern const u64 max_cfs_quota_period; 6442 6443 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) 6444 { 6445 struct cfs_bandwidth *cfs_b = 6446 container_of(timer, struct cfs_bandwidth, period_timer); 6447 unsigned long flags; 6448 int overrun; 6449 int idle = 0; 6450 int count = 0; 6451 6452 raw_spin_lock_irqsave(&cfs_b->lock, flags); 6453 for (;;) { 6454 overrun = hrtimer_forward_now(timer, cfs_b->period); 6455 if (!overrun) 6456 break; 6457 6458 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); 6459 6460 if (++count > 3) { 6461 u64 new, old = ktime_to_ns(cfs_b->period); 6462 6463 /* 6464 * Grow period by a factor of 2 to avoid losing precision. 6465 * Precision loss in the quota/period ratio can cause __cfs_schedulable 6466 * to fail. 6467 */ 6468 new = old * 2; 6469 if (new < max_cfs_quota_period) { 6470 cfs_b->period = ns_to_ktime(new); 6471 cfs_b->quota *= 2; 6472 cfs_b->burst *= 2; 6473 6474 pr_warn_ratelimited( 6475 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", 6476 smp_processor_id(), 6477 div_u64(new, NSEC_PER_USEC), 6478 div_u64(cfs_b->quota, NSEC_PER_USEC)); 6479 } else { 6480 pr_warn_ratelimited( 6481 "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n", 6482 smp_processor_id(), 6483 div_u64(old, NSEC_PER_USEC), 6484 div_u64(cfs_b->quota, NSEC_PER_USEC)); 6485 } 6486 6487 /* reset count so we don't come right back in here */ 6488 count = 0; 6489 } 6490 } 6491 if (idle) 6492 cfs_b->period_active = 0; 6493 raw_spin_unlock_irqrestore(&cfs_b->lock, flags); 6494 6495 return idle ? 
HRTIMER_NORESTART : HRTIMER_RESTART; 6496 } 6497 6498 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) 6499 { 6500 raw_spin_lock_init(&cfs_b->lock); 6501 cfs_b->runtime = 0; 6502 cfs_b->quota = RUNTIME_INF; 6503 cfs_b->period = ns_to_ktime(default_cfs_period()); 6504 cfs_b->burst = 0; 6505 cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF; 6506 6507 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); 6508 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); 6509 cfs_b->period_timer.function = sched_cfs_period_timer; 6510 6511 /* Add a random offset so that timers interleave */ 6512 hrtimer_set_expires(&cfs_b->period_timer, 6513 get_random_u32_below(cfs_b->period)); 6514 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 6515 cfs_b->slack_timer.function = sched_cfs_slack_timer; 6516 cfs_b->slack_started = false; 6517 } 6518 6519 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) 6520 { 6521 cfs_rq->runtime_enabled = 0; 6522 INIT_LIST_HEAD(&cfs_rq->throttled_list); 6523 INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); 6524 } 6525 6526 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 6527 { 6528 lockdep_assert_held(&cfs_b->lock); 6529 6530 if (cfs_b->period_active) 6531 return; 6532 6533 cfs_b->period_active = 1; 6534 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); 6535 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); 6536 } 6537 6538 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 6539 { 6540 int __maybe_unused i; 6541 6542 /* init_cfs_bandwidth() was not called */ 6543 if (!cfs_b->throttled_cfs_rq.next) 6544 return; 6545 6546 hrtimer_cancel(&cfs_b->period_timer); 6547 hrtimer_cancel(&cfs_b->slack_timer); 6548 6549 /* 6550 * It is possible that we still have some cfs_rq's pending on a CSD 6551 * list, though this race is very rare. In order for this to occur, we 6552 * must have raced with the last task leaving the group while there 6553 * exist throttled cfs_rq(s), and the period_timer must have queued the 6554 * CSD item but the remote cpu has not yet processed it. To handle this, 6555 * we can simply flush all pending CSD work inline here. We're 6556 * guaranteed at this point that no additional cfs_rq of this group can 6557 * join a CSD list. 6558 */ 6559 #ifdef CONFIG_SMP 6560 for_each_possible_cpu(i) { 6561 struct rq *rq = cpu_rq(i); 6562 unsigned long flags; 6563 6564 if (list_empty(&rq->cfsb_csd_list)) 6565 continue; 6566 6567 local_irq_save(flags); 6568 __cfsb_csd_unthrottle(rq); 6569 local_irq_restore(flags); 6570 } 6571 #endif 6572 } 6573 6574 /* 6575 * Both these CPU hotplug callbacks race against unregister_fair_sched_group() 6576 * 6577 * The race is harmless, since modifying bandwidth settings of unhooked group 6578 * bits doesn't do much. 
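 * (Both callbacks below merely walk the task_groups list under
 * rcu_read_lock() and update per-rq / per-cfs_b state under the relevant
 * locks, so racing with a group that is concurrently being torn down is
 * benign.)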
6579 */ 6580 6581 /* cpu online callback */ 6582 static void __maybe_unused update_runtime_enabled(struct rq *rq) 6583 { 6584 struct task_group *tg; 6585 6586 lockdep_assert_rq_held(rq); 6587 6588 rcu_read_lock(); 6589 list_for_each_entry_rcu(tg, &task_groups, list) { 6590 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 6591 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 6592 6593 raw_spin_lock(&cfs_b->lock); 6594 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; 6595 raw_spin_unlock(&cfs_b->lock); 6596 } 6597 rcu_read_unlock(); 6598 } 6599 6600 /* cpu offline callback */ 6601 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) 6602 { 6603 struct task_group *tg; 6604 6605 lockdep_assert_rq_held(rq); 6606 6607 /* 6608 * The rq clock has already been updated in the 6609 * set_rq_offline(), so we should skip updating 6610 * the rq clock again in unthrottle_cfs_rq(). 6611 */ 6612 rq_clock_start_loop_update(rq); 6613 6614 rcu_read_lock(); 6615 list_for_each_entry_rcu(tg, &task_groups, list) { 6616 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 6617 6618 if (!cfs_rq->runtime_enabled) 6619 continue; 6620 6621 /* 6622 * clock_task is not advancing so we just need to make sure 6623 * there's some valid quota amount 6624 */ 6625 cfs_rq->runtime_remaining = 1; 6626 /* 6627 * Offline rq is schedulable till CPU is completely disabled 6628 * in take_cpu_down(), so we prevent new cfs throttling here. 6629 */ 6630 cfs_rq->runtime_enabled = 0; 6631 6632 if (cfs_rq_throttled(cfs_rq)) 6633 unthrottle_cfs_rq(cfs_rq); 6634 } 6635 rcu_read_unlock(); 6636 6637 rq_clock_stop_loop_update(rq); 6638 } 6639 6640 bool cfs_task_bw_constrained(struct task_struct *p) 6641 { 6642 struct cfs_rq *cfs_rq = task_cfs_rq(p); 6643 6644 if (!cfs_bandwidth_used()) 6645 return false; 6646 6647 if (cfs_rq->runtime_enabled || 6648 tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF) 6649 return true; 6650 6651 return false; 6652 } 6653 6654 #ifdef CONFIG_NO_HZ_FULL 6655 /* called from pick_next_task_fair() */ 6656 static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) 6657 { 6658 int cpu = cpu_of(rq); 6659 6660 if (!cfs_bandwidth_used()) 6661 return; 6662 6663 if (!tick_nohz_full_cpu(cpu)) 6664 return; 6665 6666 if (rq->nr_running != 1) 6667 return; 6668 6669 /* 6670 * We know there is only one task runnable and we've just picked it. The 6671 * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will 6672 * be otherwise able to stop the tick. Just need to check if we are using 6673 * bandwidth control. 
6674 */ 6675 if (cfs_task_bw_constrained(p)) 6676 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); 6677 } 6678 #endif 6679 6680 #else /* CONFIG_CFS_BANDWIDTH */ 6681 6682 static inline bool cfs_bandwidth_used(void) 6683 { 6684 return false; 6685 } 6686 6687 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} 6688 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } 6689 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 6690 static inline void sync_throttle(struct task_group *tg, int cpu) {} 6691 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 6692 6693 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 6694 { 6695 return 0; 6696 } 6697 6698 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) 6699 { 6700 return 0; 6701 } 6702 6703 static inline int throttled_lb_pair(struct task_group *tg, 6704 int src_cpu, int dest_cpu) 6705 { 6706 return 0; 6707 } 6708 6709 #ifdef CONFIG_FAIR_GROUP_SCHED 6710 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {} 6711 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 6712 #endif 6713 6714 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) 6715 { 6716 return NULL; 6717 } 6718 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} 6719 static inline void update_runtime_enabled(struct rq *rq) {} 6720 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} 6721 #ifdef CONFIG_CGROUP_SCHED 6722 bool cfs_task_bw_constrained(struct task_struct *p) 6723 { 6724 return false; 6725 } 6726 #endif 6727 #endif /* CONFIG_CFS_BANDWIDTH */ 6728 6729 #if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL) 6730 static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {} 6731 #endif 6732 6733 /************************************************** 6734 * CFS operations on tasks: 6735 */ 6736 6737 #ifdef CONFIG_SCHED_HRTICK 6738 static void hrtick_start_fair(struct rq *rq, struct task_struct *p) 6739 { 6740 struct sched_entity *se = &p->se; 6741 6742 SCHED_WARN_ON(task_rq(p) != rq); 6743 6744 if (rq->cfs.h_nr_running > 1) { 6745 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 6746 u64 slice = se->slice; 6747 s64 delta = slice - ran; 6748 6749 if (delta < 0) { 6750 if (task_current_donor(rq, p)) 6751 resched_curr(rq); 6752 return; 6753 } 6754 hrtick_start(rq, delta); 6755 } 6756 } 6757 6758 /* 6759 * called from enqueue/dequeue and updates the hrtick when the 6760 * current task is from our class and nr_running is low enough 6761 * to matter. 
6762 */ 6763 static void hrtick_update(struct rq *rq) 6764 { 6765 struct task_struct *donor = rq->donor; 6766 6767 if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class) 6768 return; 6769 6770 hrtick_start_fair(rq, donor); 6771 } 6772 #else /* !CONFIG_SCHED_HRTICK */ 6773 static inline void 6774 hrtick_start_fair(struct rq *rq, struct task_struct *p) 6775 { 6776 } 6777 6778 static inline void hrtick_update(struct rq *rq) 6779 { 6780 } 6781 #endif 6782 6783 #ifdef CONFIG_SMP 6784 static inline bool cpu_overutilized(int cpu) 6785 { 6786 unsigned long rq_util_min, rq_util_max; 6787 6788 if (!sched_energy_enabled()) 6789 return false; 6790 6791 rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); 6792 rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); 6793 6794 /* Return true only if the utilization doesn't fit CPU's capacity */ 6795 return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); 6796 } 6797 6798 /* 6799 * overutilized value make sense only if EAS is enabled 6800 */ 6801 static inline bool is_rd_overutilized(struct root_domain *rd) 6802 { 6803 return !sched_energy_enabled() || READ_ONCE(rd->overutilized); 6804 } 6805 6806 static inline void set_rd_overutilized(struct root_domain *rd, bool flag) 6807 { 6808 if (!sched_energy_enabled()) 6809 return; 6810 6811 WRITE_ONCE(rd->overutilized, flag); 6812 trace_sched_overutilized_tp(rd, flag); 6813 } 6814 6815 static inline void check_update_overutilized_status(struct rq *rq) 6816 { 6817 /* 6818 * overutilized field is used for load balancing decisions only 6819 * if energy aware scheduler is being used 6820 */ 6821 6822 if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) 6823 set_rd_overutilized(rq->rd, 1); 6824 } 6825 #else 6826 static inline void check_update_overutilized_status(struct rq *rq) { } 6827 #endif 6828 6829 /* Runqueue only has SCHED_IDLE tasks enqueued */ 6830 static int sched_idle_rq(struct rq *rq) 6831 { 6832 return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && 6833 rq->nr_running); 6834 } 6835 6836 #ifdef CONFIG_SMP 6837 static int sched_idle_cpu(int cpu) 6838 { 6839 return sched_idle_rq(cpu_rq(cpu)); 6840 } 6841 #endif 6842 6843 static void 6844 requeue_delayed_entity(struct sched_entity *se) 6845 { 6846 struct cfs_rq *cfs_rq = cfs_rq_of(se); 6847 6848 /* 6849 * se->sched_delayed should imply: se->on_rq == 1. 6850 * Because a delayed entity is one that is still on 6851 * the runqueue competing until elegibility. 6852 */ 6853 SCHED_WARN_ON(!se->sched_delayed); 6854 SCHED_WARN_ON(!se->on_rq); 6855 6856 if (sched_feat(DELAY_ZERO)) { 6857 update_entity_lag(cfs_rq, se); 6858 if (se->vlag > 0) { 6859 cfs_rq->nr_running--; 6860 if (se != cfs_rq->curr) 6861 __dequeue_entity(cfs_rq, se); 6862 se->vlag = 0; 6863 place_entity(cfs_rq, se, 0); 6864 if (se != cfs_rq->curr) 6865 __enqueue_entity(cfs_rq, se); 6866 cfs_rq->nr_running++; 6867 } 6868 } 6869 6870 update_load_avg(cfs_rq, se, 0); 6871 clear_delayed(se); 6872 } 6873 6874 /* 6875 * The enqueue_task method is called before nr_running is 6876 * increased. 
Here we update the fair scheduling stats and 6877 * then put the task into the rbtree: 6878 */ 6879 static void 6880 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) 6881 { 6882 struct cfs_rq *cfs_rq; 6883 struct sched_entity *se = &p->se; 6884 int idle_h_nr_running = task_has_idle_policy(p); 6885 int h_nr_delayed = 0; 6886 int task_new = !(flags & ENQUEUE_WAKEUP); 6887 int rq_h_nr_running = rq->cfs.h_nr_running; 6888 u64 slice = 0; 6889 6890 /* 6891 * The code below (indirectly) updates schedutil which looks at 6892 * the cfs_rq utilization to select a frequency. 6893 * Let's add the task's estimated utilization to the cfs_rq's 6894 * estimated utilization, before we update schedutil. 6895 */ 6896 if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) 6897 util_est_enqueue(&rq->cfs, p); 6898 6899 if (flags & ENQUEUE_DELAYED) { 6900 requeue_delayed_entity(se); 6901 return; 6902 } 6903 6904 /* 6905 * If in_iowait is set, the code below may not trigger any cpufreq 6906 * utilization updates, so do it here explicitly with the IOWAIT flag 6907 * passed. 6908 */ 6909 if (p->in_iowait) 6910 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); 6911 6912 if (task_new) 6913 h_nr_delayed = !!se->sched_delayed; 6914 6915 for_each_sched_entity(se) { 6916 if (se->on_rq) { 6917 if (se->sched_delayed) 6918 requeue_delayed_entity(se); 6919 break; 6920 } 6921 cfs_rq = cfs_rq_of(se); 6922 6923 /* 6924 * Basically set the slice of group entries to the min_slice of 6925 * their respective cfs_rq. This ensures the group can service 6926 * its entities in the desired time-frame. 6927 */ 6928 if (slice) { 6929 se->slice = slice; 6930 se->custom_slice = 1; 6931 } 6932 enqueue_entity(cfs_rq, se, flags); 6933 slice = cfs_rq_min_slice(cfs_rq); 6934 6935 cfs_rq->h_nr_running++; 6936 cfs_rq->idle_h_nr_running += idle_h_nr_running; 6937 cfs_rq->h_nr_delayed += h_nr_delayed; 6938 6939 if (cfs_rq_is_idle(cfs_rq)) 6940 idle_h_nr_running = 1; 6941 6942 /* end evaluation on encountering a throttled cfs_rq */ 6943 if (cfs_rq_throttled(cfs_rq)) 6944 goto enqueue_throttle; 6945 6946 flags = ENQUEUE_WAKEUP; 6947 } 6948 6949 for_each_sched_entity(se) { 6950 cfs_rq = cfs_rq_of(se); 6951 6952 update_load_avg(cfs_rq, se, UPDATE_TG); 6953 se_update_runnable(se); 6954 update_cfs_group(se); 6955 6956 se->slice = slice; 6957 slice = cfs_rq_min_slice(cfs_rq); 6958 6959 cfs_rq->h_nr_running++; 6960 cfs_rq->idle_h_nr_running += idle_h_nr_running; 6961 cfs_rq->h_nr_delayed += h_nr_delayed; 6962 6963 if (cfs_rq_is_idle(cfs_rq)) 6964 idle_h_nr_running = 1; 6965 6966 /* end evaluation on encountering a throttled cfs_rq */ 6967 if (cfs_rq_throttled(cfs_rq)) 6968 goto enqueue_throttle; 6969 } 6970 6971 if (!rq_h_nr_running && rq->cfs.h_nr_running) { 6972 /* Account for idle runtime */ 6973 if (!rq->nr_running) 6974 dl_server_update_idle_time(rq, rq->curr); 6975 dl_server_start(&rq->fair_server); 6976 } 6977 6978 /* At this point se is NULL and we are at root level*/ 6979 add_nr_running(rq, 1); 6980 6981 /* 6982 * Since new tasks are assigned an initial util_avg equal to 6983 * half of the spare capacity of their CPU, tiny tasks have the 6984 * ability to cross the overutilized threshold, which will 6985 * result in the load balancer ruining all the task placement 6986 * done by EAS. As a way to mitigate that effect, do not account 6987 * for the first enqueue operation of new tasks during the 6988 * overutilized flag detection. 
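 * (For example, a task forked onto a lightly loaded CPU starts with a
 * util_avg of roughly half that CPU's spare capacity; counting it on its
 * very first enqueue could flip the overutilized flag even though the
 * task may turn out to be tiny.)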
6989 * 6990 * A better way of solving this problem would be to wait for 6991 * the PELT signals of tasks to converge before taking them 6992 * into account, but that is not straightforward to implement, 6993 * and the following generally works well enough in practice. 6994 */ 6995 if (!task_new) 6996 check_update_overutilized_status(rq); 6997 6998 enqueue_throttle: 6999 assert_list_leaf_cfs_rq(rq); 7000 7001 hrtick_update(rq); 7002 } 7003 7004 static void set_next_buddy(struct sched_entity *se); 7005 7006 /* 7007 * Basically dequeue_task_fair(), except it can deal with dequeue_entity() 7008 * failing half-way through and resume the dequeue later. 7009 * 7010 * Returns: 7011 * -1 - dequeue delayed 7012 * 0 - dequeue throttled 7013 * 1 - dequeue complete 7014 */ 7015 static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) 7016 { 7017 bool was_sched_idle = sched_idle_rq(rq); 7018 int rq_h_nr_running = rq->cfs.h_nr_running; 7019 bool task_sleep = flags & DEQUEUE_SLEEP; 7020 bool task_delayed = flags & DEQUEUE_DELAYED; 7021 struct task_struct *p = NULL; 7022 int idle_h_nr_running = 0; 7023 int h_nr_running = 0; 7024 int h_nr_delayed = 0; 7025 struct cfs_rq *cfs_rq; 7026 u64 slice = 0; 7027 7028 if (entity_is_task(se)) { 7029 p = task_of(se); 7030 h_nr_running = 1; 7031 idle_h_nr_running = task_has_idle_policy(p); 7032 if (!task_sleep && !task_delayed) 7033 h_nr_delayed = !!se->sched_delayed; 7034 } else { 7035 cfs_rq = group_cfs_rq(se); 7036 slice = cfs_rq_min_slice(cfs_rq); 7037 } 7038 7039 for_each_sched_entity(se) { 7040 cfs_rq = cfs_rq_of(se); 7041 7042 if (!dequeue_entity(cfs_rq, se, flags)) { 7043 if (p && &p->se == se) 7044 return -1; 7045 7046 break; 7047 } 7048 7049 cfs_rq->h_nr_running -= h_nr_running; 7050 cfs_rq->idle_h_nr_running -= idle_h_nr_running; 7051 cfs_rq->h_nr_delayed -= h_nr_delayed; 7052 7053 if (cfs_rq_is_idle(cfs_rq)) 7054 idle_h_nr_running = h_nr_running; 7055 7056 /* end evaluation on encountering a throttled cfs_rq */ 7057 if (cfs_rq_throttled(cfs_rq)) 7058 return 0; 7059 7060 /* Don't dequeue parent if it has other entities besides us */ 7061 if (cfs_rq->load.weight) { 7062 slice = cfs_rq_min_slice(cfs_rq); 7063 7064 /* Avoid re-evaluating load for this entity: */ 7065 se = parent_entity(se); 7066 /* 7067 * Bias pick_next to pick a task from this cfs_rq, as 7068 * p is sleeping when it is within its sched_slice. 
7069 */ 7070 if (task_sleep && se && !throttled_hierarchy(cfs_rq)) 7071 set_next_buddy(se); 7072 break; 7073 } 7074 flags |= DEQUEUE_SLEEP; 7075 flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL); 7076 } 7077 7078 for_each_sched_entity(se) { 7079 cfs_rq = cfs_rq_of(se); 7080 7081 update_load_avg(cfs_rq, se, UPDATE_TG); 7082 se_update_runnable(se); 7083 update_cfs_group(se); 7084 7085 se->slice = slice; 7086 slice = cfs_rq_min_slice(cfs_rq); 7087 7088 cfs_rq->h_nr_running -= h_nr_running; 7089 cfs_rq->idle_h_nr_running -= idle_h_nr_running; 7090 cfs_rq->h_nr_delayed -= h_nr_delayed; 7091 7092 if (cfs_rq_is_idle(cfs_rq)) 7093 idle_h_nr_running = h_nr_running; 7094 7095 /* end evaluation on encountering a throttled cfs_rq */ 7096 if (cfs_rq_throttled(cfs_rq)) 7097 return 0; 7098 } 7099 7100 sub_nr_running(rq, h_nr_running); 7101 7102 if (rq_h_nr_running && !rq->cfs.h_nr_running) 7103 dl_server_stop(&rq->fair_server); 7104 7105 /* balance early to pull high priority tasks */ 7106 if (unlikely(!was_sched_idle && sched_idle_rq(rq))) 7107 rq->next_balance = jiffies; 7108 7109 if (p && task_delayed) { 7110 SCHED_WARN_ON(!task_sleep); 7111 SCHED_WARN_ON(p->on_rq != 1); 7112 7113 /* Fix-up what dequeue_task_fair() skipped */ 7114 hrtick_update(rq); 7115 7116 /* 7117 * Fix-up what block_task() skipped. 7118 * 7119 * Must be last, @p might not be valid after this. 7120 */ 7121 __block_task(rq, p); 7122 } 7123 7124 return 1; 7125 } 7126 7127 /* 7128 * The dequeue_task method is called before nr_running is 7129 * decreased. We remove the task from the rbtree and 7130 * update the fair scheduling stats: 7131 */ 7132 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) 7133 { 7134 if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) 7135 util_est_dequeue(&rq->cfs, p); 7136 7137 util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); 7138 if (dequeue_entities(rq, &p->se, flags) < 0) 7139 return false; 7140 7141 /* 7142 * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED). 7143 */ 7144 7145 hrtick_update(rq); 7146 return true; 7147 } 7148 7149 #ifdef CONFIG_SMP 7150 7151 /* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ 7152 static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); 7153 static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); 7154 static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); 7155 7156 #ifdef CONFIG_NO_HZ_COMMON 7157 7158 static struct { 7159 cpumask_var_t idle_cpus_mask; 7160 atomic_t nr_cpus; 7161 int has_blocked; /* Idle CPUS has blocked load */ 7162 int needs_update; /* Newly idle CPUs need their next_balance collated */ 7163 unsigned long next_balance; /* in jiffy units */ 7164 unsigned long next_blocked; /* Next update of blocked load in jiffies */ 7165 } nohz ____cacheline_aligned; 7166 7167 #endif /* CONFIG_NO_HZ_COMMON */ 7168 7169 static unsigned long cpu_load(struct rq *rq) 7170 { 7171 return cfs_rq_load_avg(&rq->cfs); 7172 } 7173 7174 /* 7175 * cpu_load_without - compute CPU load without any contributions from *p 7176 * @cpu: the CPU which load is requested 7177 * @p: the task which load should be discounted 7178 * 7179 * The load of a CPU is defined by the load of tasks currently enqueued on that 7180 * CPU as well as tasks which are currently sleeping after an execution on that 7181 * CPU. 7182 * 7183 * This method returns the load of the specified CPU by discounting the load of 7184 * the specified task, whenever the task is currently contributing to the CPU 7185 * load. 
7186 */ 7187 static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p) 7188 { 7189 struct cfs_rq *cfs_rq; 7190 unsigned int load; 7191 7192 /* Task has no contribution or is new */ 7193 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) 7194 return cpu_load(rq); 7195 7196 cfs_rq = &rq->cfs; 7197 load = READ_ONCE(cfs_rq->avg.load_avg); 7198 7199 /* Discount task's util from CPU's util */ 7200 lsub_positive(&load, task_h_load(p)); 7201 7202 return load; 7203 } 7204 7205 static unsigned long cpu_runnable(struct rq *rq) 7206 { 7207 return cfs_rq_runnable_avg(&rq->cfs); 7208 } 7209 7210 static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p) 7211 { 7212 struct cfs_rq *cfs_rq; 7213 unsigned int runnable; 7214 7215 /* Task has no contribution or is new */ 7216 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) 7217 return cpu_runnable(rq); 7218 7219 cfs_rq = &rq->cfs; 7220 runnable = READ_ONCE(cfs_rq->avg.runnable_avg); 7221 7222 /* Discount task's runnable from CPU's runnable */ 7223 lsub_positive(&runnable, p->se.avg.runnable_avg); 7224 7225 return runnable; 7226 } 7227 7228 static unsigned long capacity_of(int cpu) 7229 { 7230 return cpu_rq(cpu)->cpu_capacity; 7231 } 7232 7233 static void record_wakee(struct task_struct *p) 7234 { 7235 /* 7236 * Only decay a single time; tasks that have less then 1 wakeup per 7237 * jiffy will not have built up many flips. 7238 */ 7239 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { 7240 current->wakee_flips >>= 1; 7241 current->wakee_flip_decay_ts = jiffies; 7242 } 7243 7244 if (current->last_wakee != p) { 7245 current->last_wakee = p; 7246 current->wakee_flips++; 7247 } 7248 } 7249 7250 /* 7251 * Detect M:N waker/wakee relationships via a switching-frequency heuristic. 7252 * 7253 * A waker of many should wake a different task than the one last awakened 7254 * at a frequency roughly N times higher than one of its wakees. 7255 * 7256 * In order to determine whether we should let the load spread vs consolidating 7257 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one 7258 * partner, and a factor of lls_size higher frequency in the other. 7259 * 7260 * With both conditions met, we can be relatively sure that the relationship is 7261 * non-monogamous, with partner count exceeding socket size. 7262 * 7263 * Waker/wakee being client/server, worker/dispatcher, interrupt source or 7264 * whatever is irrelevant, spread criteria is apparent partner count exceeds 7265 * socket size. 7266 */ 7267 static int wake_wide(struct task_struct *p) 7268 { 7269 unsigned int master = current->wakee_flips; 7270 unsigned int slave = p->wakee_flips; 7271 int factor = __this_cpu_read(sd_llc_size); 7272 7273 if (master < slave) 7274 swap(master, slave); 7275 if (slave < factor || master < slave * factor) 7276 return 0; 7277 return 1; 7278 } 7279 7280 /* 7281 * The purpose of wake_affine() is to quickly determine on which CPU we can run 7282 * soonest. For the purpose of speed we only consider the waking and previous 7283 * CPU. 7284 * 7285 * wake_affine_idle() - only considers 'now', it check if the waking CPU is 7286 * cache-affine and is (or will be) idle. 7287 * 7288 * wake_affine_weight() - considers the weight to reflect the average 7289 * scheduling latency of the CPUs. This seems to work 7290 * for the overloaded case. 
7291 */ 7292 static int 7293 wake_affine_idle(int this_cpu, int prev_cpu, int sync) 7294 { 7295 /* 7296 * If this_cpu is idle, it implies the wakeup is from interrupt 7297 * context. Only allow the move if cache is shared. Otherwise an 7298 * interrupt intensive workload could force all tasks onto one 7299 * node depending on the IO topology or IRQ affinity settings. 7300 * 7301 * If the prev_cpu is idle and cache affine then avoid a migration. 7302 * There is no guarantee that the cache hot data from an interrupt 7303 * is more important than cache hot data on the prev_cpu and from 7304 * a cpufreq perspective, it's better to have higher utilisation 7305 * on one CPU. 7306 */ 7307 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) 7308 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu; 7309 7310 if (sync && cpu_rq(this_cpu)->nr_running == 1) 7311 return this_cpu; 7312 7313 if (available_idle_cpu(prev_cpu)) 7314 return prev_cpu; 7315 7316 return nr_cpumask_bits; 7317 } 7318 7319 static int 7320 wake_affine_weight(struct sched_domain *sd, struct task_struct *p, 7321 int this_cpu, int prev_cpu, int sync) 7322 { 7323 s64 this_eff_load, prev_eff_load; 7324 unsigned long task_load; 7325 7326 this_eff_load = cpu_load(cpu_rq(this_cpu)); 7327 7328 if (sync) { 7329 unsigned long current_load = task_h_load(current); 7330 7331 if (current_load > this_eff_load) 7332 return this_cpu; 7333 7334 this_eff_load -= current_load; 7335 } 7336 7337 task_load = task_h_load(p); 7338 7339 this_eff_load += task_load; 7340 if (sched_feat(WA_BIAS)) 7341 this_eff_load *= 100; 7342 this_eff_load *= capacity_of(prev_cpu); 7343 7344 prev_eff_load = cpu_load(cpu_rq(prev_cpu)); 7345 prev_eff_load -= task_load; 7346 if (sched_feat(WA_BIAS)) 7347 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; 7348 prev_eff_load *= capacity_of(this_cpu); 7349 7350 /* 7351 * If sync, adjust the weight of prev_eff_load such that if 7352 * prev_eff == this_eff that select_idle_sibling() will consider 7353 * stacking the wakee on top of the waker if no other CPU is 7354 * idle. 7355 */ 7356 if (sync) 7357 prev_eff_load += 1; 7358 7359 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits; 7360 } 7361 7362 static int wake_affine(struct sched_domain *sd, struct task_struct *p, 7363 int this_cpu, int prev_cpu, int sync) 7364 { 7365 int target = nr_cpumask_bits; 7366 7367 if (sched_feat(WA_IDLE)) 7368 target = wake_affine_idle(this_cpu, prev_cpu, sync); 7369 7370 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits) 7371 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); 7372 7373 schedstat_inc(p->stats.nr_wakeups_affine_attempts); 7374 if (target != this_cpu) 7375 return prev_cpu; 7376 7377 schedstat_inc(sd->ttwu_move_affine); 7378 schedstat_inc(p->stats.nr_wakeups_affine); 7379 return target; 7380 } 7381 7382 static struct sched_group * 7383 sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); 7384 7385 /* 7386 * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group. 
7387 */ 7388 static int 7389 sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 7390 { 7391 unsigned long load, min_load = ULONG_MAX; 7392 unsigned int min_exit_latency = UINT_MAX; 7393 u64 latest_idle_timestamp = 0; 7394 int least_loaded_cpu = this_cpu; 7395 int shallowest_idle_cpu = -1; 7396 int i; 7397 7398 /* Check if we have any choice: */ 7399 if (group->group_weight == 1) 7400 return cpumask_first(sched_group_span(group)); 7401 7402 /* Traverse only the allowed CPUs */ 7403 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { 7404 struct rq *rq = cpu_rq(i); 7405 7406 if (!sched_core_cookie_match(rq, p)) 7407 continue; 7408 7409 if (sched_idle_cpu(i)) 7410 return i; 7411 7412 if (available_idle_cpu(i)) { 7413 struct cpuidle_state *idle = idle_get_state(rq); 7414 if (idle && idle->exit_latency < min_exit_latency) { 7415 /* 7416 * We give priority to a CPU whose idle state 7417 * has the smallest exit latency irrespective 7418 * of any idle timestamp. 7419 */ 7420 min_exit_latency = idle->exit_latency; 7421 latest_idle_timestamp = rq->idle_stamp; 7422 shallowest_idle_cpu = i; 7423 } else if ((!idle || idle->exit_latency == min_exit_latency) && 7424 rq->idle_stamp > latest_idle_timestamp) { 7425 /* 7426 * If equal or no active idle state, then 7427 * the most recently idled CPU might have 7428 * a warmer cache. 7429 */ 7430 latest_idle_timestamp = rq->idle_stamp; 7431 shallowest_idle_cpu = i; 7432 } 7433 } else if (shallowest_idle_cpu == -1) { 7434 load = cpu_load(cpu_rq(i)); 7435 if (load < min_load) { 7436 min_load = load; 7437 least_loaded_cpu = i; 7438 } 7439 } 7440 } 7441 7442 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; 7443 } 7444 7445 static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p, 7446 int cpu, int prev_cpu, int sd_flag) 7447 { 7448 int new_cpu = cpu; 7449 7450 if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) 7451 return prev_cpu; 7452 7453 /* 7454 * We need task's util for cpu_util_without, sync it up to 7455 * prev_cpu's last_update_time. 
7456 */ 7457 if (!(sd_flag & SD_BALANCE_FORK)) 7458 sync_entity_load_avg(&p->se); 7459 7460 while (sd) { 7461 struct sched_group *group; 7462 struct sched_domain *tmp; 7463 int weight; 7464 7465 if (!(sd->flags & sd_flag)) { 7466 sd = sd->child; 7467 continue; 7468 } 7469 7470 group = sched_balance_find_dst_group(sd, p, cpu); 7471 if (!group) { 7472 sd = sd->child; 7473 continue; 7474 } 7475 7476 new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu); 7477 if (new_cpu == cpu) { 7478 /* Now try balancing at a lower domain level of 'cpu': */ 7479 sd = sd->child; 7480 continue; 7481 } 7482 7483 /* Now try balancing at a lower domain level of 'new_cpu': */ 7484 cpu = new_cpu; 7485 weight = sd->span_weight; 7486 sd = NULL; 7487 for_each_domain(cpu, tmp) { 7488 if (weight <= tmp->span_weight) 7489 break; 7490 if (tmp->flags & sd_flag) 7491 sd = tmp; 7492 } 7493 } 7494 7495 return new_cpu; 7496 } 7497 7498 static inline int __select_idle_cpu(int cpu, struct task_struct *p) 7499 { 7500 if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && 7501 sched_cpu_cookie_match(cpu_rq(cpu), p)) 7502 return cpu; 7503 7504 return -1; 7505 } 7506 7507 #ifdef CONFIG_SCHED_SMT 7508 DEFINE_STATIC_KEY_FALSE(sched_smt_present); 7509 EXPORT_SYMBOL_GPL(sched_smt_present); 7510 7511 static inline void set_idle_cores(int cpu, int val) 7512 { 7513 struct sched_domain_shared *sds; 7514 7515 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 7516 if (sds) 7517 WRITE_ONCE(sds->has_idle_cores, val); 7518 } 7519 7520 static inline bool test_idle_cores(int cpu) 7521 { 7522 struct sched_domain_shared *sds; 7523 7524 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 7525 if (sds) 7526 return READ_ONCE(sds->has_idle_cores); 7527 7528 return false; 7529 } 7530 7531 /* 7532 * Scans the local SMT mask to see if the entire core is idle, and records this 7533 * information in sd_llc_shared->has_idle_cores. 7534 * 7535 * Since SMT siblings share all cache levels, inspecting this limited remote 7536 * state should be fairly cheap. 7537 */ 7538 void __update_idle_core(struct rq *rq) 7539 { 7540 int core = cpu_of(rq); 7541 int cpu; 7542 7543 rcu_read_lock(); 7544 if (test_idle_cores(core)) 7545 goto unlock; 7546 7547 for_each_cpu(cpu, cpu_smt_mask(core)) { 7548 if (cpu == core) 7549 continue; 7550 7551 if (!available_idle_cpu(cpu)) 7552 goto unlock; 7553 } 7554 7555 set_idle_cores(core, 1); 7556 unlock: 7557 rcu_read_unlock(); 7558 } 7559 7560 /* 7561 * Scan the entire LLC domain for idle cores; this dynamically switches off if 7562 * there are no idle cores left in the system; tracked through 7563 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. 7564 */ 7565 static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) 7566 { 7567 bool idle = true; 7568 int cpu; 7569 7570 for_each_cpu(cpu, cpu_smt_mask(core)) { 7571 if (!available_idle_cpu(cpu)) { 7572 idle = false; 7573 if (*idle_cpu == -1) { 7574 if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) { 7575 *idle_cpu = cpu; 7576 break; 7577 } 7578 continue; 7579 } 7580 break; 7581 } 7582 if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus)) 7583 *idle_cpu = cpu; 7584 } 7585 7586 if (idle) 7587 return core; 7588 7589 cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); 7590 return -1; 7591 } 7592 7593 /* 7594 * Scan the local SMT mask for idle CPUs. 
7595 */ 7596 static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) 7597 { 7598 int cpu; 7599 7600 for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { 7601 if (cpu == target) 7602 continue; 7603 /* 7604 * Check if the CPU is in the LLC scheduling domain of @target. 7605 * Due to isolcpus, there is no guarantee that all the siblings are in the domain. 7606 */ 7607 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) 7608 continue; 7609 if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) 7610 return cpu; 7611 } 7612 7613 return -1; 7614 } 7615 7616 #else /* CONFIG_SCHED_SMT */ 7617 7618 static inline void set_idle_cores(int cpu, int val) 7619 { 7620 } 7621 7622 static inline bool test_idle_cores(int cpu) 7623 { 7624 return false; 7625 } 7626 7627 static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) 7628 { 7629 return __select_idle_cpu(core, p); 7630 } 7631 7632 static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) 7633 { 7634 return -1; 7635 } 7636 7637 #endif /* CONFIG_SCHED_SMT */ 7638 7639 /* 7640 * Scan the LLC domain for idle CPUs; this is dynamically regulated by 7641 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the 7642 * average idle time for this rq (as found in rq->avg_idle). 7643 */ 7644 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) 7645 { 7646 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); 7647 int i, cpu, idle_cpu = -1, nr = INT_MAX; 7648 struct sched_domain_shared *sd_share; 7649 7650 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); 7651 7652 if (sched_feat(SIS_UTIL)) { 7653 sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); 7654 if (sd_share) { 7655 /* because !--nr is the condition to stop scan */ 7656 nr = READ_ONCE(sd_share->nr_idle_scan) + 1; 7657 /* overloaded LLC is unlikely to have idle cpu/core */ 7658 if (nr == 1) 7659 return -1; 7660 } 7661 } 7662 7663 if (static_branch_unlikely(&sched_cluster_active)) { 7664 struct sched_group *sg = sd->groups; 7665 7666 if (sg->flags & SD_CLUSTER) { 7667 for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) { 7668 if (!cpumask_test_cpu(cpu, cpus)) 7669 continue; 7670 7671 if (has_idle_core) { 7672 i = select_idle_core(p, cpu, cpus, &idle_cpu); 7673 if ((unsigned int)i < nr_cpumask_bits) 7674 return i; 7675 } else { 7676 if (--nr <= 0) 7677 return -1; 7678 idle_cpu = __select_idle_cpu(cpu, p); 7679 if ((unsigned int)idle_cpu < nr_cpumask_bits) 7680 return idle_cpu; 7681 } 7682 } 7683 cpumask_andnot(cpus, cpus, sched_group_span(sg)); 7684 } 7685 } 7686 7687 for_each_cpu_wrap(cpu, cpus, target + 1) { 7688 if (has_idle_core) { 7689 i = select_idle_core(p, cpu, cpus, &idle_cpu); 7690 if ((unsigned int)i < nr_cpumask_bits) 7691 return i; 7692 7693 } else { 7694 if (--nr <= 0) 7695 return -1; 7696 idle_cpu = __select_idle_cpu(cpu, p); 7697 if ((unsigned int)idle_cpu < nr_cpumask_bits) 7698 break; 7699 } 7700 } 7701 7702 if (has_idle_core) 7703 set_idle_cores(target, false); 7704 7705 return idle_cpu; 7706 } 7707 7708 /* 7709 * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which 7710 * the task fits. If no CPU is big enough, but there are idle ones, try to 7711 * maximize capacity. 
7712 */ 7713 static int 7714 select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) 7715 { 7716 unsigned long task_util, util_min, util_max, best_cap = 0; 7717 int fits, best_fits = 0; 7718 int cpu, best_cpu = -1; 7719 struct cpumask *cpus; 7720 7721 cpus = this_cpu_cpumask_var_ptr(select_rq_mask); 7722 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); 7723 7724 task_util = task_util_est(p); 7725 util_min = uclamp_eff_value(p, UCLAMP_MIN); 7726 util_max = uclamp_eff_value(p, UCLAMP_MAX); 7727 7728 for_each_cpu_wrap(cpu, cpus, target) { 7729 unsigned long cpu_cap = capacity_of(cpu); 7730 7731 if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) 7732 continue; 7733 7734 fits = util_fits_cpu(task_util, util_min, util_max, cpu); 7735 7736 /* This CPU fits with all requirements */ 7737 if (fits > 0) 7738 return cpu; 7739 /* 7740 * Only the min performance hint (i.e. uclamp_min) doesn't fit. 7741 * Look for the CPU with best capacity. 7742 */ 7743 else if (fits < 0) 7744 cpu_cap = get_actual_cpu_capacity(cpu); 7745 7746 /* 7747 * First, select CPU which fits better (-1 being better than 0). 7748 * Then, select the one with best capacity at same level. 7749 */ 7750 if ((fits < best_fits) || 7751 ((fits == best_fits) && (cpu_cap > best_cap))) { 7752 best_cap = cpu_cap; 7753 best_cpu = cpu; 7754 best_fits = fits; 7755 } 7756 } 7757 7758 return best_cpu; 7759 } 7760 7761 static inline bool asym_fits_cpu(unsigned long util, 7762 unsigned long util_min, 7763 unsigned long util_max, 7764 int cpu) 7765 { 7766 if (sched_asym_cpucap_active()) 7767 /* 7768 * Return true only if the cpu fully fits the task requirements 7769 * which include the utilization and the performance hints. 7770 */ 7771 return (util_fits_cpu(util, util_min, util_max, cpu) > 0); 7772 7773 return true; 7774 } 7775 7776 /* 7777 * Try and locate an idle core/thread in the LLC cache domain. 7778 */ 7779 static int select_idle_sibling(struct task_struct *p, int prev, int target) 7780 { 7781 bool has_idle_core = false; 7782 struct sched_domain *sd; 7783 unsigned long task_util, util_min, util_max; 7784 int i, recent_used_cpu, prev_aff = -1; 7785 7786 /* 7787 * On asymmetric system, update task utilization because we will check 7788 * that the task fits with CPU's capacity. 7789 */ 7790 if (sched_asym_cpucap_active()) { 7791 sync_entity_load_avg(&p->se); 7792 task_util = task_util_est(p); 7793 util_min = uclamp_eff_value(p, UCLAMP_MIN); 7794 util_max = uclamp_eff_value(p, UCLAMP_MAX); 7795 } 7796 7797 /* 7798 * per-cpu select_rq_mask usage 7799 */ 7800 lockdep_assert_irqs_disabled(); 7801 7802 if ((available_idle_cpu(target) || sched_idle_cpu(target)) && 7803 asym_fits_cpu(task_util, util_min, util_max, target)) 7804 return target; 7805 7806 /* 7807 * If the previous CPU is cache affine and idle, don't be stupid: 7808 */ 7809 if (prev != target && cpus_share_cache(prev, target) && 7810 (available_idle_cpu(prev) || sched_idle_cpu(prev)) && 7811 asym_fits_cpu(task_util, util_min, util_max, prev)) { 7812 7813 if (!static_branch_unlikely(&sched_cluster_active) || 7814 cpus_share_resources(prev, target)) 7815 return prev; 7816 7817 prev_aff = prev; 7818 } 7819 7820 /* 7821 * Allow a per-cpu kthread to stack with the wakee if the 7822 * kworker thread and the tasks previous CPUs are the same. 7823 * The assumption is that the wakee queued work for the 7824 * per-cpu kthread that is now complete and the wakeup is 7825 * essentially a sync wakeup. An obvious example of this 7826 * pattern is IO completions. 
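 *
 * For instance, a task that queues work for a per-CPU kworker and then
 * sleeps is typically woken by that same kworker once the work item
 * completes; keeping the wakee on this CPU is cheap because the waker
 * is about to sleep again. The shortcut below is only taken when
 * nothing else is runnable here (nr_running <= 1) and the CPU can
 * accommodate the task's capacity/uclamp requirements.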
7827 */ 7828 if (is_per_cpu_kthread(current) && 7829 in_task() && 7830 prev == smp_processor_id() && 7831 this_rq()->nr_running <= 1 && 7832 asym_fits_cpu(task_util, util_min, util_max, prev)) { 7833 return prev; 7834 } 7835 7836 /* Check a recently used CPU as a potential idle candidate: */ 7837 recent_used_cpu = p->recent_used_cpu; 7838 p->recent_used_cpu = prev; 7839 if (recent_used_cpu != prev && 7840 recent_used_cpu != target && 7841 cpus_share_cache(recent_used_cpu, target) && 7842 (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && 7843 cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && 7844 asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { 7845 7846 if (!static_branch_unlikely(&sched_cluster_active) || 7847 cpus_share_resources(recent_used_cpu, target)) 7848 return recent_used_cpu; 7849 7850 } else { 7851 recent_used_cpu = -1; 7852 } 7853 7854 /* 7855 * For asymmetric CPU capacity systems, our domain of interest is 7856 * sd_asym_cpucapacity rather than sd_llc. 7857 */ 7858 if (sched_asym_cpucap_active()) { 7859 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); 7860 /* 7861 * On an asymmetric CPU capacity system where an exclusive 7862 * cpuset defines a symmetric island (i.e. one unique 7863 * capacity_orig value through the cpuset), the key will be set 7864 * but the CPUs within that cpuset will not have a domain with 7865 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric 7866 * capacity path. 7867 */ 7868 if (sd) { 7869 i = select_idle_capacity(p, sd, target); 7870 return ((unsigned)i < nr_cpumask_bits) ? i : target; 7871 } 7872 } 7873 7874 sd = rcu_dereference(per_cpu(sd_llc, target)); 7875 if (!sd) 7876 return target; 7877 7878 if (sched_smt_active()) { 7879 has_idle_core = test_idle_cores(target); 7880 7881 if (!has_idle_core && cpus_share_cache(prev, target)) { 7882 i = select_idle_smt(p, sd, prev); 7883 if ((unsigned int)i < nr_cpumask_bits) 7884 return i; 7885 } 7886 } 7887 7888 i = select_idle_cpu(p, sd, has_idle_core, target); 7889 if ((unsigned)i < nr_cpumask_bits) 7890 return i; 7891 7892 /* 7893 * For cluster machines which have lower sharing cache like L2 or 7894 * LLC Tag, we tend to find an idle CPU in the target's cluster 7895 * first. But prev_cpu or recent_used_cpu may also be a good candidate, 7896 * use them if possible when no idle CPU found in select_idle_cpu(). 7897 */ 7898 if ((unsigned int)prev_aff < nr_cpumask_bits) 7899 return prev_aff; 7900 if ((unsigned int)recent_used_cpu < nr_cpumask_bits) 7901 return recent_used_cpu; 7902 7903 return target; 7904 } 7905 7906 /** 7907 * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks. 7908 * @cpu: the CPU to get the utilization for 7909 * @p: task for which the CPU utilization should be predicted or NULL 7910 * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL 7911 * @boost: 1 to enable boosting, otherwise 0 7912 * 7913 * The unit of the return value must be the same as the one of CPU capacity 7914 * so that CPU utilization can be compared with CPU capacity. 7915 * 7916 * CPU utilization is the sum of running time of runnable tasks plus the 7917 * recent utilization of currently non-runnable tasks on that CPU. 7918 * It represents the amount of CPU capacity currently used by CFS tasks in 7919 * the range [0..max CPU capacity] with max CPU capacity being the CPU 7920 * capacity at f_max. 
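 *
 * (Illustrative scale only: with arch_scale_cpu_capacity() == 1024, a
 *  return value of 512 means the CFS tasks on that CPU currently use
 *  about half of its compute capacity at f_max.)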
7921 * 7922 * The estimated CPU utilization is defined as the maximum between CPU 7923 * utilization and sum of the estimated utilization of the currently 7924 * runnable tasks on that CPU. It preserves a utilization "snapshot" of 7925 * previously-executed tasks, which helps better deduce how busy a CPU will 7926 * be when a long-sleeping task wakes up. The contribution to CPU utilization 7927 * of such a task would be significantly decayed at this point of time. 7928 * 7929 * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization). 7930 * CPU contention for CFS tasks can be detected by CPU runnable > CPU 7931 * utilization. Boosting is implemented in cpu_util() so that internal 7932 * users (e.g. EAS) can use it next to external users (e.g. schedutil), 7933 * latter via cpu_util_cfs_boost(). 7934 * 7935 * CPU utilization can be higher than the current CPU capacity 7936 * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because 7937 * of rounding errors as well as task migrations or wakeups of new tasks. 7938 * CPU utilization has to be capped to fit into the [0..max CPU capacity] 7939 * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) 7940 * could be seen as over-utilized even though CPU1 has 20% of spare CPU 7941 * capacity. CPU utilization is allowed to overshoot current CPU capacity 7942 * though since this is useful for predicting the CPU capacity required 7943 * after task migrations (scheduler-driven DVFS). 7944 * 7945 * Return: (Boosted) (estimated) utilization for the specified CPU. 7946 */ 7947 static unsigned long 7948 cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) 7949 { 7950 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; 7951 unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); 7952 unsigned long runnable; 7953 7954 if (boost) { 7955 runnable = READ_ONCE(cfs_rq->avg.runnable_avg); 7956 util = max(util, runnable); 7957 } 7958 7959 /* 7960 * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its 7961 * contribution. If @p migrates from another CPU to @cpu add its 7962 * contribution. In all the other cases @cpu is not impacted by the 7963 * migration so its util_avg is already correct. 7964 */ 7965 if (p && task_cpu(p) == cpu && dst_cpu != cpu) 7966 lsub_positive(&util, task_util(p)); 7967 else if (p && task_cpu(p) != cpu && dst_cpu == cpu) 7968 util += task_util(p); 7969 7970 if (sched_feat(UTIL_EST)) { 7971 unsigned long util_est; 7972 7973 util_est = READ_ONCE(cfs_rq->avg.util_est); 7974 7975 /* 7976 * During wake-up @p isn't enqueued yet and doesn't contribute 7977 * to any cpu_rq(cpu)->cfs.avg.util_est. 7978 * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p 7979 * has been enqueued. 7980 * 7981 * During exec (@dst_cpu = -1) @p is enqueued and does 7982 * contribute to cpu_rq(cpu)->cfs.util_est. 7983 * Remove it to "simulate" cpu_util without @p's contribution. 7984 * 7985 * Despite the task_on_rq_queued(@p) check there is still a 7986 * small window for a possible race when an exec 7987 * select_task_rq_fair() races with LB's detach_task(). 7988 * 7989 * detach_task() 7990 * deactivate_task() 7991 * p->on_rq = TASK_ON_RQ_MIGRATING; 7992 * -------------------------------- A 7993 * dequeue_task() \ 7994 * dequeue_task_fair() + Race Time 7995 * util_est_dequeue() / 7996 * -------------------------------- B 7997 * 7998 * The additional check "current == p" is required to further 7999 * reduce the race window. 
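 *
 * Putting the pieces together, a rough example (illustrative numbers,
 * UTIL_EST enabled): predicting a wakeup of @p (util_avg ~300,
 * util_est ~300) onto @cpu (@dst_cpu == @cpu) while @p last ran on a
 * different CPU, with cfs_rq util_avg 200 and util_est 150, yields
 * util = 200 + 300 = 500 and util_est = 150 + 300 = 450, so cpu_util()
 * returns min(max(500, 450), capacity) = 500, provided the CPU's
 * capacity is at least 500.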
8000 */ 8001 if (dst_cpu == cpu) 8002 util_est += _task_util_est(p); 8003 else if (p && unlikely(task_on_rq_queued(p) || current == p)) 8004 lsub_positive(&util_est, _task_util_est(p)); 8005 8006 util = max(util, util_est); 8007 } 8008 8009 return min(util, arch_scale_cpu_capacity(cpu)); 8010 } 8011 8012 unsigned long cpu_util_cfs(int cpu) 8013 { 8014 return cpu_util(cpu, NULL, -1, 0); 8015 } 8016 8017 unsigned long cpu_util_cfs_boost(int cpu) 8018 { 8019 return cpu_util(cpu, NULL, -1, 1); 8020 } 8021 8022 /* 8023 * cpu_util_without: compute cpu utilization without any contributions from *p 8024 * @cpu: the CPU which utilization is requested 8025 * @p: the task which utilization should be discounted 8026 * 8027 * The utilization of a CPU is defined by the utilization of tasks currently 8028 * enqueued on that CPU as well as tasks which are currently sleeping after an 8029 * execution on that CPU. 8030 * 8031 * This method returns the utilization of the specified CPU by discounting the 8032 * utilization of the specified task, whenever the task is currently 8033 * contributing to the CPU utilization. 8034 */ 8035 static unsigned long cpu_util_without(int cpu, struct task_struct *p) 8036 { 8037 /* Task has no contribution or is new */ 8038 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) 8039 p = NULL; 8040 8041 return cpu_util(cpu, p, -1, 0); 8042 } 8043 8044 /* 8045 * This function computes an effective utilization for the given CPU, to be 8046 * used for frequency selection given the linear relation: f = u * f_max. 8047 * 8048 * The scheduler tracks the following metrics: 8049 * 8050 * cpu_util_{cfs,rt,dl,irq}() 8051 * cpu_bw_dl() 8052 * 8053 * Where the cfs,rt and dl util numbers are tracked with the same metric and 8054 * synchronized windows and are thus directly comparable. 8055 * 8056 * The cfs,rt,dl utilization are the running times measured with rq->clock_task 8057 * which excludes things like IRQ and steal-time. These latter are then accrued 8058 * in the IRQ utilization. 8059 * 8060 * The DL bandwidth number OTOH is not a measured metric but a value computed 8061 * based on the task model parameters and gives the minimal utilization 8062 * required to meet deadlines. 8063 */ 8064 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, 8065 unsigned long *min, 8066 unsigned long *max) 8067 { 8068 unsigned long util, irq, scale; 8069 struct rq *rq = cpu_rq(cpu); 8070 8071 scale = arch_scale_cpu_capacity(cpu); 8072 8073 /* 8074 * Early check to see if IRQ/steal time saturates the CPU, can be 8075 * because of inaccuracies in how we track these -- see 8076 * update_irq_load_avg(). 8077 */ 8078 irq = cpu_util_irq(rq); 8079 if (unlikely(irq >= scale)) { 8080 if (min) 8081 *min = scale; 8082 if (max) 8083 *max = scale; 8084 return scale; 8085 } 8086 8087 if (min) { 8088 /* 8089 * The minimum utilization returns the highest level between: 8090 * - the computed DL bandwidth needed with the IRQ pressure which 8091 * steals time to the deadline task. 8092 * - The minimum performance requirement for CFS and/or RT. 8093 */ 8094 *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); 8095 8096 /* 8097 * When an RT task is runnable and uclamp is not used, we must 8098 * ensure that the task will run at maximum compute capacity. 
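 *
 * As an illustration (made-up numbers): with IRQ pressure 50, a DL
 * bandwidth of 100 and an rq-wide UCLAMP_MIN of 256, the minimum hint
 * is max(50 + 100, 256) = 256; whereas with a runnable RT task and no
 * uclamp in use at all, the hint is raised to the full capacity.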
8099 */ 8100 if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) 8101 *min = max(*min, scale); 8102 } 8103 8104 /* 8105 * Because the time spend on RT/DL tasks is visible as 'lost' time to 8106 * CFS tasks and we use the same metric to track the effective 8107 * utilization (PELT windows are synchronized) we can directly add them 8108 * to obtain the CPU's actual utilization. 8109 */ 8110 util = util_cfs + cpu_util_rt(rq); 8111 util += cpu_util_dl(rq); 8112 8113 /* 8114 * The maximum hint is a soft bandwidth requirement, which can be lower 8115 * than the actual utilization because of uclamp_max requirements. 8116 */ 8117 if (max) 8118 *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); 8119 8120 if (util >= scale) 8121 return scale; 8122 8123 /* 8124 * There is still idle time; further improve the number by using the 8125 * IRQ metric. Because IRQ/steal time is hidden from the task clock we 8126 * need to scale the task numbers: 8127 * 8128 * max - irq 8129 * U' = irq + --------- * U 8130 * max 8131 */ 8132 util = scale_irq_capacity(util, irq, scale); 8133 util += irq; 8134 8135 return min(scale, util); 8136 } 8137 8138 unsigned long sched_cpu_util(int cpu) 8139 { 8140 return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); 8141 } 8142 8143 /* 8144 * energy_env - Utilization landscape for energy estimation. 8145 * @task_busy_time: Utilization contribution by the task for which we test the 8146 * placement. Given by eenv_task_busy_time(). 8147 * @pd_busy_time: Utilization of the whole perf domain without the task 8148 * contribution. Given by eenv_pd_busy_time(). 8149 * @cpu_cap: Maximum CPU capacity for the perf domain. 8150 * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap). 8151 */ 8152 struct energy_env { 8153 unsigned long task_busy_time; 8154 unsigned long pd_busy_time; 8155 unsigned long cpu_cap; 8156 unsigned long pd_cap; 8157 }; 8158 8159 /* 8160 * Compute the task busy time for compute_energy(). This time cannot be 8161 * injected directly into effective_cpu_util() because of the IRQ scaling. 8162 * The latter only makes sense with the most recent CPUs where the task has 8163 * run. 8164 */ 8165 static inline void eenv_task_busy_time(struct energy_env *eenv, 8166 struct task_struct *p, int prev_cpu) 8167 { 8168 unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu); 8169 unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu)); 8170 8171 if (unlikely(irq >= max_cap)) 8172 busy_time = max_cap; 8173 else 8174 busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap); 8175 8176 eenv->task_busy_time = busy_time; 8177 } 8178 8179 /* 8180 * Compute the perf_domain (PD) busy time for compute_energy(). Based on the 8181 * utilization for each @pd_cpus, it however doesn't take into account 8182 * clamping since the ratio (utilization / cpu_capacity) is already enough to 8183 * scale the EM reported power consumption at the (eventually clamped) 8184 * cpu_capacity. 8185 * 8186 * The contribution of the task @p for which we want to estimate the 8187 * energy cost is removed (by cpu_util()) and must be calculated 8188 * separately (see eenv_task_busy_time). This ensures: 8189 * 8190 * - A stable PD utilization, no matter which CPU of that PD we want to place 8191 * the task on. 8192 * 8193 * - A fair comparison between CPUs as the task contribution (task_util()) 8194 * will always be the same no matter which CPU utilization we rely on 8195 * (util_avg or util_est). 8196 * 8197 * Set @eenv busy time for the PD that spans @pd_cpus. 
This busy time can't 8198 * exceed @eenv->pd_cap. 8199 */ 8200 static inline void eenv_pd_busy_time(struct energy_env *eenv, 8201 struct cpumask *pd_cpus, 8202 struct task_struct *p) 8203 { 8204 unsigned long busy_time = 0; 8205 int cpu; 8206 8207 for_each_cpu(cpu, pd_cpus) { 8208 unsigned long util = cpu_util(cpu, p, -1, 0); 8209 8210 busy_time += effective_cpu_util(cpu, util, NULL, NULL); 8211 } 8212 8213 eenv->pd_busy_time = min(eenv->pd_cap, busy_time); 8214 } 8215 8216 /* 8217 * Compute the maximum utilization for compute_energy() when the task @p 8218 * is placed on the cpu @dst_cpu. 8219 * 8220 * Returns the maximum utilization among @eenv->cpus. This utilization can't 8221 * exceed @eenv->cpu_cap. 8222 */ 8223 static inline unsigned long 8224 eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, 8225 struct task_struct *p, int dst_cpu) 8226 { 8227 unsigned long max_util = 0; 8228 int cpu; 8229 8230 for_each_cpu(cpu, pd_cpus) { 8231 struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; 8232 unsigned long util = cpu_util(cpu, p, dst_cpu, 1); 8233 unsigned long eff_util, min, max; 8234 8235 /* 8236 * Performance domain frequency: utilization clamping 8237 * must be considered since it affects the selection 8238 * of the performance domain frequency. 8239 * NOTE: in case RT tasks are running, by default the min 8240 * utilization can be max OPP. 8241 */ 8242 eff_util = effective_cpu_util(cpu, util, &min, &max); 8243 8244 /* Task's uclamp can modify min and max value */ 8245 if (tsk && uclamp_is_used()) { 8246 min = max(min, uclamp_eff_value(p, UCLAMP_MIN)); 8247 8248 /* 8249 * If there is no active max uclamp constraint, 8250 * directly use task's one, otherwise keep max. 8251 */ 8252 if (uclamp_rq_is_idle(cpu_rq(cpu))) 8253 max = uclamp_eff_value(p, UCLAMP_MAX); 8254 else 8255 max = max(max, uclamp_eff_value(p, UCLAMP_MAX)); 8256 } 8257 8258 eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max); 8259 max_util = max(max_util, eff_util); 8260 } 8261 8262 return min(max_util, eenv->cpu_cap); 8263 } 8264 8265 /* 8266 * compute_energy(): Use the Energy Model to estimate the energy that @pd would 8267 * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task 8268 * contribution is ignored. 8269 */ 8270 static inline unsigned long 8271 compute_energy(struct energy_env *eenv, struct perf_domain *pd, 8272 struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu) 8273 { 8274 unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu); 8275 unsigned long busy_time = eenv->pd_busy_time; 8276 unsigned long energy; 8277 8278 if (dst_cpu >= 0) 8279 busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time); 8280 8281 energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); 8282 8283 trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time); 8284 8285 return energy; 8286 } 8287 8288 /* 8289 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the 8290 * waking task. find_energy_efficient_cpu() looks for the CPU with maximum 8291 * spare capacity in each performance domain and uses it as a potential 8292 * candidate to execute the task. Then, it uses the Energy Model to figure 8293 * out which of the CPU candidates is the most energy-efficient. 8294 * 8295 * The rationale for this heuristic is as follows. In a performance domain, 8296 * all the most energy efficient CPU candidates (according to the Energy 8297 * Model) are those for which we'll request a low frequency. 
When there are 8298 * several CPUs for which the frequency request will be the same, we don't 8299 * have enough data to break the tie between them, because the Energy Model 8300 * only includes active power costs. With this model, if we assume that 8301 * frequency requests follow utilization (e.g. using schedutil), the CPU with 8302 * the maximum spare capacity in a performance domain is guaranteed to be among 8303 * the best candidates of the performance domain. 8304 * 8305 * In practice, it could be preferable from an energy standpoint to pack 8306 * small tasks on a CPU in order to let other CPUs go in deeper idle states, 8307 * but that could also hurt our chances to go cluster idle, and we have no 8308 * ways to tell with the current Energy Model if this is actually a good 8309 * idea or not. So, find_energy_efficient_cpu() basically favors 8310 * cluster-packing, and spreading inside a cluster. That should at least be 8311 * a good thing for latency, and this is consistent with the idea that most 8312 * of the energy savings of EAS come from the asymmetry of the system, and 8313 * not so much from breaking the tie between identical CPUs. That's also the 8314 * reason why EAS is enabled in the topology code only for systems where 8315 * SD_ASYM_CPUCAPACITY is set. 8316 * 8317 * NOTE: Forkees are not accepted in the energy-aware wake-up path because 8318 * they don't have any useful utilization data yet and it's not possible to 8319 * forecast their impact on energy consumption. Consequently, they will be 8320 * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out 8321 * to be energy-inefficient in some use-cases. The alternative would be to 8322 * bias new tasks towards specific types of CPUs first, or to try to infer 8323 * their util_avg from the parent task, but those heuristics could hurt 8324 * other use-cases too. So, until someone finds a better way to solve this, 8325 * let's keep things simple by re-using the existing slow path. 8326 */ 8327 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) 8328 { 8329 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); 8330 unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; 8331 unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0; 8332 unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024; 8333 struct root_domain *rd = this_rq()->rd; 8334 int cpu, best_energy_cpu, target = -1; 8335 int prev_fits = -1, best_fits = -1; 8336 unsigned long best_actual_cap = 0; 8337 unsigned long prev_actual_cap = 0; 8338 struct sched_domain *sd; 8339 struct perf_domain *pd; 8340 struct energy_env eenv; 8341 8342 rcu_read_lock(); 8343 pd = rcu_dereference(rd->pd); 8344 if (!pd) 8345 goto unlock; 8346 8347 /* 8348 * Energy-aware wake-up happens on the lowest sched_domain starting 8349 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. 
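 *
 * For example (hypothetical big.LITTLE layout): if this_cpu is a
 * little CPU and prev_cpu sits in the big cluster, the walk below
 * climbs from the lowest asymmetric-capacity domain of this_cpu up to
 * the first parent whose span also contains prev_cpu, so that both
 * CPUs are covered when the performance domains are scanned.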
8350 */ 8351 sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity)); 8352 while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 8353 sd = sd->parent; 8354 if (!sd) 8355 goto unlock; 8356 8357 target = prev_cpu; 8358 8359 sync_entity_load_avg(&p->se); 8360 if (!task_util_est(p) && p_util_min == 0) 8361 goto unlock; 8362 8363 eenv_task_busy_time(&eenv, p, prev_cpu); 8364 8365 for (; pd; pd = pd->next) { 8366 unsigned long util_min = p_util_min, util_max = p_util_max; 8367 unsigned long cpu_cap, cpu_actual_cap, util; 8368 long prev_spare_cap = -1, max_spare_cap = -1; 8369 unsigned long rq_util_min, rq_util_max; 8370 unsigned long cur_delta, base_energy; 8371 int max_spare_cap_cpu = -1; 8372 int fits, max_fits = -1; 8373 8374 cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); 8375 8376 if (cpumask_empty(cpus)) 8377 continue; 8378 8379 /* Account external pressure for the energy estimation */ 8380 cpu = cpumask_first(cpus); 8381 cpu_actual_cap = get_actual_cpu_capacity(cpu); 8382 8383 eenv.cpu_cap = cpu_actual_cap; 8384 eenv.pd_cap = 0; 8385 8386 for_each_cpu(cpu, cpus) { 8387 struct rq *rq = cpu_rq(cpu); 8388 8389 eenv.pd_cap += cpu_actual_cap; 8390 8391 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) 8392 continue; 8393 8394 if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 8395 continue; 8396 8397 util = cpu_util(cpu, p, cpu, 0); 8398 cpu_cap = capacity_of(cpu); 8399 8400 /* 8401 * Skip CPUs that cannot satisfy the capacity request. 8402 * IOW, placing the task there would make the CPU 8403 * overutilized. Take uclamp into account to see how 8404 * much capacity we can get out of the CPU; this is 8405 * aligned with sched_cpu_util(). 8406 */ 8407 if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { 8408 /* 8409 * Open code uclamp_rq_util_with() except for 8410 * the clamp() part. I.e.: apply max aggregation 8411 * only. util_fits_cpu() logic requires to 8412 * operate on non clamped util but must use the 8413 * max-aggregated uclamp_{min, max}. 8414 */ 8415 rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN); 8416 rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX); 8417 8418 util_min = max(rq_util_min, p_util_min); 8419 util_max = max(rq_util_max, p_util_max); 8420 } 8421 8422 fits = util_fits_cpu(util, util_min, util_max, cpu); 8423 if (!fits) 8424 continue; 8425 8426 lsub_positive(&cpu_cap, util); 8427 8428 if (cpu == prev_cpu) { 8429 /* Always use prev_cpu as a candidate. */ 8430 prev_spare_cap = cpu_cap; 8431 prev_fits = fits; 8432 } else if ((fits > max_fits) || 8433 ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) { 8434 /* 8435 * Find the CPU with the maximum spare capacity 8436 * among the remaining CPUs in the performance 8437 * domain. 8438 */ 8439 max_spare_cap = cpu_cap; 8440 max_spare_cap_cpu = cpu; 8441 max_fits = fits; 8442 } 8443 } 8444 8445 if (max_spare_cap_cpu < 0 && prev_spare_cap < 0) 8446 continue; 8447 8448 eenv_pd_busy_time(&eenv, cpus, p); 8449 /* Compute the 'base' energy of the pd, without @p */ 8450 base_energy = compute_energy(&eenv, pd, cpus, p, -1); 8451 8452 /* Evaluate the energy impact of using prev_cpu. */ 8453 if (prev_spare_cap > -1) { 8454 prev_delta = compute_energy(&eenv, pd, cpus, p, 8455 prev_cpu); 8456 /* CPU utilization has changed */ 8457 if (prev_delta < base_energy) 8458 goto unlock; 8459 prev_delta -= base_energy; 8460 prev_actual_cap = cpu_actual_cap; 8461 best_delta = min(best_delta, prev_delta); 8462 } 8463 8464 /* Evaluate the energy impact of using max_spare_cap_cpu. 
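 *
 * The comparison is purely relative, e.g. (made-up numbers): if the
 * PD's base energy without @p is 1000, placing @p on prev_cpu costs
 * 1300 and placing it on max_spare_cap_cpu costs 1150, then
 * prev_delta = 300 and cur_delta = 150, so max_spare_cap_cpu becomes
 * the best candidate, provided both placements fit the task's hints.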
*/ 8465 if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { 8466 /* Current best energy cpu fits better */ 8467 if (max_fits < best_fits) 8468 continue; 8469 8470 /* 8471 * Both don't fit performance hint (i.e. uclamp_min) 8472 * but best energy cpu has better capacity. 8473 */ 8474 if ((max_fits < 0) && 8475 (cpu_actual_cap <= best_actual_cap)) 8476 continue; 8477 8478 cur_delta = compute_energy(&eenv, pd, cpus, p, 8479 max_spare_cap_cpu); 8480 /* CPU utilization has changed */ 8481 if (cur_delta < base_energy) 8482 goto unlock; 8483 cur_delta -= base_energy; 8484 8485 /* 8486 * Both fit for the task but best energy cpu has lower 8487 * energy impact. 8488 */ 8489 if ((max_fits > 0) && (best_fits > 0) && 8490 (cur_delta >= best_delta)) 8491 continue; 8492 8493 best_delta = cur_delta; 8494 best_energy_cpu = max_spare_cap_cpu; 8495 best_fits = max_fits; 8496 best_actual_cap = cpu_actual_cap; 8497 } 8498 } 8499 rcu_read_unlock(); 8500 8501 if ((best_fits > prev_fits) || 8502 ((best_fits > 0) && (best_delta < prev_delta)) || 8503 ((best_fits < 0) && (best_actual_cap > prev_actual_cap))) 8504 target = best_energy_cpu; 8505 8506 return target; 8507 8508 unlock: 8509 rcu_read_unlock(); 8510 8511 return target; 8512 } 8513 8514 /* 8515 * select_task_rq_fair: Select target runqueue for the waking task in domains 8516 * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, 8517 * SD_BALANCE_FORK, or SD_BALANCE_EXEC. 8518 * 8519 * Balances load by selecting the idlest CPU in the idlest group, or under 8520 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. 8521 * 8522 * Returns the target CPU number. 8523 */ 8524 static int 8525 select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) 8526 { 8527 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); 8528 struct sched_domain *tmp, *sd = NULL; 8529 int cpu = smp_processor_id(); 8530 int new_cpu = prev_cpu; 8531 int want_affine = 0; 8532 /* SD_flags and WF_flags share the first nibble */ 8533 int sd_flag = wake_flags & 0xF; 8534 8535 /* 8536 * required for stable ->cpus_allowed 8537 */ 8538 lockdep_assert_held(&p->pi_lock); 8539 if (wake_flags & WF_TTWU) { 8540 record_wakee(p); 8541 8542 if ((wake_flags & WF_CURRENT_CPU) && 8543 cpumask_test_cpu(cpu, p->cpus_ptr)) 8544 return cpu; 8545 8546 if (!is_rd_overutilized(this_rq()->rd)) { 8547 new_cpu = find_energy_efficient_cpu(p, prev_cpu); 8548 if (new_cpu >= 0) 8549 return new_cpu; 8550 new_cpu = prev_cpu; 8551 } 8552 8553 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); 8554 } 8555 8556 rcu_read_lock(); 8557 for_each_domain(cpu, tmp) { 8558 /* 8559 * If both 'cpu' and 'prev_cpu' are part of this domain, 8560 * cpu is a valid SD_WAKE_AFFINE target. 8561 */ 8562 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 8563 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { 8564 if (cpu != prev_cpu) 8565 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync); 8566 8567 sd = NULL; /* Prefer wake_affine over balance flags */ 8568 break; 8569 } 8570 8571 /* 8572 * Usually only true for WF_EXEC and WF_FORK, as sched_domains 8573 * usually do not have SD_BALANCE_WAKE set. That means wakeup 8574 * will usually go to the fast path. 8575 */ 8576 if (tmp->flags & sd_flag) 8577 sd = tmp; 8578 else if (!want_affine) 8579 break; 8580 } 8581 8582 if (unlikely(sd)) { 8583 /* Slow path */ 8584 new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); 8585 } else if (wake_flags & WF_TTWU) { /* XXX always ? 
*/ 8586 /* Fast path */ 8587 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); 8588 } 8589 rcu_read_unlock(); 8590 8591 return new_cpu; 8592 } 8593 8594 /* 8595 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and 8596 * cfs_rq_of(p) references at time of call are still valid and identify the 8597 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. 8598 */ 8599 static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) 8600 { 8601 struct sched_entity *se = &p->se; 8602 8603 if (!task_on_rq_migrating(p)) { 8604 remove_entity_load_avg(se); 8605 8606 /* 8607 * Here, the task's PELT values have been updated according to 8608 * the current rq's clock. But if that clock hasn't been 8609 * updated in a while, a substantial idle time will be missed, 8610 * leading to an inflation after wake-up on the new rq. 8611 * 8612 * Estimate the missing time from the cfs_rq last_update_time 8613 * and update sched_avg to improve the PELT continuity after 8614 * migration. 8615 */ 8616 migrate_se_pelt_lag(se); 8617 } 8618 8619 /* Tell new CPU we are migrated */ 8620 se->avg.last_update_time = 0; 8621 8622 update_scan_period(p, new_cpu); 8623 } 8624 8625 static void task_dead_fair(struct task_struct *p) 8626 { 8627 struct sched_entity *se = &p->se; 8628 8629 if (se->sched_delayed) { 8630 struct rq_flags rf; 8631 struct rq *rq; 8632 8633 rq = task_rq_lock(p, &rf); 8634 if (se->sched_delayed) { 8635 update_rq_clock(rq); 8636 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 8637 } 8638 task_rq_unlock(rq, p, &rf); 8639 } 8640 8641 remove_entity_load_avg(se); 8642 } 8643 8644 /* 8645 * Set the max capacity the task is allowed to run at for misfit detection. 8646 */ 8647 static void set_task_max_allowed_capacity(struct task_struct *p) 8648 { 8649 struct asym_cap_data *entry; 8650 8651 if (!sched_asym_cpucap_active()) 8652 return; 8653 8654 rcu_read_lock(); 8655 list_for_each_entry_rcu(entry, &asym_cap_list, link) { 8656 cpumask_t *cpumask; 8657 8658 cpumask = cpu_capacity_span(entry); 8659 if (!cpumask_intersects(p->cpus_ptr, cpumask)) 8660 continue; 8661 8662 p->max_allowed_capacity = entry->capacity; 8663 break; 8664 } 8665 rcu_read_unlock(); 8666 } 8667 8668 static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx) 8669 { 8670 set_cpus_allowed_common(p, ctx); 8671 set_task_max_allowed_capacity(p); 8672 } 8673 8674 static int 8675 balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 8676 { 8677 if (sched_fair_runnable(rq)) 8678 return 1; 8679 8680 return sched_balance_newidle(rq, rf) != 0; 8681 } 8682 #else 8683 static inline void set_task_max_allowed_capacity(struct task_struct *p) {} 8684 #endif /* CONFIG_SMP */ 8685 8686 static void set_next_buddy(struct sched_entity *se) 8687 { 8688 for_each_sched_entity(se) { 8689 if (SCHED_WARN_ON(!se->on_rq)) 8690 return; 8691 if (se_is_idle(se)) 8692 return; 8693 cfs_rq_of(se)->next = se; 8694 } 8695 } 8696 8697 /* 8698 * Preempt the current task with a newly woken task if needed: 8699 */ 8700 static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) 8701 { 8702 struct task_struct *donor = rq->donor; 8703 struct sched_entity *se = &donor->se, *pse = &p->se; 8704 struct cfs_rq *cfs_rq = task_cfs_rq(donor); 8705 int cse_is_idle, pse_is_idle; 8706 8707 if (unlikely(se == pse)) 8708 return; 8709 8710 /* 8711 * This is possible from callers such as attach_tasks(), in which we 8712 * unconditionally wakeup_preempt() after 
an enqueue (which may have 8713 * lead to a throttle). This both saves work and prevents false 8714 * next-buddy nomination below. 8715 */ 8716 if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) 8717 return; 8718 8719 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { 8720 set_next_buddy(pse); 8721 } 8722 8723 /* 8724 * We can come here with TIF_NEED_RESCHED already set from new task 8725 * wake up path. 8726 * 8727 * Note: this also catches the edge-case of curr being in a throttled 8728 * group (e.g. via set_curr_task), since update_curr() (in the 8729 * enqueue of curr) will have resulted in resched being set. This 8730 * prevents us from potentially nominating it as a false LAST_BUDDY 8731 * below. 8732 */ 8733 if (test_tsk_need_resched(rq->curr)) 8734 return; 8735 8736 if (!sched_feat(WAKEUP_PREEMPTION)) 8737 return; 8738 8739 find_matching_se(&se, &pse); 8740 WARN_ON_ONCE(!pse); 8741 8742 cse_is_idle = se_is_idle(se); 8743 pse_is_idle = se_is_idle(pse); 8744 8745 /* 8746 * Preempt an idle entity in favor of a non-idle entity (and don't preempt 8747 * in the inverse case). 8748 */ 8749 if (cse_is_idle && !pse_is_idle) 8750 goto preempt; 8751 if (cse_is_idle != pse_is_idle) 8752 return; 8753 8754 /* 8755 * BATCH and IDLE tasks do not preempt others. 8756 */ 8757 if (unlikely(!normal_policy(p->policy))) 8758 return; 8759 8760 cfs_rq = cfs_rq_of(se); 8761 update_curr(cfs_rq); 8762 /* 8763 * If @p has a shorter slice than current and @p is eligible, override 8764 * current's slice protection in order to allow preemption. 8765 * 8766 * Note that even if @p does not turn out to be the most eligible 8767 * task at this moment, current's slice protection will be lost. 8768 */ 8769 if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) 8770 se->vlag = se->deadline + 1; 8771 8772 /* 8773 * If @p has become the most eligible task, force preemption. 8774 */ 8775 if (pick_eevdf(cfs_rq) == pse) 8776 goto preempt; 8777 8778 return; 8779 8780 preempt: 8781 resched_curr_lazy(rq); 8782 } 8783 8784 static struct task_struct *pick_task_fair(struct rq *rq) 8785 { 8786 struct sched_entity *se; 8787 struct cfs_rq *cfs_rq; 8788 8789 again: 8790 cfs_rq = &rq->cfs; 8791 if (!cfs_rq->nr_running) 8792 return NULL; 8793 8794 do { 8795 /* Might not have done put_prev_entity() */ 8796 if (cfs_rq->curr && cfs_rq->curr->on_rq) 8797 update_curr(cfs_rq); 8798 8799 if (unlikely(check_cfs_rq_runtime(cfs_rq))) 8800 goto again; 8801 8802 se = pick_next_entity(rq, cfs_rq); 8803 if (!se) 8804 goto again; 8805 cfs_rq = group_cfs_rq(se); 8806 } while (cfs_rq); 8807 8808 return task_of(se); 8809 } 8810 8811 static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); 8812 static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); 8813 8814 struct task_struct * 8815 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 8816 { 8817 struct sched_entity *se; 8818 struct task_struct *p; 8819 int new_tasks; 8820 8821 again: 8822 p = pick_task_fair(rq); 8823 if (!p) 8824 goto idle; 8825 se = &p->se; 8826 8827 #ifdef CONFIG_FAIR_GROUP_SCHED 8828 if (prev->sched_class != &fair_sched_class) 8829 goto simple; 8830 8831 __put_prev_set_next_dl_server(rq, prev, p); 8832 8833 /* 8834 * Because of the set_next_buddy() in dequeue_task_fair() it is rather 8835 * likely that a next task is from the same cgroup as the current. 
8836 * 8837 * Therefore attempt to avoid putting and setting the entire cgroup 8838 * hierarchy, only change the part that actually changes. 8839 * 8840 * Since we haven't yet done put_prev_entity and if the selected task 8841 * is a different task than we started out with, try and touch the 8842 * least amount of cfs_rqs. 8843 */ 8844 if (prev != p) { 8845 struct sched_entity *pse = &prev->se; 8846 struct cfs_rq *cfs_rq; 8847 8848 while (!(cfs_rq = is_same_group(se, pse))) { 8849 int se_depth = se->depth; 8850 int pse_depth = pse->depth; 8851 8852 if (se_depth <= pse_depth) { 8853 put_prev_entity(cfs_rq_of(pse), pse); 8854 pse = parent_entity(pse); 8855 } 8856 if (se_depth >= pse_depth) { 8857 set_next_entity(cfs_rq_of(se), se); 8858 se = parent_entity(se); 8859 } 8860 } 8861 8862 put_prev_entity(cfs_rq, pse); 8863 set_next_entity(cfs_rq, se); 8864 8865 __set_next_task_fair(rq, p, true); 8866 } 8867 8868 return p; 8869 8870 simple: 8871 #endif 8872 put_prev_set_next_task(rq, prev, p); 8873 return p; 8874 8875 idle: 8876 if (!rf) 8877 return NULL; 8878 8879 new_tasks = sched_balance_newidle(rq, rf); 8880 8881 /* 8882 * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is 8883 * possible for any higher priority task to appear. In that case we 8884 * must re-start the pick_next_entity() loop. 8885 */ 8886 if (new_tasks < 0) 8887 return RETRY_TASK; 8888 8889 if (new_tasks > 0) 8890 goto again; 8891 8892 /* 8893 * rq is about to be idle, check if we need to update the 8894 * lost_idle_time of clock_pelt 8895 */ 8896 update_idle_rq_clock_pelt(rq); 8897 8898 return NULL; 8899 } 8900 8901 static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) 8902 { 8903 return pick_next_task_fair(rq, prev, NULL); 8904 } 8905 8906 static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) 8907 { 8908 return !!dl_se->rq->cfs.nr_running; 8909 } 8910 8911 static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) 8912 { 8913 return pick_task_fair(dl_se->rq); 8914 } 8915 8916 void fair_server_init(struct rq *rq) 8917 { 8918 struct sched_dl_entity *dl_se = &rq->fair_server; 8919 8920 init_dl_entity(dl_se); 8921 8922 dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); 8923 } 8924 8925 /* 8926 * Account for a descheduled task: 8927 */ 8928 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next) 8929 { 8930 struct sched_entity *se = &prev->se; 8931 struct cfs_rq *cfs_rq; 8932 8933 for_each_sched_entity(se) { 8934 cfs_rq = cfs_rq_of(se); 8935 put_prev_entity(cfs_rq, se); 8936 } 8937 } 8938 8939 /* 8940 * sched_yield() is very simple 8941 */ 8942 static void yield_task_fair(struct rq *rq) 8943 { 8944 struct task_struct *curr = rq->curr; 8945 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 8946 struct sched_entity *se = &curr->se; 8947 8948 /* 8949 * Are we the only task in the tree? 8950 */ 8951 if (unlikely(rq->nr_running == 1)) 8952 return; 8953 8954 clear_buddies(cfs_rq, se); 8955 8956 update_rq_clock(rq); 8957 /* 8958 * Update run-time statistics of the 'current'. 8959 */ 8960 update_curr(cfs_rq); 8961 /* 8962 * Tell update_rq_clock() that we've just updated, 8963 * so we don't do microscopic update in schedule() 8964 * and double the fastpath cost. 
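 *
 * The actual "yield" below is nothing more than advancing the entity's
 * deadline by the virtual-time equivalent of one slice: under EEVDF the
 * entity then stops being the preferred pick while other eligible
 * entities are queued, without ever being dequeued.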
8965 */ 8966 rq_clock_skip_update(rq); 8967 8968 se->deadline += calc_delta_fair(se->slice, se); 8969 } 8970 8971 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) 8972 { 8973 struct sched_entity *se = &p->se; 8974 8975 /* throttled hierarchies are not runnable */ 8976 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) 8977 return false; 8978 8979 /* Tell the scheduler that we'd really like se to run next. */ 8980 set_next_buddy(se); 8981 8982 yield_task_fair(rq); 8983 8984 return true; 8985 } 8986 8987 #ifdef CONFIG_SMP 8988 /************************************************** 8989 * Fair scheduling class load-balancing methods. 8990 * 8991 * BASICS 8992 * 8993 * The purpose of load-balancing is to achieve the same basic fairness the 8994 * per-CPU scheduler provides, namely provide a proportional amount of compute 8995 * time to each task. This is expressed in the following equation: 8996 * 8997 * W_i,n/P_i == W_j,n/P_j for all i,j (1) 8998 * 8999 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight 9000 * W_i,0 is defined as: 9001 * 9002 * W_i,0 = \Sum_j w_i,j (2) 9003 * 9004 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight 9005 * is derived from the nice value as per sched_prio_to_weight[]. 9006 * 9007 * The weight average is an exponential decay average of the instantaneous 9008 * weight: 9009 * 9010 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) 9011 * 9012 * C_i is the compute capacity of CPU i, typically it is the 9013 * fraction of 'recent' time available for SCHED_OTHER task execution. But it 9014 * can also include other factors [XXX]. 9015 * 9016 * To achieve this balance we define a measure of imbalance which follows 9017 * directly from (1): 9018 * 9019 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4) 9020 * 9021 * We them move tasks around to minimize the imbalance. In the continuous 9022 * function space it is obvious this converges, in the discrete case we get 9023 * a few fun cases generally called infeasible weight scenarios. 9024 * 9025 * [XXX expand on: 9026 * - infeasible weights; 9027 * - local vs global optima in the discrete case. ] 9028 * 9029 * 9030 * SCHED DOMAINS 9031 * 9032 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) 9033 * for all i,j solution, we create a tree of CPUs that follows the hardware 9034 * topology where each level pairs two lower groups (or better). This results 9035 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the 9036 * tree to only the first of the previous level and we decrease the frequency 9037 * of load-balance at each level inversely proportional to the number of CPUs in 9038 * the groups. 9039 * 9040 * This yields: 9041 * 9042 * log_2 n 1 n 9043 * \Sum { --- * --- * 2^i } = O(n) (5) 9044 * i = 0 2^i 2^i 9045 * `- size of each group 9046 * | | `- number of CPUs doing load-balance 9047 * | `- freq 9048 * `- sum over all levels 9049 * 9050 * Coupled with a limit on how many tasks we can migrate every balance pass, 9051 * this makes (5) the runtime complexity of the balancer. 
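 *
 * For instance, with n = 8 the sum in (5) evaluates to
 * 8 + 4 + 2 + 1 = 15, i.e. a little under 2n, which is the O(n) bound
 * in concrete form.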
9052 * 9053 * An important property here is that each CPU is still (indirectly) connected 9054 * to every other CPU in at most O(log n) steps: 9055 * 9056 * The adjacency matrix of the resulting graph is given by: 9057 * 9058 * log_2 n 9059 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) 9060 * k = 0 9061 * 9062 * And you'll find that: 9063 * 9064 * A^(log_2 n)_i,j != 0 for all i,j (7) 9065 * 9066 * Showing there's indeed a path between every CPU in at most O(log n) steps. 9067 * The task movement gives a factor of O(m), giving a convergence complexity 9068 * of: 9069 * 9070 * O(nm log n), n := nr_cpus, m := nr_tasks (8) 9071 * 9072 * 9073 * WORK CONSERVING 9074 * 9075 * In order to avoid CPUs going idle while there's still work to do, new idle 9076 * balancing is more aggressive and has the newly idle CPU iterate up the domain 9077 * tree itself instead of relying on other CPUs to bring it work. 9078 * 9079 * This adds some complexity to both (5) and (8) but it reduces the total idle 9080 * time. 9081 * 9082 * [XXX more?] 9083 * 9084 * 9085 * CGROUPS 9086 * 9087 * Cgroups make a horror show out of (2), instead of a simple sum we get: 9088 * 9089 * s_k,i 9090 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9) 9091 * S_k 9092 * 9093 * Where 9094 * 9095 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) 9096 * 9097 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i. 9098 * 9099 * The big problem is S_k, its a global sum needed to compute a local (W_i) 9100 * property. 9101 * 9102 * [XXX write more on how we solve this.. _after_ merging pjt's patches that 9103 * rewrite all of this once again.] 9104 */ 9105 9106 static unsigned long __read_mostly max_load_balance_interval = HZ/10; 9107 9108 enum fbq_type { regular, remote, all }; 9109 9110 /* 9111 * 'group_type' describes the group of CPUs at the moment of load balancing. 9112 * 9113 * The enum is ordered by pulling priority, with the group with lowest priority 9114 * first so the group_type can simply be compared when selecting the busiest 9115 * group. See update_sd_pick_busiest(). 9116 */ 9117 enum group_type { 9118 /* The group has spare capacity that can be used to run more tasks. */ 9119 group_has_spare = 0, 9120 /* 9121 * The group is fully used and the tasks don't compete for more CPU 9122 * cycles. Nevertheless, some tasks might wait before running. 9123 */ 9124 group_fully_busy, 9125 /* 9126 * One task doesn't fit with CPU's capacity and must be migrated to a 9127 * more powerful CPU. 9128 */ 9129 group_misfit_task, 9130 /* 9131 * Balance SMT group that's fully busy. Can benefit from migration 9132 * a task on SMT with busy sibling to another CPU on idle core. 9133 */ 9134 group_smt_balance, 9135 /* 9136 * SD_ASYM_PACKING only: One local CPU with higher capacity is available, 9137 * and the task should be migrated to it instead of running on the 9138 * current CPU. 9139 */ 9140 group_asym_packing, 9141 /* 9142 * The tasks' affinity constraints previously prevented the scheduler 9143 * from balancing the load across the system. 9144 */ 9145 group_imbalanced, 9146 /* 9147 * The CPU is overloaded and can't provide expected CPU cycles to all 9148 * tasks. 
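 * Being last in the enum, this group type also takes precedence over
 * every other one when picking the busiest group in
 * update_sd_pick_busiest(), i.e. an overloaded group is the preferred
 * source of a pull.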
9149 */ 9150 group_overloaded 9151 }; 9152 9153 enum migration_type { 9154 migrate_load = 0, 9155 migrate_util, 9156 migrate_task, 9157 migrate_misfit 9158 }; 9159 9160 #define LBF_ALL_PINNED 0x01 9161 #define LBF_NEED_BREAK 0x02 9162 #define LBF_DST_PINNED 0x04 9163 #define LBF_SOME_PINNED 0x08 9164 #define LBF_ACTIVE_LB 0x10 9165 9166 struct lb_env { 9167 struct sched_domain *sd; 9168 9169 struct rq *src_rq; 9170 int src_cpu; 9171 9172 int dst_cpu; 9173 struct rq *dst_rq; 9174 9175 struct cpumask *dst_grpmask; 9176 int new_dst_cpu; 9177 enum cpu_idle_type idle; 9178 long imbalance; 9179 /* The set of CPUs under consideration for load-balancing */ 9180 struct cpumask *cpus; 9181 9182 unsigned int flags; 9183 9184 unsigned int loop; 9185 unsigned int loop_break; 9186 unsigned int loop_max; 9187 9188 enum fbq_type fbq_type; 9189 enum migration_type migration_type; 9190 struct list_head tasks; 9191 }; 9192 9193 /* 9194 * Is this task likely cache-hot: 9195 */ 9196 static int task_hot(struct task_struct *p, struct lb_env *env) 9197 { 9198 s64 delta; 9199 9200 lockdep_assert_rq_held(env->src_rq); 9201 9202 if (p->sched_class != &fair_sched_class) 9203 return 0; 9204 9205 if (unlikely(task_has_idle_policy(p))) 9206 return 0; 9207 9208 /* SMT siblings share cache */ 9209 if (env->sd->flags & SD_SHARE_CPUCAPACITY) 9210 return 0; 9211 9212 /* 9213 * Buddy candidates are cache hot: 9214 */ 9215 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && 9216 (&p->se == cfs_rq_of(&p->se)->next)) 9217 return 1; 9218 9219 if (sysctl_sched_migration_cost == -1) 9220 return 1; 9221 9222 /* 9223 * Don't migrate task if the task's cookie does not match 9224 * with the destination CPU's core cookie. 9225 */ 9226 if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p)) 9227 return 1; 9228 9229 if (sysctl_sched_migration_cost == 0) 9230 return 0; 9231 9232 delta = rq_clock_task(env->src_rq) - p->se.exec_start; 9233 9234 return delta < (s64)sysctl_sched_migration_cost; 9235 } 9236 9237 #ifdef CONFIG_NUMA_BALANCING 9238 /* 9239 * Returns 1, if task migration degrades locality 9240 * Returns 0, if task migration improves locality i.e migration preferred. 9241 * Returns -1, if task migration is not affected by locality. 9242 */ 9243 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 9244 { 9245 struct numa_group *numa_group = rcu_dereference(p->numa_group); 9246 unsigned long src_weight, dst_weight; 9247 int src_nid, dst_nid, dist; 9248 9249 if (!static_branch_likely(&sched_numa_balancing)) 9250 return -1; 9251 9252 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 9253 return -1; 9254 9255 src_nid = cpu_to_node(env->src_cpu); 9256 dst_nid = cpu_to_node(env->dst_cpu); 9257 9258 if (src_nid == dst_nid) 9259 return -1; 9260 9261 /* Migrating away from the preferred node is always bad. */ 9262 if (src_nid == p->numa_preferred_nid) { 9263 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) 9264 return 1; 9265 else 9266 return -1; 9267 } 9268 9269 /* Encourage migration to the preferred node. */ 9270 if (dst_nid == p->numa_preferred_nid) 9271 return 0; 9272 9273 /* Leaving a core idle is often worse than degrading locality. 
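 * Hence the check just below: when the destination CPU is idle
 * (CPU_IDLE), locality is treated as a "don't care" (-1) and the
 * decision falls back to the generic cache-hotness test, task_hot(),
 * in can_migrate_task().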
*/ 9274 if (env->idle == CPU_IDLE) 9275 return -1; 9276 9277 dist = node_distance(src_nid, dst_nid); 9278 if (numa_group) { 9279 src_weight = group_weight(p, src_nid, dist); 9280 dst_weight = group_weight(p, dst_nid, dist); 9281 } else { 9282 src_weight = task_weight(p, src_nid, dist); 9283 dst_weight = task_weight(p, dst_nid, dist); 9284 } 9285 9286 return dst_weight < src_weight; 9287 } 9288 9289 #else 9290 static inline int migrate_degrades_locality(struct task_struct *p, 9291 struct lb_env *env) 9292 { 9293 return -1; 9294 } 9295 #endif 9296 9297 /* 9298 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 9299 */ 9300 static 9301 int can_migrate_task(struct task_struct *p, struct lb_env *env) 9302 { 9303 int tsk_cache_hot; 9304 9305 lockdep_assert_rq_held(env->src_rq); 9306 9307 /* 9308 * We do not migrate tasks that are: 9309 * 1) throttled_lb_pair, or 9310 * 2) cannot be migrated to this CPU due to cpus_ptr, or 9311 * 3) running (obviously), or 9312 * 4) are cache-hot on their current CPU. 9313 */ 9314 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 9315 return 0; 9316 9317 /* Disregard percpu kthreads; they are where they need to be. */ 9318 if (kthread_is_per_cpu(p)) 9319 return 0; 9320 9321 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { 9322 int cpu; 9323 9324 schedstat_inc(p->stats.nr_failed_migrations_affine); 9325 9326 env->flags |= LBF_SOME_PINNED; 9327 9328 /* 9329 * Remember if this task can be migrated to any other CPU in 9330 * our sched_group. We may want to revisit it if we couldn't 9331 * meet load balance goals by pulling other tasks on src_cpu. 9332 * 9333 * Avoid computing new_dst_cpu 9334 * - for NEWLY_IDLE 9335 * - if we have already computed one in current iteration 9336 * - if it's an active balance 9337 */ 9338 if (env->idle == CPU_NEWLY_IDLE || 9339 env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB)) 9340 return 0; 9341 9342 /* Prevent to re-select dst_cpu via env's CPUs: */ 9343 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 9344 if (cpumask_test_cpu(cpu, p->cpus_ptr)) { 9345 env->flags |= LBF_DST_PINNED; 9346 env->new_dst_cpu = cpu; 9347 break; 9348 } 9349 } 9350 9351 return 0; 9352 } 9353 9354 /* Record that we found at least one task that could run on dst_cpu */ 9355 env->flags &= ~LBF_ALL_PINNED; 9356 9357 if (task_on_cpu(env->src_rq, p)) { 9358 schedstat_inc(p->stats.nr_failed_migrations_running); 9359 return 0; 9360 } 9361 9362 /* 9363 * Aggressive migration if: 9364 * 1) active balance 9365 * 2) destination numa is preferred 9366 * 3) task is cache cold, or 9367 * 4) too many balance attempts have failed. 
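 *
 * e.g. (assuming cache_nice_tries == 1): a cache-hot task is left in
 * place while nr_balance_failed is still 0 or 1 and only becomes
 * eligible for migration once that limit is exceeded, at which point
 * the forced migration is accounted in lb_hot_gained[] and in the
 * task's nr_forced_migrations.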
9368 */ 9369 if (env->flags & LBF_ACTIVE_LB) 9370 return 1; 9371 9372 tsk_cache_hot = migrate_degrades_locality(p, env); 9373 if (tsk_cache_hot == -1) 9374 tsk_cache_hot = task_hot(p, env); 9375 9376 if (tsk_cache_hot <= 0 || 9377 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 9378 if (tsk_cache_hot == 1) { 9379 schedstat_inc(env->sd->lb_hot_gained[env->idle]); 9380 schedstat_inc(p->stats.nr_forced_migrations); 9381 } 9382 return 1; 9383 } 9384 9385 schedstat_inc(p->stats.nr_failed_migrations_hot); 9386 return 0; 9387 } 9388 9389 /* 9390 * detach_task() -- detach the task for the migration specified in env 9391 */ 9392 static void detach_task(struct task_struct *p, struct lb_env *env) 9393 { 9394 lockdep_assert_rq_held(env->src_rq); 9395 9396 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); 9397 set_task_cpu(p, env->dst_cpu); 9398 } 9399 9400 /* 9401 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as 9402 * part of active balancing operations within "domain". 9403 * 9404 * Returns a task if successful and NULL otherwise. 9405 */ 9406 static struct task_struct *detach_one_task(struct lb_env *env) 9407 { 9408 struct task_struct *p; 9409 9410 lockdep_assert_rq_held(env->src_rq); 9411 9412 list_for_each_entry_reverse(p, 9413 &env->src_rq->cfs_tasks, se.group_node) { 9414 if (!can_migrate_task(p, env)) 9415 continue; 9416 9417 detach_task(p, env); 9418 9419 /* 9420 * Right now, this is only the second place where 9421 * lb_gained[env->idle] is updated (other is detach_tasks) 9422 * so we can safely collect stats here rather than 9423 * inside detach_tasks(). 9424 */ 9425 schedstat_inc(env->sd->lb_gained[env->idle]); 9426 return p; 9427 } 9428 return NULL; 9429 } 9430 9431 /* 9432 * detach_tasks() -- tries to detach up to imbalance load/util/tasks from 9433 * busiest_rq, as part of a balancing operation within domain "sd". 9434 * 9435 * Returns number of detached tasks if successful and 0 otherwise. 9436 */ 9437 static int detach_tasks(struct lb_env *env) 9438 { 9439 struct list_head *tasks = &env->src_rq->cfs_tasks; 9440 unsigned long util, load; 9441 struct task_struct *p; 9442 int detached = 0; 9443 9444 lockdep_assert_rq_held(env->src_rq); 9445 9446 /* 9447 * Source run queue has been emptied by another CPU, clear 9448 * LBF_ALL_PINNED flag as we will not test any task. 9449 */ 9450 if (env->src_rq->nr_running <= 1) { 9451 env->flags &= ~LBF_ALL_PINNED; 9452 return 0; 9453 } 9454 9455 if (env->imbalance <= 0) 9456 return 0; 9457 9458 while (!list_empty(tasks)) { 9459 /* 9460 * We don't want to steal all, otherwise we may be treated likewise, 9461 * which could at worst lead to a livelock crash. 9462 */ 9463 if (env->idle && env->src_rq->nr_running <= 1) 9464 break; 9465 9466 env->loop++; 9467 /* We've more or less seen every task there is, call it quits */ 9468 if (env->loop > env->loop_max) 9469 break; 9470 9471 /* take a breather every nr_migrate tasks */ 9472 if (env->loop > env->loop_break) { 9473 env->loop_break += SCHED_NR_MIGRATE_BREAK; 9474 env->flags |= LBF_NEED_BREAK; 9475 break; 9476 } 9477 9478 p = list_last_entry(tasks, struct task_struct, se.group_node); 9479 9480 if (!can_migrate_task(p, env)) 9481 goto next; 9482 9483 switch (env->migration_type) { 9484 case migrate_load: 9485 /* 9486 * Depending of the number of CPUs and tasks and the 9487 * cgroup hierarchy, task_h_load() can return a null 9488 * value. Make sure that env->imbalance decreases 9489 * otherwise detach_tasks() will stop only after 9490 * detaching up to loop_max tasks. 
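 *
 * The shr_bound() check below relaxes progressively as well: a task
 * whose h_load exceeds the remaining imbalance is skipped at first,
 * but after e.g. two failed balance rounds the load is compared
 * right-shifted by two (i.e. quartered), so ever larger tasks become
 * acceptable.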
9491 */ 9492 load = max_t(unsigned long, task_h_load(p), 1); 9493 9494 if (sched_feat(LB_MIN) && 9495 load < 16 && !env->sd->nr_balance_failed) 9496 goto next; 9497 9498 /* 9499 * Make sure that we don't migrate too much load. 9500 * Nevertheless, let relax the constraint if 9501 * scheduler fails to find a good waiting task to 9502 * migrate. 9503 */ 9504 if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance) 9505 goto next; 9506 9507 env->imbalance -= load; 9508 break; 9509 9510 case migrate_util: 9511 util = task_util_est(p); 9512 9513 if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance) 9514 goto next; 9515 9516 env->imbalance -= util; 9517 break; 9518 9519 case migrate_task: 9520 env->imbalance--; 9521 break; 9522 9523 case migrate_misfit: 9524 /* This is not a misfit task */ 9525 if (task_fits_cpu(p, env->src_cpu)) 9526 goto next; 9527 9528 env->imbalance = 0; 9529 break; 9530 } 9531 9532 detach_task(p, env); 9533 list_add(&p->se.group_node, &env->tasks); 9534 9535 detached++; 9536 9537 #ifdef CONFIG_PREEMPTION 9538 /* 9539 * NEWIDLE balancing is a source of latency, so preemptible 9540 * kernels will stop after the first task is detached to minimize 9541 * the critical section. 9542 */ 9543 if (env->idle == CPU_NEWLY_IDLE) 9544 break; 9545 #endif 9546 9547 /* 9548 * We only want to steal up to the prescribed amount of 9549 * load/util/tasks. 9550 */ 9551 if (env->imbalance <= 0) 9552 break; 9553 9554 continue; 9555 next: 9556 list_move(&p->se.group_node, tasks); 9557 } 9558 9559 /* 9560 * Right now, this is one of only two places we collect this stat 9561 * so we can safely collect detach_one_task() stats here rather 9562 * than inside detach_one_task(). 9563 */ 9564 schedstat_add(env->sd->lb_gained[env->idle], detached); 9565 9566 return detached; 9567 } 9568 9569 /* 9570 * attach_task() -- attach the task detached by detach_task() to its new rq. 9571 */ 9572 static void attach_task(struct rq *rq, struct task_struct *p) 9573 { 9574 lockdep_assert_rq_held(rq); 9575 9576 WARN_ON_ONCE(task_rq(p) != rq); 9577 activate_task(rq, p, ENQUEUE_NOCLOCK); 9578 wakeup_preempt(rq, p, 0); 9579 } 9580 9581 /* 9582 * attach_one_task() -- attaches the task returned from detach_one_task() to 9583 * its new rq. 9584 */ 9585 static void attach_one_task(struct rq *rq, struct task_struct *p) 9586 { 9587 struct rq_flags rf; 9588 9589 rq_lock(rq, &rf); 9590 update_rq_clock(rq); 9591 attach_task(rq, p); 9592 rq_unlock(rq, &rf); 9593 } 9594 9595 /* 9596 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their 9597 * new rq. 
9598 */ 9599 static void attach_tasks(struct lb_env *env) 9600 { 9601 struct list_head *tasks = &env->tasks; 9602 struct task_struct *p; 9603 struct rq_flags rf; 9604 9605 rq_lock(env->dst_rq, &rf); 9606 update_rq_clock(env->dst_rq); 9607 9608 while (!list_empty(tasks)) { 9609 p = list_first_entry(tasks, struct task_struct, se.group_node); 9610 list_del_init(&p->se.group_node); 9611 9612 attach_task(env->dst_rq, p); 9613 } 9614 9615 rq_unlock(env->dst_rq, &rf); 9616 } 9617 9618 #ifdef CONFIG_NO_HZ_COMMON 9619 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) 9620 { 9621 if (cfs_rq->avg.load_avg) 9622 return true; 9623 9624 if (cfs_rq->avg.util_avg) 9625 return true; 9626 9627 return false; 9628 } 9629 9630 static inline bool others_have_blocked(struct rq *rq) 9631 { 9632 if (cpu_util_rt(rq)) 9633 return true; 9634 9635 if (cpu_util_dl(rq)) 9636 return true; 9637 9638 if (hw_load_avg(rq)) 9639 return true; 9640 9641 if (cpu_util_irq(rq)) 9642 return true; 9643 9644 return false; 9645 } 9646 9647 static inline void update_blocked_load_tick(struct rq *rq) 9648 { 9649 WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies); 9650 } 9651 9652 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) 9653 { 9654 if (!has_blocked) 9655 rq->has_blocked_load = 0; 9656 } 9657 #else 9658 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } 9659 static inline bool others_have_blocked(struct rq *rq) { return false; } 9660 static inline void update_blocked_load_tick(struct rq *rq) {} 9661 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} 9662 #endif 9663 9664 static bool __update_blocked_others(struct rq *rq, bool *done) 9665 { 9666 bool updated; 9667 9668 /* 9669 * update_load_avg() can call cpufreq_update_util(). Make sure that RT, 9670 * DL and IRQ signals have been updated before updating CFS. 9671 */ 9672 updated = update_other_load_avgs(rq); 9673 9674 if (others_have_blocked(rq)) 9675 *done = false; 9676 9677 return updated; 9678 } 9679 9680 #ifdef CONFIG_FAIR_GROUP_SCHED 9681 9682 static bool __update_blocked_fair(struct rq *rq, bool *done) 9683 { 9684 struct cfs_rq *cfs_rq, *pos; 9685 bool decayed = false; 9686 int cpu = cpu_of(rq); 9687 9688 /* 9689 * Iterates the task_group tree in a bottom up fashion, see 9690 * list_add_leaf_cfs_rq() for details. 9691 */ 9692 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { 9693 struct sched_entity *se; 9694 9695 if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { 9696 update_tg_load_avg(cfs_rq); 9697 9698 if (cfs_rq->nr_running == 0) 9699 update_idle_cfs_rq_clock_pelt(cfs_rq); 9700 9701 if (cfs_rq == &rq->cfs) 9702 decayed = true; 9703 } 9704 9705 /* Propagate pending load changes to the parent, if any: */ 9706 se = cfs_rq->tg->se[cpu]; 9707 if (se && !skip_blocked_update(se)) 9708 update_load_avg(cfs_rq_of(se), se, UPDATE_TG); 9709 9710 /* 9711 * There can be a lot of idle CPU cgroups. Don't let fully 9712 * decayed cfs_rqs linger on the list. 9713 */ 9714 if (cfs_rq_is_decayed(cfs_rq)) 9715 list_del_leaf_cfs_rq(cfs_rq); 9716 9717 /* Don't need periodic decay once load/util_avg are null */ 9718 if (cfs_rq_has_blocked(cfs_rq)) 9719 *done = false; 9720 } 9721 9722 return decayed; 9723 } 9724 9725 /* 9726 * Compute the hierarchical load factor for cfs_rq and all its ascendants. 9727 * This needs to be done in a top-down fashion because the load of a child 9728 * group is a fraction of its parents load. 
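/*
 * Minimal sketch (not kernel code) of the top-down h_load factor described
 * in the comment above and computed by update_cfs_rq_h_load() just below:
 * each level scales its parent's h_load by the group entity's share of the
 * parent cfs_rq's load, with "+ 1" guarding against a zero divisor. The
 * example_* names are hypothetical stand-ins for the cfs_rq/sched_entity
 * fields involved.
 */
struct example_level {
	unsigned long se_load_avg;	/* group entity load seen by the parent */
	unsigned long parent_load_avg;	/* total load of the parent cfs_rq */
};

static unsigned long example_h_load(unsigned long root_load,
				    const struct example_level *level, int depth)
{
	unsigned long h_load = root_load;
	int i;

	for (i = 0; i < depth; i++)
		h_load = h_load * level[i].se_load_avg /
			 (level[i].parent_load_avg + 1);

	return h_load;
}

/*
 * E.g. a root load of 2048 seen through a group entity contributing 512 of
 * its parent's 1024 yields an h_load of about 1023, i.e. roughly half.
 * task_h_load() then applies the same scaling one more time with the
 * task's own load_avg.
 */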
9729 */ 9730 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) 9731 { 9732 struct rq *rq = rq_of(cfs_rq); 9733 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; 9734 unsigned long now = jiffies; 9735 unsigned long load; 9736 9737 if (cfs_rq->last_h_load_update == now) 9738 return; 9739 9740 WRITE_ONCE(cfs_rq->h_load_next, NULL); 9741 for_each_sched_entity(se) { 9742 cfs_rq = cfs_rq_of(se); 9743 WRITE_ONCE(cfs_rq->h_load_next, se); 9744 if (cfs_rq->last_h_load_update == now) 9745 break; 9746 } 9747 9748 if (!se) { 9749 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq); 9750 cfs_rq->last_h_load_update = now; 9751 } 9752 9753 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) { 9754 load = cfs_rq->h_load; 9755 load = div64_ul(load * se->avg.load_avg, 9756 cfs_rq_load_avg(cfs_rq) + 1); 9757 cfs_rq = group_cfs_rq(se); 9758 cfs_rq->h_load = load; 9759 cfs_rq->last_h_load_update = now; 9760 } 9761 } 9762 9763 static unsigned long task_h_load(struct task_struct *p) 9764 { 9765 struct cfs_rq *cfs_rq = task_cfs_rq(p); 9766 9767 update_cfs_rq_h_load(cfs_rq); 9768 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, 9769 cfs_rq_load_avg(cfs_rq) + 1); 9770 } 9771 #else 9772 static bool __update_blocked_fair(struct rq *rq, bool *done) 9773 { 9774 struct cfs_rq *cfs_rq = &rq->cfs; 9775 bool decayed; 9776 9777 decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); 9778 if (cfs_rq_has_blocked(cfs_rq)) 9779 *done = false; 9780 9781 return decayed; 9782 } 9783 9784 static unsigned long task_h_load(struct task_struct *p) 9785 { 9786 return p->se.avg.load_avg; 9787 } 9788 #endif 9789 9790 static void sched_balance_update_blocked_averages(int cpu) 9791 { 9792 bool decayed = false, done = true; 9793 struct rq *rq = cpu_rq(cpu); 9794 struct rq_flags rf; 9795 9796 rq_lock_irqsave(rq, &rf); 9797 update_blocked_load_tick(rq); 9798 update_rq_clock(rq); 9799 9800 decayed |= __update_blocked_others(rq, &done); 9801 decayed |= __update_blocked_fair(rq, &done); 9802 9803 update_blocked_load_status(rq, !done); 9804 if (decayed) 9805 cpufreq_update_util(rq, 0); 9806 rq_unlock_irqrestore(rq, &rf); 9807 } 9808 9809 /********** Helpers for sched_balance_find_src_group ************************/ 9810 9811 /* 9812 * sg_lb_stats - stats of a sched_group required for load-balancing: 9813 */ 9814 struct sg_lb_stats { 9815 unsigned long avg_load; /* Avg load over the CPUs of the group */ 9816 unsigned long group_load; /* Total load over the CPUs of the group */ 9817 unsigned long group_capacity; /* Capacity over the CPUs of the group */ 9818 unsigned long group_util; /* Total utilization over the CPUs of the group */ 9819 unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ 9820 unsigned int sum_nr_running; /* Nr of all tasks running in the group */ 9821 unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ 9822 unsigned int idle_cpus; /* Nr of idle CPUs in the group */ 9823 unsigned int group_weight; 9824 enum group_type group_type; 9825 unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ 9826 unsigned int group_smt_balance; /* Task on busy SMT be moved */ 9827 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ 9828 #ifdef CONFIG_NUMA_BALANCING 9829 unsigned int nr_numa_running; 9830 unsigned int nr_preferred_running; 9831 #endif 9832 }; 9833 9834 /* 9835 * sd_lb_stats - stats of a sched_domain required for load-balancing: 9836 */ 9837 struct sd_lb_stats { 9838 struct sched_group *busiest; /* Busiest 
group in this sd */ 9839 struct sched_group *local; /* Local group in this sd */ 9840 unsigned long total_load; /* Total load of all groups in sd */ 9841 unsigned long total_capacity; /* Total capacity of all groups in sd */ 9842 unsigned long avg_load; /* Average load across all groups in sd */ 9843 unsigned int prefer_sibling; /* Tasks should go to sibling first */ 9844 9845 struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */ 9846 struct sg_lb_stats local_stat; /* Statistics of the local group */ 9847 }; 9848 9849 static inline void init_sd_lb_stats(struct sd_lb_stats *sds) 9850 { 9851 /* 9852 * Skimp on the clearing to avoid duplicate work. We can avoid clearing 9853 * local_stat because update_sg_lb_stats() does a full clear/assignment. 9854 * We must however set busiest_stat::group_type and 9855 * busiest_stat::idle_cpus to the worst busiest group because 9856 * update_sd_pick_busiest() reads these before assignment. 9857 */ 9858 *sds = (struct sd_lb_stats){ 9859 .busiest = NULL, 9860 .local = NULL, 9861 .total_load = 0UL, 9862 .total_capacity = 0UL, 9863 .busiest_stat = { 9864 .idle_cpus = UINT_MAX, 9865 .group_type = group_has_spare, 9866 }, 9867 }; 9868 } 9869 9870 static unsigned long scale_rt_capacity(int cpu) 9871 { 9872 unsigned long max = get_actual_cpu_capacity(cpu); 9873 struct rq *rq = cpu_rq(cpu); 9874 unsigned long used, free; 9875 unsigned long irq; 9876 9877 irq = cpu_util_irq(rq); 9878 9879 if (unlikely(irq >= max)) 9880 return 1; 9881 9882 /* 9883 * avg_rt.util_avg and avg_dl.util_avg track binary signals 9884 * (running and not running) with weights 0 and 1024 respectively. 9885 */ 9886 used = cpu_util_rt(rq); 9887 used += cpu_util_dl(rq); 9888 9889 if (unlikely(used >= max)) 9890 return 1; 9891 9892 free = max - used; 9893 9894 return scale_irq_capacity(free, irq, max); 9895 } 9896 9897 static void update_cpu_capacity(struct sched_domain *sd, int cpu) 9898 { 9899 unsigned long capacity = scale_rt_capacity(cpu); 9900 struct sched_group *sdg = sd->groups; 9901 9902 if (!capacity) 9903 capacity = 1; 9904 9905 cpu_rq(cpu)->cpu_capacity = capacity; 9906 trace_sched_cpu_capacity_tp(cpu_rq(cpu)); 9907 9908 sdg->sgc->capacity = capacity; 9909 sdg->sgc->min_capacity = capacity; 9910 sdg->sgc->max_capacity = capacity; 9911 } 9912 9913 void update_group_capacity(struct sched_domain *sd, int cpu) 9914 { 9915 struct sched_domain *child = sd->child; 9916 struct sched_group *group, *sdg = sd->groups; 9917 unsigned long capacity, min_capacity, max_capacity; 9918 unsigned long interval; 9919 9920 interval = msecs_to_jiffies(sd->balance_interval); 9921 interval = clamp(interval, 1UL, max_load_balance_interval); 9922 sdg->sgc->next_update = jiffies + interval; 9923 9924 if (!child) { 9925 update_cpu_capacity(sd, cpu); 9926 return; 9927 } 9928 9929 capacity = 0; 9930 min_capacity = ULONG_MAX; 9931 max_capacity = 0; 9932 9933 if (child->flags & SD_OVERLAP) { 9934 /* 9935 * SD_OVERLAP domains cannot assume that child groups 9936 * span the current group. 9937 */ 9938 9939 for_each_cpu(cpu, sched_group_span(sdg)) { 9940 unsigned long cpu_cap = capacity_of(cpu); 9941 9942 capacity += cpu_cap; 9943 min_capacity = min(cpu_cap, min_capacity); 9944 max_capacity = max(cpu_cap, max_capacity); 9945 } 9946 } else { 9947 /* 9948 * !SD_OVERLAP domains can assume that child groups 9949 * span the current group. 
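/*
 * Illustrative sketch (not kernel code) of scale_rt_capacity() above: the
 * capacity left for CFS is the (thermally adjusted) maximum minus the RT
 * and DL utilization, scaled down by the IRQ-busy fraction. Modelling
 * scale_irq_capacity() as free * (max - irq) / max is this editor's
 * reading of that helper, not a definition taken from this file.
 */
static unsigned long example_cfs_capacity(unsigned long max,	/* actual CPU capacity */
					  unsigned long irq,	/* cpu_util_irq()      */
					  unsigned long rt,	/* cpu_util_rt()       */
					  unsigned long dl)	/* cpu_util_dl()       */
{
	unsigned long used, free;

	if (irq >= max)
		return 1;

	used = rt + dl;
	if (used >= max)
		return 1;

	free = max - used;

	return free * (max - irq) / max;
}

/* E.g. max = 1024, irq = 0, rt = 256, dl = 0 leaves 768 for CFS tasks. */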
9950 */ 9951 9952 group = child->groups; 9953 do { 9954 struct sched_group_capacity *sgc = group->sgc; 9955 9956 capacity += sgc->capacity; 9957 min_capacity = min(sgc->min_capacity, min_capacity); 9958 max_capacity = max(sgc->max_capacity, max_capacity); 9959 group = group->next; 9960 } while (group != child->groups); 9961 } 9962 9963 sdg->sgc->capacity = capacity; 9964 sdg->sgc->min_capacity = min_capacity; 9965 sdg->sgc->max_capacity = max_capacity; 9966 } 9967 9968 /* 9969 * Check whether the capacity of the rq has been noticeably reduced by side 9970 * activity. The imbalance_pct is used for the threshold. 9971 * Return true if the capacity is reduced 9972 */ 9973 static inline int 9974 check_cpu_capacity(struct rq *rq, struct sched_domain *sd) 9975 { 9976 return ((rq->cpu_capacity * sd->imbalance_pct) < 9977 (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); 9978 } 9979 9980 /* Check if the rq has a misfit task */ 9981 static inline bool check_misfit_status(struct rq *rq) 9982 { 9983 return rq->misfit_task_load; 9984 } 9985 9986 /* 9987 * Group imbalance indicates (and tries to solve) the problem where balancing 9988 * groups is inadequate due to ->cpus_ptr constraints. 9989 * 9990 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a 9991 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. 9992 * Something like: 9993 * 9994 * { 0 1 2 3 } { 4 5 6 7 } 9995 * * * * * 9996 * 9997 * If we were to balance group-wise we'd place two tasks in the first group and 9998 * two tasks in the second group. Clearly this is undesired as it will overload 9999 * cpu 3 and leave one of the CPUs in the second group unused. 10000 * 10001 * The current solution to this issue is detecting the skew in the first group 10002 * by noticing the lower domain failed to reach balance and had difficulty 10003 * moving tasks due to affinity constraints. 10004 * 10005 * When this is so detected, this group becomes a candidate for busiest; see 10006 * update_sd_pick_busiest(). And calculate_imbalance() and 10007 * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it 10008 * to create an effective group imbalance. 10009 * 10010 * This is a somewhat tricky proposition since the next run might not find the 10011 * group imbalance and decide the groups need to be balanced again. A most 10012 * subtle and fragile situation. 10013 */ 10014 10015 static inline int sg_imbalanced(struct sched_group *group) 10016 { 10017 return group->sgc->imbalance; 10018 } 10019 10020 /* 10021 * group_has_capacity returns true if the group has spare capacity that could 10022 * be used by some tasks. 10023 * We consider that a group has spare capacity if the number of tasks is 10024 * smaller than the number of CPUs or if the utilization is lower than the 10025 * available capacity for CFS tasks. 10026 * For the latter, we use a threshold to stabilize the state, to take into 10027 * account the variance of the tasks' load and to return true if the available 10028 * capacity is meaningful for the load balancer. 10029 * As an example, an available capacity of 1% can appear but it doesn't bring 10030 * any benefit to the load balancer.
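/*
 * Worked example (not kernel code) of the check_cpu_capacity() threshold
 * above. The numbers are assumptions: imbalance_pct = 117 (the LLC value
 * quoted later in this file) and an architectural capacity of 1024.
 */
static int example_capacity_reduced(unsigned long cpu_capacity,
				    unsigned long arch_capacity,
				    unsigned int imbalance_pct)
{
	return cpu_capacity * imbalance_pct < arch_capacity * 100;
}

/*
 * With arch_capacity = 1024 and imbalance_pct = 117 the rq counts as
 * "reduced" once cpu_capacity drops below ~875, i.e. once roughly 15% of
 * the CPU is consumed by RT/DL/IRQ side activity.
 */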
10031 */ 10032 static inline bool 10033 group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs) 10034 { 10035 if (sgs->sum_nr_running < sgs->group_weight) 10036 return true; 10037 10038 if ((sgs->group_capacity * imbalance_pct) < 10039 (sgs->group_runnable * 100)) 10040 return false; 10041 10042 if ((sgs->group_capacity * 100) > 10043 (sgs->group_util * imbalance_pct)) 10044 return true; 10045 10046 return false; 10047 } 10048 10049 /* 10050 * group_is_overloaded returns true if the group has more tasks than it can 10051 * handle. 10052 * group_is_overloaded is not equals to !group_has_capacity because a group 10053 * with the exact right number of tasks, has no more spare capacity but is not 10054 * overloaded so both group_has_capacity and group_is_overloaded return 10055 * false. 10056 */ 10057 static inline bool 10058 group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) 10059 { 10060 if (sgs->sum_nr_running <= sgs->group_weight) 10061 return false; 10062 10063 if ((sgs->group_capacity * 100) < 10064 (sgs->group_util * imbalance_pct)) 10065 return true; 10066 10067 if ((sgs->group_capacity * imbalance_pct) < 10068 (sgs->group_runnable * 100)) 10069 return true; 10070 10071 return false; 10072 } 10073 10074 static inline enum 10075 group_type group_classify(unsigned int imbalance_pct, 10076 struct sched_group *group, 10077 struct sg_lb_stats *sgs) 10078 { 10079 if (group_is_overloaded(imbalance_pct, sgs)) 10080 return group_overloaded; 10081 10082 if (sg_imbalanced(group)) 10083 return group_imbalanced; 10084 10085 if (sgs->group_asym_packing) 10086 return group_asym_packing; 10087 10088 if (sgs->group_smt_balance) 10089 return group_smt_balance; 10090 10091 if (sgs->group_misfit_task_load) 10092 return group_misfit_task; 10093 10094 if (!group_has_capacity(imbalance_pct, sgs)) 10095 return group_fully_busy; 10096 10097 return group_has_spare; 10098 } 10099 10100 /** 10101 * sched_use_asym_prio - Check whether asym_packing priority must be used 10102 * @sd: The scheduling domain of the load balancing 10103 * @cpu: A CPU 10104 * 10105 * Always use CPU priority when balancing load between SMT siblings. When 10106 * balancing load between cores, it is not sufficient that @cpu is idle. Only 10107 * use CPU priority if the whole core is idle. 10108 * 10109 * Returns: True if the priority of @cpu must be followed. False otherwise. 10110 */ 10111 static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) 10112 { 10113 if (!(sd->flags & SD_ASYM_PACKING)) 10114 return false; 10115 10116 if (!sched_smt_active()) 10117 return true; 10118 10119 return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu); 10120 } 10121 10122 static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu) 10123 { 10124 /* 10125 * First check if @dst_cpu can do asym_packing load balance. Only do it 10126 * if it has higher priority than @src_cpu. 10127 */ 10128 return sched_use_asym_prio(sd, dst_cpu) && 10129 sched_asym_prefer(dst_cpu, src_cpu); 10130 } 10131 10132 /** 10133 * sched_group_asym - Check if the destination CPU can do asym_packing balance 10134 * @env: The load balancing environment 10135 * @sgs: Load-balancing statistics of the candidate busiest group 10136 * @group: The candidate busiest group 10137 * 10138 * @env::dst_cpu can do asym_packing if it has higher priority than the 10139 * preferred CPU of @group. 10140 * 10141 * Return: true if @env::dst_cpu can do with asym_packing load balance. False 10142 * otherwise. 
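/*
 * Illustrative sketch (not kernel code) of group_has_capacity() and
 * group_is_overloaded() above. The example_* types are hypothetical
 * mirrors of the sg_lb_stats fields used by both helpers; note the two
 * predicates are deliberately not complements of each other.
 */
struct example_sg_stats {
	unsigned long group_capacity;
	unsigned long group_util;
	unsigned long group_runnable;
	unsigned int  sum_nr_running;
	unsigned int  group_weight;
};

static int example_has_capacity(unsigned int imbalance_pct,
				const struct example_sg_stats *s)
{
	if (s->sum_nr_running < s->group_weight)
		return 1;
	if (s->group_capacity * imbalance_pct < s->group_runnable * 100)
		return 0;
	return s->group_capacity * 100 > s->group_util * imbalance_pct;
}

static int example_is_overloaded(unsigned int imbalance_pct,
				 const struct example_sg_stats *s)
{
	if (s->sum_nr_running <= s->group_weight)
		return 0;
	if (s->group_capacity * 100 < s->group_util * imbalance_pct)
		return 1;
	return s->group_capacity * imbalance_pct < s->group_runnable * 100;
}

/*
 * E.g. capacity 1024, util 900, runnable 900, 4 tasks on 4 CPUs and
 * imbalance_pct 117: has_capacity is false (900 * 1.17 > 1024) yet
 * is_overloaded is also false (no more tasks than CPUs), so the group
 * classifies as fully busy rather than overloaded.
 */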
10143 */ 10144 static inline bool 10145 sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group) 10146 { 10147 /* 10148 * CPU priorities do not make sense for SMT cores with more than one 10149 * busy sibling. 10150 */ 10151 if ((group->flags & SD_SHARE_CPUCAPACITY) && 10152 (sgs->group_weight - sgs->idle_cpus != 1)) 10153 return false; 10154 10155 return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu); 10156 } 10157 10158 /* One group has more than one SMT CPU while the other group does not */ 10159 static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1, 10160 struct sched_group *sg2) 10161 { 10162 if (!sg1 || !sg2) 10163 return false; 10164 10165 return (sg1->flags & SD_SHARE_CPUCAPACITY) != 10166 (sg2->flags & SD_SHARE_CPUCAPACITY); 10167 } 10168 10169 static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, 10170 struct sched_group *group) 10171 { 10172 if (!env->idle) 10173 return false; 10174 10175 /* 10176 * For SMT source group, it is better to move a task 10177 * to a CPU that doesn't have multiple tasks sharing its CPU capacity. 10178 * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY 10179 * will not be on. 10180 */ 10181 if (group->flags & SD_SHARE_CPUCAPACITY && 10182 sgs->sum_h_nr_running > 1) 10183 return true; 10184 10185 return false; 10186 } 10187 10188 static inline long sibling_imbalance(struct lb_env *env, 10189 struct sd_lb_stats *sds, 10190 struct sg_lb_stats *busiest, 10191 struct sg_lb_stats *local) 10192 { 10193 int ncores_busiest, ncores_local; 10194 long imbalance; 10195 10196 if (!env->idle || !busiest->sum_nr_running) 10197 return 0; 10198 10199 ncores_busiest = sds->busiest->cores; 10200 ncores_local = sds->local->cores; 10201 10202 if (ncores_busiest == ncores_local) { 10203 imbalance = busiest->sum_nr_running; 10204 lsub_positive(&imbalance, local->sum_nr_running); 10205 return imbalance; 10206 } 10207 10208 /* Balance such that nr_running/ncores ratio are same on both groups */ 10209 imbalance = ncores_local * busiest->sum_nr_running; 10210 lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running); 10211 /* Normalize imbalance and do rounding on normalization */ 10212 imbalance = 2 * imbalance + ncores_local + ncores_busiest; 10213 imbalance /= ncores_local + ncores_busiest; 10214 10215 /* Take advantage of resource in an empty sched group */ 10216 if (imbalance <= 1 && local->sum_nr_running == 0 && 10217 busiest->sum_nr_running > 1) 10218 imbalance = 2; 10219 10220 return imbalance; 10221 } 10222 10223 static inline bool 10224 sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) 10225 { 10226 /* 10227 * When there is more than 1 task, the group_overloaded case already 10228 * takes care of cpu with reduced capacity 10229 */ 10230 if (rq->cfs.h_nr_running != 1) 10231 return false; 10232 10233 return check_cpu_capacity(rq, sd); 10234 } 10235 10236 /** 10237 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 10238 * @env: The load balancing environment. 10239 * @sds: Load-balancing data with statistics of the local group. 10240 * @group: sched_group whose statistics are to be updated. 10241 * @sgs: variable to hold the statistics for this group. 
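/*
 * Worked example (not kernel code) of the normalization performed by
 * sibling_imbalance() above when the two groups have different core
 * counts; lsub_positive() is modelled as a subtraction clamped at zero and
 * the "destination not idle" early exit of the kernel helper is left out.
 */
static long example_sibling_imbalance(int ncores_busiest, int ncores_local,
				      long busiest_nr, long local_nr)
{
	long imbalance;

	if (!busiest_nr)
		return 0;

	if (ncores_busiest == ncores_local) {
		imbalance = busiest_nr - local_nr;
		return imbalance > 0 ? imbalance : 0;
	}

	/* Aim for the same nr_running/ncores ratio on both groups. */
	imbalance = ncores_local * busiest_nr - ncores_busiest * local_nr;
	if (imbalance < 0)
		imbalance = 0;

	/* Normalize by the average core count, rounding to nearest. */
	imbalance = (2 * imbalance + ncores_local + ncores_busiest) /
		    (ncores_local + ncores_busiest);

	/* Make use of a completely idle group even for a marginal skew. */
	if (imbalance <= 1 && local_nr == 0 && busiest_nr > 1)
		imbalance = 2;

	return imbalance;
}

/*
 * E.g. a busiest group of 4 cores with 6 tasks vs. a local group of
 * 2 cores with 1 task: (2*6 - 4*1) = 8, normalized to (2*8 + 6)/6 = 3.
 * calculate_imbalance() later halves the value (env->imbalance >>= 1), so
 * one task is moved, leaving 5 tasks on 4 cores vs. 2 tasks on 2 cores.
 */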
10242 * @sg_overloaded: sched_group is overloaded 10243 * @sg_overutilized: sched_group is overutilized 10244 */ 10245 static inline void update_sg_lb_stats(struct lb_env *env, 10246 struct sd_lb_stats *sds, 10247 struct sched_group *group, 10248 struct sg_lb_stats *sgs, 10249 bool *sg_overloaded, 10250 bool *sg_overutilized) 10251 { 10252 int i, nr_running, local_group; 10253 10254 memset(sgs, 0, sizeof(*sgs)); 10255 10256 local_group = group == sds->local; 10257 10258 for_each_cpu_and(i, sched_group_span(group), env->cpus) { 10259 struct rq *rq = cpu_rq(i); 10260 unsigned long load = cpu_load(rq); 10261 10262 sgs->group_load += load; 10263 sgs->group_util += cpu_util_cfs(i); 10264 sgs->group_runnable += cpu_runnable(rq); 10265 sgs->sum_h_nr_running += rq->cfs.h_nr_running; 10266 10267 nr_running = rq->nr_running; 10268 sgs->sum_nr_running += nr_running; 10269 10270 if (nr_running > 1) 10271 *sg_overloaded = 1; 10272 10273 if (cpu_overutilized(i)) 10274 *sg_overutilized = 1; 10275 10276 #ifdef CONFIG_NUMA_BALANCING 10277 sgs->nr_numa_running += rq->nr_numa_running; 10278 sgs->nr_preferred_running += rq->nr_preferred_running; 10279 #endif 10280 /* 10281 * No need to call idle_cpu() if nr_running is not 0 10282 */ 10283 if (!nr_running && idle_cpu(i)) { 10284 sgs->idle_cpus++; 10285 /* Idle cpu can't have misfit task */ 10286 continue; 10287 } 10288 10289 if (local_group) 10290 continue; 10291 10292 if (env->sd->flags & SD_ASYM_CPUCAPACITY) { 10293 /* Check for a misfit task on the cpu */ 10294 if (sgs->group_misfit_task_load < rq->misfit_task_load) { 10295 sgs->group_misfit_task_load = rq->misfit_task_load; 10296 *sg_overloaded = 1; 10297 } 10298 } else if (env->idle && sched_reduced_capacity(rq, env->sd)) { 10299 /* Check for a task running on a CPU with reduced capacity */ 10300 if (sgs->group_misfit_task_load < load) 10301 sgs->group_misfit_task_load = load; 10302 } 10303 } 10304 10305 sgs->group_capacity = group->sgc->capacity; 10306 10307 sgs->group_weight = group->group_weight; 10308 10309 /* Check if dst CPU is idle and preferred to this group */ 10310 if (!local_group && env->idle && sgs->sum_h_nr_running && 10311 sched_group_asym(env, sgs, group)) 10312 sgs->group_asym_packing = 1; 10313 10314 /* Check for loaded SMT group to be balanced to dst CPU */ 10315 if (!local_group && smt_balance(env, sgs, group)) 10316 sgs->group_smt_balance = 1; 10317 10318 sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); 10319 10320 /* Computing avg_load makes sense only when group is overloaded */ 10321 if (sgs->group_type == group_overloaded) 10322 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / 10323 sgs->group_capacity; 10324 } 10325 10326 /** 10327 * update_sd_pick_busiest - return 1 on busiest group 10328 * @env: The load balancing environment. 10329 * @sds: sched_domain statistics 10330 * @sg: sched_group candidate to be checked for being the busiest 10331 * @sgs: sched_group statistics 10332 * 10333 * Determine if @sg is a busier group than the previously selected 10334 * busiest group. 10335 * 10336 * Return: %true if @sg is a busier group than the previously selected 10337 * busiest group. %false otherwise. 
10338 */ 10339 static bool update_sd_pick_busiest(struct lb_env *env, 10340 struct sd_lb_stats *sds, 10341 struct sched_group *sg, 10342 struct sg_lb_stats *sgs) 10343 { 10344 struct sg_lb_stats *busiest = &sds->busiest_stat; 10345 10346 /* Make sure that there is at least one task to pull */ 10347 if (!sgs->sum_h_nr_running) 10348 return false; 10349 10350 /* 10351 * Don't try to pull misfit tasks we can't help. 10352 * We can use max_capacity here as reduction in capacity on some 10353 * CPUs in the group should either be possible to resolve 10354 * internally or be covered by avg_load imbalance (eventually). 10355 */ 10356 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && 10357 (sgs->group_type == group_misfit_task) && 10358 (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) || 10359 sds->local_stat.group_type != group_has_spare)) 10360 return false; 10361 10362 if (sgs->group_type > busiest->group_type) 10363 return true; 10364 10365 if (sgs->group_type < busiest->group_type) 10366 return false; 10367 10368 /* 10369 * The candidate and the current busiest group are the same type of 10370 * group. Let check which one is the busiest according to the type. 10371 */ 10372 10373 switch (sgs->group_type) { 10374 case group_overloaded: 10375 /* Select the overloaded group with highest avg_load. */ 10376 return sgs->avg_load > busiest->avg_load; 10377 10378 case group_imbalanced: 10379 /* 10380 * Select the 1st imbalanced group as we don't have any way to 10381 * choose one more than another. 10382 */ 10383 return false; 10384 10385 case group_asym_packing: 10386 /* Prefer to move from lowest priority CPU's work */ 10387 return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu); 10388 10389 case group_misfit_task: 10390 /* 10391 * If we have more than one misfit sg go with the biggest 10392 * misfit. 10393 */ 10394 return sgs->group_misfit_task_load > busiest->group_misfit_task_load; 10395 10396 case group_smt_balance: 10397 /* 10398 * Check if we have spare CPUs on either SMT group to 10399 * choose has spare or fully busy handling. 10400 */ 10401 if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) 10402 goto has_spare; 10403 10404 fallthrough; 10405 10406 case group_fully_busy: 10407 /* 10408 * Select the fully busy group with highest avg_load. In 10409 * theory, there is no need to pull task from such kind of 10410 * group because tasks have all compute capacity that they need 10411 * but we can still improve the overall throughput by reducing 10412 * contention when accessing shared HW resources. 10413 * 10414 * XXX for now avg_load is not computed and always 0 so we 10415 * select the 1st one, except if @sg is composed of SMT 10416 * siblings. 10417 */ 10418 10419 if (sgs->avg_load < busiest->avg_load) 10420 return false; 10421 10422 if (sgs->avg_load == busiest->avg_load) { 10423 /* 10424 * SMT sched groups need more help than non-SMT groups. 10425 * If @sg happens to also be SMT, either choice is good. 10426 */ 10427 if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) 10428 return false; 10429 } 10430 10431 break; 10432 10433 case group_has_spare: 10434 /* 10435 * Do not pick sg with SMT CPUs over sg with pure CPUs, 10436 * as we do not want to pull task off SMT core with one task 10437 * and make the core idle. 
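/*
 * Illustrative sketch (not kernel code) of the avg_load tie-break that
 * update_sd_pick_busiest() above applies to group_overloaded candidates.
 * SCHED_CAPACITY_SCALE is assumed to be 1024, its usual value.
 */
static unsigned long example_avg_load(unsigned long group_load,
				      unsigned long group_capacity)
{
	return group_load * 1024 / group_capacity;
}

static int example_overloaded_is_busier(unsigned long cand_load, unsigned long cand_cap,
					unsigned long cur_load, unsigned long cur_cap)
{
	return example_avg_load(cand_load, cand_cap) >
	       example_avg_load(cur_load, cur_cap);
}

/*
 * E.g. a candidate with 2000 load over 2048 capacity (avg_load 1000) does
 * not displace a current busiest with 1500 load over 1024 capacity
 * (avg_load 1500): the smaller group is proportionally busier.
 */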
10438 */ 10439 if (smt_vs_nonsmt_groups(sds->busiest, sg)) { 10440 if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1) 10441 return false; 10442 else 10443 return true; 10444 } 10445 has_spare: 10446 10447 /* 10448 * Select not overloaded group with lowest number of idle CPUs 10449 * and highest number of running tasks. We could also compare 10450 * the spare capacity which is more stable but it can end up 10451 * that the group has less spare capacity but finally more idle 10452 * CPUs which means less opportunity to pull tasks. 10453 */ 10454 if (sgs->idle_cpus > busiest->idle_cpus) 10455 return false; 10456 else if ((sgs->idle_cpus == busiest->idle_cpus) && 10457 (sgs->sum_nr_running <= busiest->sum_nr_running)) 10458 return false; 10459 10460 break; 10461 } 10462 10463 /* 10464 * Candidate sg has no more than one task per CPU and has higher 10465 * per-CPU capacity. Migrating tasks to less capable CPUs may harm 10466 * throughput. Maximize throughput, power/energy consequences are not 10467 * considered. 10468 */ 10469 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && 10470 (sgs->group_type <= group_fully_busy) && 10471 (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu)))) 10472 return false; 10473 10474 return true; 10475 } 10476 10477 #ifdef CONFIG_NUMA_BALANCING 10478 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) 10479 { 10480 if (sgs->sum_h_nr_running > sgs->nr_numa_running) 10481 return regular; 10482 if (sgs->sum_h_nr_running > sgs->nr_preferred_running) 10483 return remote; 10484 return all; 10485 } 10486 10487 static inline enum fbq_type fbq_classify_rq(struct rq *rq) 10488 { 10489 if (rq->nr_running > rq->nr_numa_running) 10490 return regular; 10491 if (rq->nr_running > rq->nr_preferred_running) 10492 return remote; 10493 return all; 10494 } 10495 #else 10496 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) 10497 { 10498 return all; 10499 } 10500 10501 static inline enum fbq_type fbq_classify_rq(struct rq *rq) 10502 { 10503 return regular; 10504 } 10505 #endif /* CONFIG_NUMA_BALANCING */ 10506 10507 10508 struct sg_lb_stats; 10509 10510 /* 10511 * task_running_on_cpu - return 1 if @p is running on @cpu. 10512 */ 10513 10514 static unsigned int task_running_on_cpu(int cpu, struct task_struct *p) 10515 { 10516 /* Task has no contribution or is new */ 10517 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) 10518 return 0; 10519 10520 if (task_on_rq_queued(p)) 10521 return 1; 10522 10523 return 0; 10524 } 10525 10526 /** 10527 * idle_cpu_without - would a given CPU be idle without p ? 10528 * @cpu: the processor on which idleness is tested. 10529 * @p: task which should be ignored. 10530 * 10531 * Return: 1 if the CPU would be idle. 0 otherwise. 10532 */ 10533 static int idle_cpu_without(int cpu, struct task_struct *p) 10534 { 10535 struct rq *rq = cpu_rq(cpu); 10536 10537 if (rq->curr != rq->idle && rq->curr != p) 10538 return 0; 10539 10540 /* 10541 * rq->nr_running can't be used but an updated version without the 10542 * impact of p on cpu must be used instead. The updated nr_running 10543 * be computed and tested before calling idle_cpu_without(). 10544 */ 10545 10546 if (rq->ttwu_pending) 10547 return 0; 10548 10549 return 1; 10550 } 10551 10552 /* 10553 * update_sg_wakeup_stats - Update sched_group's statistics for wakeup. 10554 * @sd: The sched_domain level to look for idlest group. 10555 * @group: sched_group whose statistics are to be updated. 
10556 * @sgs: variable to hold the statistics for this group. 10557 * @p: The task for which we look for the idlest group/CPU. 10558 */ 10559 static inline void update_sg_wakeup_stats(struct sched_domain *sd, 10560 struct sched_group *group, 10561 struct sg_lb_stats *sgs, 10562 struct task_struct *p) 10563 { 10564 int i, nr_running; 10565 10566 memset(sgs, 0, sizeof(*sgs)); 10567 10568 /* Assume that task can't fit any CPU of the group */ 10569 if (sd->flags & SD_ASYM_CPUCAPACITY) 10570 sgs->group_misfit_task_load = 1; 10571 10572 for_each_cpu(i, sched_group_span(group)) { 10573 struct rq *rq = cpu_rq(i); 10574 unsigned int local; 10575 10576 sgs->group_load += cpu_load_without(rq, p); 10577 sgs->group_util += cpu_util_without(i, p); 10578 sgs->group_runnable += cpu_runnable_without(rq, p); 10579 local = task_running_on_cpu(i, p); 10580 sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; 10581 10582 nr_running = rq->nr_running - local; 10583 sgs->sum_nr_running += nr_running; 10584 10585 /* 10586 * No need to call idle_cpu_without() if nr_running is not 0 10587 */ 10588 if (!nr_running && idle_cpu_without(i, p)) 10589 sgs->idle_cpus++; 10590 10591 /* Check if task fits in the CPU */ 10592 if (sd->flags & SD_ASYM_CPUCAPACITY && 10593 sgs->group_misfit_task_load && 10594 task_fits_cpu(p, i)) 10595 sgs->group_misfit_task_load = 0; 10596 10597 } 10598 10599 sgs->group_capacity = group->sgc->capacity; 10600 10601 sgs->group_weight = group->group_weight; 10602 10603 sgs->group_type = group_classify(sd->imbalance_pct, group, sgs); 10604 10605 /* 10606 * Computing avg_load makes sense only when group is fully busy or 10607 * overloaded 10608 */ 10609 if (sgs->group_type == group_fully_busy || 10610 sgs->group_type == group_overloaded) 10611 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / 10612 sgs->group_capacity; 10613 } 10614 10615 static bool update_pick_idlest(struct sched_group *idlest, 10616 struct sg_lb_stats *idlest_sgs, 10617 struct sched_group *group, 10618 struct sg_lb_stats *sgs) 10619 { 10620 if (sgs->group_type < idlest_sgs->group_type) 10621 return true; 10622 10623 if (sgs->group_type > idlest_sgs->group_type) 10624 return false; 10625 10626 /* 10627 * The candidate and the current idlest group are the same type of 10628 * group. Let check which one is the idlest according to the type. 10629 */ 10630 10631 switch (sgs->group_type) { 10632 case group_overloaded: 10633 case group_fully_busy: 10634 /* Select the group with lowest avg_load. */ 10635 if (idlest_sgs->avg_load <= sgs->avg_load) 10636 return false; 10637 break; 10638 10639 case group_imbalanced: 10640 case group_asym_packing: 10641 case group_smt_balance: 10642 /* Those types are not used in the slow wakeup path */ 10643 return false; 10644 10645 case group_misfit_task: 10646 /* Select group with the highest max capacity */ 10647 if (idlest->sgc->max_capacity >= group->sgc->max_capacity) 10648 return false; 10649 break; 10650 10651 case group_has_spare: 10652 /* Select group with most idle CPUs */ 10653 if (idlest_sgs->idle_cpus > sgs->idle_cpus) 10654 return false; 10655 10656 /* Select group with lowest group_util */ 10657 if (idlest_sgs->idle_cpus == sgs->idle_cpus && 10658 idlest_sgs->group_util <= sgs->group_util) 10659 return false; 10660 10661 break; 10662 } 10663 10664 return true; 10665 } 10666 10667 /* 10668 * sched_balance_find_dst_group() finds and returns the least busy CPU group within the 10669 * domain. 10670 * 10671 * Assumes p is allowed on at least one CPU in sd. 
10672 */ 10673 static struct sched_group * 10674 sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) 10675 { 10676 struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; 10677 struct sg_lb_stats local_sgs, tmp_sgs; 10678 struct sg_lb_stats *sgs; 10679 unsigned long imbalance; 10680 struct sg_lb_stats idlest_sgs = { 10681 .avg_load = UINT_MAX, 10682 .group_type = group_overloaded, 10683 }; 10684 10685 do { 10686 int local_group; 10687 10688 /* Skip over this group if it has no CPUs allowed */ 10689 if (!cpumask_intersects(sched_group_span(group), 10690 p->cpus_ptr)) 10691 continue; 10692 10693 /* Skip over this group if no cookie matched */ 10694 if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group)) 10695 continue; 10696 10697 local_group = cpumask_test_cpu(this_cpu, 10698 sched_group_span(group)); 10699 10700 if (local_group) { 10701 sgs = &local_sgs; 10702 local = group; 10703 } else { 10704 sgs = &tmp_sgs; 10705 } 10706 10707 update_sg_wakeup_stats(sd, group, sgs, p); 10708 10709 if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) { 10710 idlest = group; 10711 idlest_sgs = *sgs; 10712 } 10713 10714 } while (group = group->next, group != sd->groups); 10715 10716 10717 /* There is no idlest group to push tasks to */ 10718 if (!idlest) 10719 return NULL; 10720 10721 /* The local group has been skipped because of CPU affinity */ 10722 if (!local) 10723 return idlest; 10724 10725 /* 10726 * If the local group is idler than the selected idlest group 10727 * don't try and push the task. 10728 */ 10729 if (local_sgs.group_type < idlest_sgs.group_type) 10730 return NULL; 10731 10732 /* 10733 * If the local group is busier than the selected idlest group 10734 * try and push the task. 10735 */ 10736 if (local_sgs.group_type > idlest_sgs.group_type) 10737 return idlest; 10738 10739 switch (local_sgs.group_type) { 10740 case group_overloaded: 10741 case group_fully_busy: 10742 10743 /* Calculate allowed imbalance based on load */ 10744 imbalance = scale_load_down(NICE_0_LOAD) * 10745 (sd->imbalance_pct-100) / 100; 10746 10747 /* 10748 * When comparing groups across NUMA domains, it's possible for 10749 * the local domain to be very lightly loaded relative to the 10750 * remote domains but "imbalance" skews the comparison making 10751 * remote CPUs look much more favourable. When considering 10752 * cross-domain, add imbalance to the load on the remote node 10753 * and consider staying local. 10754 */ 10755 10756 if ((sd->flags & SD_NUMA) && 10757 ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load)) 10758 return NULL; 10759 10760 /* 10761 * If the local group is less loaded than the selected 10762 * idlest group don't try and push any tasks. 
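/*
 * Worked example (not kernel code) of the margin applied above when both
 * the local and the idlest group are fully busy or overloaded. It assumes
 * scale_load_down(NICE_0_LOAD) == 1024 and takes imbalance_pct as a
 * parameter; the example_* name is hypothetical.
 */
static int example_push_to_idlest(unsigned long local_avg, unsigned long idlest_avg,
				  unsigned int imbalance_pct, int numa)
{
	unsigned long margin = 1024UL * (imbalance_pct - 100) / 100;	/* ~174 for 117 */

	/* On NUMA, pad the remote load so staying local is preferred. */
	if (numa && idlest_avg + margin >= local_avg)
		return 0;

	/* Never push toward a group that is itself notably busier. */
	if (idlest_avg >= local_avg + margin)
		return 0;

	/* Only push if local is busier by more than imbalance_pct. */
	if (100 * local_avg <= imbalance_pct * idlest_avg)
		return 0;

	return 1;
}

/*
 * E.g. with idlest_avg = 1000 and imbalance_pct = 117: the non-NUMA checks
 * allow pushing once local_avg exceeds 1170, while the NUMA variant keeps
 * the task local until local_avg exceeds 1174 (idlest_avg plus the margin).
 */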
10763 */ 10764 if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance)) 10765 return NULL; 10766 10767 if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load) 10768 return NULL; 10769 break; 10770 10771 case group_imbalanced: 10772 case group_asym_packing: 10773 case group_smt_balance: 10774 /* Those type are not used in the slow wakeup path */ 10775 return NULL; 10776 10777 case group_misfit_task: 10778 /* Select group with the highest max capacity */ 10779 if (local->sgc->max_capacity >= idlest->sgc->max_capacity) 10780 return NULL; 10781 break; 10782 10783 case group_has_spare: 10784 #ifdef CONFIG_NUMA 10785 if (sd->flags & SD_NUMA) { 10786 int imb_numa_nr = sd->imb_numa_nr; 10787 #ifdef CONFIG_NUMA_BALANCING 10788 int idlest_cpu; 10789 /* 10790 * If there is spare capacity at NUMA, try to select 10791 * the preferred node 10792 */ 10793 if (cpu_to_node(this_cpu) == p->numa_preferred_nid) 10794 return NULL; 10795 10796 idlest_cpu = cpumask_first(sched_group_span(idlest)); 10797 if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) 10798 return idlest; 10799 #endif /* CONFIG_NUMA_BALANCING */ 10800 /* 10801 * Otherwise, keep the task close to the wakeup source 10802 * and improve locality if the number of running tasks 10803 * would remain below threshold where an imbalance is 10804 * allowed while accounting for the possibility the 10805 * task is pinned to a subset of CPUs. If there is a 10806 * real need of migration, periodic load balance will 10807 * take care of it. 10808 */ 10809 if (p->nr_cpus_allowed != NR_CPUS) { 10810 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); 10811 10812 cpumask_and(cpus, sched_group_span(local), p->cpus_ptr); 10813 imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr); 10814 } 10815 10816 imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus); 10817 if (!adjust_numa_imbalance(imbalance, 10818 local_sgs.sum_nr_running + 1, 10819 imb_numa_nr)) { 10820 return NULL; 10821 } 10822 } 10823 #endif /* CONFIG_NUMA */ 10824 10825 /* 10826 * Select group with highest number of idle CPUs. We could also 10827 * compare the utilization which is more stable but it can end 10828 * up that the group has less spare capacity but finally more 10829 * idle CPUs which means more opportunity to run task. 10830 */ 10831 if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus) 10832 return NULL; 10833 break; 10834 } 10835 10836 return idlest; 10837 } 10838 10839 static void update_idle_cpu_scan(struct lb_env *env, 10840 unsigned long sum_util) 10841 { 10842 struct sched_domain_shared *sd_share; 10843 int llc_weight, pct; 10844 u64 x, y, tmp; 10845 /* 10846 * Update the number of CPUs to scan in LLC domain, which could 10847 * be used as a hint in select_idle_cpu(). The update of sd_share 10848 * could be expensive because it is within a shared cache line. 10849 * So the write of this hint only occurs during periodic load 10850 * balancing, rather than CPU_NEWLY_IDLE, because the latter 10851 * can fire way more frequently than the former. 10852 */ 10853 if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE) 10854 return; 10855 10856 llc_weight = per_cpu(sd_llc_size, env->dst_cpu); 10857 if (env->sd->span_weight != llc_weight) 10858 return; 10859 10860 sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu)); 10861 if (!sd_share) 10862 return; 10863 10864 /* 10865 * The number of CPUs to search drops as sum_util increases, when 10866 * sum_util hits 85% or above, the scan stops. 
10867 * The reason to choose 85% as the threshold is because this is the 10868 * imbalance_pct(117) when a LLC sched group is overloaded. 10869 * 10870 * let y = SCHED_CAPACITY_SCALE - p * x^2 [1] 10871 * and y'= y / SCHED_CAPACITY_SCALE 10872 * 10873 * x is the ratio of sum_util compared to the CPU capacity: 10874 * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE) 10875 * y' is the ratio of CPUs to be scanned in the LLC domain, 10876 * and the number of CPUs to scan is calculated by: 10877 * 10878 * nr_scan = llc_weight * y' [2] 10879 * 10880 * When x hits the threshold of overloaded, AKA, when 10881 * x = 100 / pct, y drops to 0. According to [1], 10882 * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000 10883 * 10884 * Scale x by SCHED_CAPACITY_SCALE: 10885 * x' = sum_util / llc_weight; [3] 10886 * 10887 * and finally [1] becomes: 10888 * y = SCHED_CAPACITY_SCALE - 10889 * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4] 10890 * 10891 */ 10892 /* equation [3] */ 10893 x = sum_util; 10894 do_div(x, llc_weight); 10895 10896 /* equation [4] */ 10897 pct = env->sd->imbalance_pct; 10898 tmp = x * x * pct * pct; 10899 do_div(tmp, 10000 * SCHED_CAPACITY_SCALE); 10900 tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE); 10901 y = SCHED_CAPACITY_SCALE - tmp; 10902 10903 /* equation [2] */ 10904 y *= llc_weight; 10905 do_div(y, SCHED_CAPACITY_SCALE); 10906 if ((int)y != sd_share->nr_idle_scan) 10907 WRITE_ONCE(sd_share->nr_idle_scan, (int)y); 10908 } 10909 10910 /** 10911 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 10912 * @env: The load balancing environment. 10913 * @sds: variable to hold the statistics for this sched_domain. 10914 */ 10915 10916 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) 10917 { 10918 struct sched_group *sg = env->sd->groups; 10919 struct sg_lb_stats *local = &sds->local_stat; 10920 struct sg_lb_stats tmp_sgs; 10921 unsigned long sum_util = 0; 10922 bool sg_overloaded = 0, sg_overutilized = 0; 10923 10924 do { 10925 struct sg_lb_stats *sgs = &tmp_sgs; 10926 int local_group; 10927 10928 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg)); 10929 if (local_group) { 10930 sds->local = sg; 10931 sgs = local; 10932 10933 if (env->idle != CPU_NEWLY_IDLE || 10934 time_after_eq(jiffies, sg->sgc->next_update)) 10935 update_group_capacity(env->sd, env->dst_cpu); 10936 } 10937 10938 update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized); 10939 10940 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { 10941 sds->busiest = sg; 10942 sds->busiest_stat = *sgs; 10943 } 10944 10945 /* Now, start updating sd_lb_stats */ 10946 sds->total_load += sgs->group_load; 10947 sds->total_capacity += sgs->group_capacity; 10948 10949 sum_util += sgs->group_util; 10950 sg = sg->next; 10951 } while (sg != env->sd->groups); 10952 10953 /* 10954 * Indicate that the child domain of the busiest group prefers tasks 10955 * go to a child's sibling domains first. NB the flags of a sched group 10956 * are those of the child domain. 
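/*
 * Worked example (not kernel code) of equations [2]-[4] above, with
 * SCHED_CAPACITY_SCALE assumed to be 1024.
 */
static int example_nr_idle_scan(unsigned long sum_util, int llc_weight, int pct)
{
	unsigned long x = sum_util / llc_weight;			/* [3] */
	unsigned long long tmp = (unsigned long long)x * x * pct * pct /
				 (10000ULL * 1024);			/* [4] */
	unsigned long y;

	if (tmp > 1024)
		tmp = 1024;
	y = 1024 - (unsigned long)tmp;

	return (int)(y * llc_weight / 1024);				/* [2] */
}

/*
 * E.g. a 16-CPU LLC at 50% utilization (sum_util = 8192) with pct = 117:
 * x = 512, tmp ~ 350, y ~ 674, so about 10 CPUs are scanned; as
 * utilization approaches ~85%, y reaches 0 and the scan stops, matching
 * the comment above.
 */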
10957 */ 10958 if (sds->busiest) 10959 sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING); 10960 10961 10962 if (env->sd->flags & SD_NUMA) 10963 env->fbq_type = fbq_classify_group(&sds->busiest_stat); 10964 10965 if (!env->sd->parent) { 10966 /* update overload indicator if we are at root domain */ 10967 set_rd_overloaded(env->dst_rq->rd, sg_overloaded); 10968 10969 /* Update over-utilization (tipping point, U >= 0) indicator */ 10970 set_rd_overutilized(env->dst_rq->rd, sg_overutilized); 10971 } else if (sg_overutilized) { 10972 set_rd_overutilized(env->dst_rq->rd, sg_overutilized); 10973 } 10974 10975 update_idle_cpu_scan(env, sum_util); 10976 } 10977 10978 /** 10979 * calculate_imbalance - Calculate the amount of imbalance present within the 10980 * groups of a given sched_domain during load balance. 10981 * @env: load balance environment 10982 * @sds: statistics of the sched_domain whose imbalance is to be calculated. 10983 */ 10984 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 10985 { 10986 struct sg_lb_stats *local, *busiest; 10987 10988 local = &sds->local_stat; 10989 busiest = &sds->busiest_stat; 10990 10991 if (busiest->group_type == group_misfit_task) { 10992 if (env->sd->flags & SD_ASYM_CPUCAPACITY) { 10993 /* Set imbalance to allow misfit tasks to be balanced. */ 10994 env->migration_type = migrate_misfit; 10995 env->imbalance = 1; 10996 } else { 10997 /* 10998 * Set load imbalance to allow moving task from cpu 10999 * with reduced capacity. 11000 */ 11001 env->migration_type = migrate_load; 11002 env->imbalance = busiest->group_misfit_task_load; 11003 } 11004 return; 11005 } 11006 11007 if (busiest->group_type == group_asym_packing) { 11008 /* 11009 * In case of asym capacity, we will try to migrate all load to 11010 * the preferred CPU. 11011 */ 11012 env->migration_type = migrate_task; 11013 env->imbalance = busiest->sum_h_nr_running; 11014 return; 11015 } 11016 11017 if (busiest->group_type == group_smt_balance) { 11018 /* Reduce number of tasks sharing CPU capacity */ 11019 env->migration_type = migrate_task; 11020 env->imbalance = 1; 11021 return; 11022 } 11023 11024 if (busiest->group_type == group_imbalanced) { 11025 /* 11026 * In the group_imb case we cannot rely on group-wide averages 11027 * to ensure CPU-load equilibrium, try to move any task to fix 11028 * the imbalance. The next load balance will take care of 11029 * balancing back the system. 11030 */ 11031 env->migration_type = migrate_task; 11032 env->imbalance = 1; 11033 return; 11034 } 11035 11036 /* 11037 * Try to use spare capacity of local group without overloading it or 11038 * emptying busiest. 11039 */ 11040 if (local->group_type == group_has_spare) { 11041 if ((busiest->group_type > group_fully_busy) && 11042 !(env->sd->flags & SD_SHARE_LLC)) { 11043 /* 11044 * If busiest is overloaded, try to fill spare 11045 * capacity. This might end up creating spare capacity 11046 * in busiest or busiest still being overloaded but 11047 * there is no simple way to directly compute the 11048 * amount of load to migrate in order to balance the 11049 * system. 11050 */ 11051 env->migration_type = migrate_util; 11052 env->imbalance = max(local->group_capacity, local->group_util) - 11053 local->group_util; 11054 11055 /* 11056 * In some cases, the group's utilization is max or even 11057 * higher than capacity because of migrations but the 11058 * local CPU is (newly) idle. There is at least one 11059 * waiting task in this overloaded busiest group. 
Let's 11060 * try to pull it. 11061 */ 11062 if (env->idle && env->imbalance == 0) { 11063 env->migration_type = migrate_task; 11064 env->imbalance = 1; 11065 } 11066 11067 return; 11068 } 11069 11070 if (busiest->group_weight == 1 || sds->prefer_sibling) { 11071 /* 11072 * When prefer sibling, evenly spread running tasks on 11073 * groups. 11074 */ 11075 env->migration_type = migrate_task; 11076 env->imbalance = sibling_imbalance(env, sds, busiest, local); 11077 } else { 11078 11079 /* 11080 * If there is no overload, we just want to even the number of 11081 * idle CPUs. 11082 */ 11083 env->migration_type = migrate_task; 11084 env->imbalance = max_t(long, 0, 11085 (local->idle_cpus - busiest->idle_cpus)); 11086 } 11087 11088 #ifdef CONFIG_NUMA 11089 /* Consider allowing a small imbalance between NUMA groups */ 11090 if (env->sd->flags & SD_NUMA) { 11091 env->imbalance = adjust_numa_imbalance(env->imbalance, 11092 local->sum_nr_running + 1, 11093 env->sd->imb_numa_nr); 11094 } 11095 #endif 11096 11097 /* Number of tasks to move to restore balance */ 11098 env->imbalance >>= 1; 11099 11100 return; 11101 } 11102 11103 /* 11104 * Local is fully busy but has to take more load to relieve the 11105 * busiest group 11106 */ 11107 if (local->group_type < group_overloaded) { 11108 /* 11109 * Local will become overloaded so the avg_load metrics are 11110 * finally needed. 11111 */ 11112 11113 local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / 11114 local->group_capacity; 11115 11116 /* 11117 * If the local group is more loaded than the selected 11118 * busiest group don't try to pull any tasks. 11119 */ 11120 if (local->avg_load >= busiest->avg_load) { 11121 env->imbalance = 0; 11122 return; 11123 } 11124 11125 sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / 11126 sds->total_capacity; 11127 11128 /* 11129 * If the local group is more loaded than the average system 11130 * load, don't try to pull any tasks. 11131 */ 11132 if (local->avg_load >= sds->avg_load) { 11133 env->imbalance = 0; 11134 return; 11135 } 11136 11137 } 11138 11139 /* 11140 * Both group are or will become overloaded and we're trying to get all 11141 * the CPUs to the average_load, so we don't want to push ourselves 11142 * above the average load, nor do we wish to reduce the max loaded CPU 11143 * below the average load. At the same time, we also don't want to 11144 * reduce the group load below the group capacity. Thus we look for 11145 * the minimum possible imbalance. 11146 */ 11147 env->migration_type = migrate_load; 11148 env->imbalance = min( 11149 (busiest->avg_load - sds->avg_load) * busiest->group_capacity, 11150 (sds->avg_load - local->avg_load) * local->group_capacity 11151 ) / SCHED_CAPACITY_SCALE; 11152 } 11153 11154 /******* sched_balance_find_src_group() helpers end here *********************/ 11155 11156 /* 11157 * Decision matrix according to the local and busiest group type: 11158 * 11159 * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded 11160 * has_spare nr_idle balanced N/A N/A balanced balanced 11161 * fully_busy nr_idle nr_idle N/A N/A balanced balanced 11162 * misfit_task force N/A N/A N/A N/A N/A 11163 * asym_packing force force N/A N/A force force 11164 * imbalanced force force N/A N/A force force 11165 * overloaded force force N/A N/A force avg_load 11166 * 11167 * N/A : Not Applicable because already filtered while updating 11168 * statistics. 11169 * balanced : The system is balanced for these 2 groups. 
11170 * force : Calculate the imbalance as load migration is probably needed. 11171 * avg_load : Only if imbalance is significant enough. 11172 * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite 11173 * different in groups. 11174 */ 11175 11176 /** 11177 * sched_balance_find_src_group - Returns the busiest group within the sched_domain 11178 * if there is an imbalance. 11179 * @env: The load balancing environment. 11180 * 11181 * Also calculates the amount of runnable load which should be moved 11182 * to restore balance. 11183 * 11184 * Return: - The busiest group if imbalance exists. 11185 */ 11186 static struct sched_group *sched_balance_find_src_group(struct lb_env *env) 11187 { 11188 struct sg_lb_stats *local, *busiest; 11189 struct sd_lb_stats sds; 11190 11191 init_sd_lb_stats(&sds); 11192 11193 /* 11194 * Compute the various statistics relevant for load balancing at 11195 * this level. 11196 */ 11197 update_sd_lb_stats(env, &sds); 11198 11199 /* There is no busy sibling group to pull tasks from */ 11200 if (!sds.busiest) 11201 goto out_balanced; 11202 11203 busiest = &sds.busiest_stat; 11204 11205 /* Misfit tasks should be dealt with regardless of the avg load */ 11206 if (busiest->group_type == group_misfit_task) 11207 goto force_balance; 11208 11209 if (!is_rd_overutilized(env->dst_rq->rd) && 11210 rcu_dereference(env->dst_rq->rd->pd)) 11211 goto out_balanced; 11212 11213 /* ASYM feature bypasses nice load balance check */ 11214 if (busiest->group_type == group_asym_packing) 11215 goto force_balance; 11216 11217 /* 11218 * If the busiest group is imbalanced the below checks don't 11219 * work because they assume all things are equal, which typically 11220 * isn't true due to cpus_ptr constraints and the like. 11221 */ 11222 if (busiest->group_type == group_imbalanced) 11223 goto force_balance; 11224 11225 local = &sds.local_stat; 11226 /* 11227 * If the local group is busier than the selected busiest group 11228 * don't try and pull any tasks. 11229 */ 11230 if (local->group_type > busiest->group_type) 11231 goto out_balanced; 11232 11233 /* 11234 * When groups are overloaded, use the avg_load to ensure fairness 11235 * between tasks. 11236 */ 11237 if (local->group_type == group_overloaded) { 11238 /* 11239 * If the local group is more loaded than the selected 11240 * busiest group don't try to pull any tasks. 11241 */ 11242 if (local->avg_load >= busiest->avg_load) 11243 goto out_balanced; 11244 11245 /* XXX broken for overlapping NUMA groups */ 11246 sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) / 11247 sds.total_capacity; 11248 11249 /* 11250 * Don't pull any tasks if this group is already above the 11251 * domain average load. 11252 */ 11253 if (local->avg_load >= sds.avg_load) 11254 goto out_balanced; 11255 11256 /* 11257 * If the busiest group is more loaded, use imbalance_pct to be 11258 * conservative. 11259 */ 11260 if (100 * busiest->avg_load <= 11261 env->sd->imbalance_pct * local->avg_load) 11262 goto out_balanced; 11263 } 11264 11265 /* 11266 * Try to move all excess tasks to a sibling domain of the busiest 11267 * group's child domain. 
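/*
 * Illustrative sketch (not kernel code) combining the avg_load checks used
 * above for an overloaded local group with the imbalance ultimately chosen
 * by calculate_imbalance() in that case. SCHED_CAPACITY_SCALE is assumed
 * to be 1024 and the final guard is a simplification for the sketch; in
 * practice the busiest group sits above the domain average.
 */
static unsigned long example_overloaded_imbalance(unsigned long busiest_avg,
						  unsigned long busiest_cap,
						  unsigned long local_avg,
						  unsigned long local_cap,
						  unsigned long domain_avg,
						  unsigned int imbalance_pct)
{
	unsigned long pull, push;

	/* Local already busier than busiest or than the domain average. */
	if (local_avg >= busiest_avg || local_avg >= domain_avg)
		return 0;

	/* Busiest must exceed local by more than imbalance_pct. */
	if (100 * busiest_avg <= imbalance_pct * local_avg)
		return 0;

	if (busiest_avg <= domain_avg)
		return 0;

	/*
	 * Move just enough load to pull busiest down toward the domain
	 * average without pushing local above it.
	 */
	pull = (busiest_avg - domain_avg) * busiest_cap;
	push = (domain_avg - local_avg) * local_cap;

	return (pull < push ? pull : push) / 1024;
}

/*
 * E.g. busiest_avg = 1500, local_avg = 800, domain_avg = 1150, both
 * capacities 1024 and imbalance_pct = 117: both sides of the min() are
 * 350 * 1024, so 350 units of load are marked for migration.
 */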
11268 */ 11269 if (sds.prefer_sibling && local->group_type == group_has_spare && 11270 sibling_imbalance(env, &sds, busiest, local) > 1) 11271 goto force_balance; 11272 11273 if (busiest->group_type != group_overloaded) { 11274 if (!env->idle) { 11275 /* 11276 * If the busiest group is not overloaded (and as a 11277 * result the local one too) but this CPU is already 11278 * busy, let another idle CPU try to pull task. 11279 */ 11280 goto out_balanced; 11281 } 11282 11283 if (busiest->group_type == group_smt_balance && 11284 smt_vs_nonsmt_groups(sds.local, sds.busiest)) { 11285 /* Let non SMT CPU pull from SMT CPU sharing with sibling */ 11286 goto force_balance; 11287 } 11288 11289 if (busiest->group_weight > 1 && 11290 local->idle_cpus <= (busiest->idle_cpus + 1)) { 11291 /* 11292 * If the busiest group is not overloaded 11293 * and there is no imbalance between this and busiest 11294 * group wrt idle CPUs, it is balanced. The imbalance 11295 * becomes significant if the diff is greater than 1 11296 * otherwise we might end up to just move the imbalance 11297 * on another group. Of course this applies only if 11298 * there is more than 1 CPU per group. 11299 */ 11300 goto out_balanced; 11301 } 11302 11303 if (busiest->sum_h_nr_running == 1) { 11304 /* 11305 * busiest doesn't have any tasks waiting to run 11306 */ 11307 goto out_balanced; 11308 } 11309 } 11310 11311 force_balance: 11312 /* Looks like there is an imbalance. Compute it */ 11313 calculate_imbalance(env, &sds); 11314 return env->imbalance ? sds.busiest : NULL; 11315 11316 out_balanced: 11317 env->imbalance = 0; 11318 return NULL; 11319 } 11320 11321 /* 11322 * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group. 11323 */ 11324 static struct rq *sched_balance_find_src_rq(struct lb_env *env, 11325 struct sched_group *group) 11326 { 11327 struct rq *busiest = NULL, *rq; 11328 unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; 11329 unsigned int busiest_nr = 0; 11330 int i; 11331 11332 for_each_cpu_and(i, sched_group_span(group), env->cpus) { 11333 unsigned long capacity, load, util; 11334 unsigned int nr_running; 11335 enum fbq_type rt; 11336 11337 rq = cpu_rq(i); 11338 rt = fbq_classify_rq(rq); 11339 11340 /* 11341 * We classify groups/runqueues into three groups: 11342 * - regular: there are !numa tasks 11343 * - remote: there are numa tasks that run on the 'wrong' node 11344 * - all: there is no distinction 11345 * 11346 * In order to avoid migrating ideally placed numa tasks, 11347 * ignore those when there's better options. 11348 * 11349 * If we ignore the actual busiest queue to migrate another 11350 * task, the next balance pass can still reduce the busiest 11351 * queue by moving tasks around inside the node. 11352 * 11353 * If we cannot move enough load due to this classification 11354 * the next pass will adjust the group classification and 11355 * allow migration of more tasks. 11356 * 11357 * Both cases only affect the total convergence complexity. 11358 */ 11359 if (rt > env->fbq_type) 11360 continue; 11361 11362 nr_running = rq->cfs.h_nr_running; 11363 if (!nr_running) 11364 continue; 11365 11366 capacity = capacity_of(i); 11367 11368 /* 11369 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could 11370 * eventually lead to active_balancing high->low capacity. 11371 * Higher per-CPU capacity is considered better than balancing 11372 * average load. 
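/*
 * Illustrative sketch (not kernel code) of the fbq_type filter above. The
 * enum ordering is assumed to mirror fbq_type in this file: regular <
 * remote < all.
 */
enum example_fbq_type { EXAMPLE_REGULAR, EXAMPLE_REMOTE, EXAMPLE_ALL };

static int example_skip_rq(enum example_fbq_type rq_type,
			   enum example_fbq_type group_type)
{
	/*
	 * Skip runqueues that are "better placed" NUMA-wise than the
	 * busiest group as a whole: a "regular" group still has non-NUMA
	 * tasks to offer, so runqueues holding only NUMA tasks are left
	 * alone for now and, at worst, handled by a later balance pass.
	 */
	return rq_type > group_type;
}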
11373 */ 11374 if (env->sd->flags & SD_ASYM_CPUCAPACITY && 11375 !capacity_greater(capacity_of(env->dst_cpu), capacity) && 11376 nr_running == 1) 11377 continue; 11378 11379 /* 11380 * Make sure we only pull tasks from a CPU of lower priority 11381 * when balancing between SMT siblings. 11382 * 11383 * If balancing between cores, let lower priority CPUs help 11384 * SMT cores with more than one busy sibling. 11385 */ 11386 if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1) 11387 continue; 11388 11389 switch (env->migration_type) { 11390 case migrate_load: 11391 /* 11392 * When comparing with load imbalance, use cpu_load() 11393 * which is not scaled with the CPU capacity. 11394 */ 11395 load = cpu_load(rq); 11396 11397 if (nr_running == 1 && load > env->imbalance && 11398 !check_cpu_capacity(rq, env->sd)) 11399 break; 11400 11401 /* 11402 * For the load comparisons with the other CPUs, 11403 * consider the cpu_load() scaled with the CPU 11404 * capacity, so that the load can be moved away 11405 * from the CPU that is potentially running at a 11406 * lower capacity. 11407 * 11408 * Thus we're looking for max(load_i / capacity_i), 11409 * crosswise multiplication to rid ourselves of the 11410 * division works out to: 11411 * load_i * capacity_j > load_j * capacity_i; 11412 * where j is our previous maximum. 11413 */ 11414 if (load * busiest_capacity > busiest_load * capacity) { 11415 busiest_load = load; 11416 busiest_capacity = capacity; 11417 busiest = rq; 11418 } 11419 break; 11420 11421 case migrate_util: 11422 util = cpu_util_cfs_boost(i); 11423 11424 /* 11425 * Don't try to pull utilization from a CPU with one 11426 * running task. Whatever its utilization, we will fail 11427 * detach the task. 11428 */ 11429 if (nr_running <= 1) 11430 continue; 11431 11432 if (busiest_util < util) { 11433 busiest_util = util; 11434 busiest = rq; 11435 } 11436 break; 11437 11438 case migrate_task: 11439 if (busiest_nr < nr_running) { 11440 busiest_nr = nr_running; 11441 busiest = rq; 11442 } 11443 break; 11444 11445 case migrate_misfit: 11446 /* 11447 * For ASYM_CPUCAPACITY domains with misfit tasks we 11448 * simply seek the "biggest" misfit task. 11449 */ 11450 if (rq->misfit_task_load > busiest_load) { 11451 busiest_load = rq->misfit_task_load; 11452 busiest = rq; 11453 } 11454 11455 break; 11456 11457 } 11458 } 11459 11460 return busiest; 11461 } 11462 11463 /* 11464 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but 11465 * so long as it is large enough. 11466 */ 11467 #define MAX_PINNED_INTERVAL 512 11468 11469 static inline bool 11470 asym_active_balance(struct lb_env *env) 11471 { 11472 /* 11473 * ASYM_PACKING needs to force migrate tasks from busy but lower 11474 * priority CPUs in order to pack all tasks in the highest priority 11475 * CPUs. When done between cores, do it only if the whole core if the 11476 * whole core is idle. 11477 * 11478 * If @env::src_cpu is an SMT core with busy siblings, let 11479 * the lower priority @env::dst_cpu help it. Do not follow 11480 * CPU priority. 
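 *
 * In other words, for the expression below: act only when dst_cpu is
 * idle and allowed to use asym priority, and then either dst_cpu is
 * genuinely preferred over src_cpu, or src_cpu sits on an SMT core with
 * busy siblings and therefore cannot claim priority itself.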
11481 */ 11482 return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) && 11483 (sched_asym_prefer(env->dst_cpu, env->src_cpu) || 11484 !sched_use_asym_prio(env->sd, env->src_cpu)); 11485 } 11486 11487 static inline bool 11488 imbalanced_active_balance(struct lb_env *env) 11489 { 11490 struct sched_domain *sd = env->sd; 11491 11492 /* 11493 * The imbalanced case includes the case of pinned tasks preventing a fair 11494 * distribution of the load on the system but also the even distribution of the 11495 * threads on a system with spare capacity 11496 */ 11497 if ((env->migration_type == migrate_task) && 11498 (sd->nr_balance_failed > sd->cache_nice_tries+2)) 11499 return 1; 11500 11501 return 0; 11502 } 11503 11504 static int need_active_balance(struct lb_env *env) 11505 { 11506 struct sched_domain *sd = env->sd; 11507 11508 if (asym_active_balance(env)) 11509 return 1; 11510 11511 if (imbalanced_active_balance(env)) 11512 return 1; 11513 11514 /* 11515 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. 11516 * It's worth migrating the task if the src_cpu's capacity is reduced 11517 * because of other sched_class or IRQs if more capacity stays 11518 * available on dst_cpu. 11519 */ 11520 if (env->idle && 11521 (env->src_rq->cfs.h_nr_running == 1)) { 11522 if ((check_cpu_capacity(env->src_rq, sd)) && 11523 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) 11524 return 1; 11525 } 11526 11527 if (env->migration_type == migrate_misfit) 11528 return 1; 11529 11530 return 0; 11531 } 11532 11533 static int active_load_balance_cpu_stop(void *data); 11534 11535 static int should_we_balance(struct lb_env *env) 11536 { 11537 struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask); 11538 struct sched_group *sg = env->sd->groups; 11539 int cpu, idle_smt = -1; 11540 11541 /* 11542 * Ensure the balancing environment is consistent; can happen 11543 * when the softirq triggers 'during' hotplug. 11544 */ 11545 if (!cpumask_test_cpu(env->dst_cpu, env->cpus)) 11546 return 0; 11547 11548 /* 11549 * In the newly idle case, we will allow all the CPUs 11550 * to do the newly idle load balance. 11551 * 11552 * However, we bail out if we already have tasks or a wakeup pending, 11553 * to optimize wakeup latency. 11554 */ 11555 if (env->idle == CPU_NEWLY_IDLE) { 11556 if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending) 11557 return 0; 11558 return 1; 11559 } 11560 11561 cpumask_copy(swb_cpus, group_balance_mask(sg)); 11562 /* Try to find first idle CPU */ 11563 for_each_cpu_and(cpu, swb_cpus, env->cpus) { 11564 if (!idle_cpu(cpu)) 11565 continue; 11566 11567 /* 11568 * Don't balance to idle SMT in busy core right away when 11569 * balancing cores, but remember the first idle SMT CPU for 11570 * later consideration. Find CPU on an idle core first. 11571 */ 11572 if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) { 11573 if (idle_smt == -1) 11574 idle_smt = cpu; 11575 /* 11576 * If the core is not idle, and first SMT sibling which is 11577 * idle has been found, then its not needed to check other 11578 * SMT siblings for idleness: 11579 */ 11580 #ifdef CONFIG_SCHED_SMT 11581 cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu)); 11582 #endif 11583 continue; 11584 } 11585 11586 /* 11587 * Are we the first idle core in a non-SMT domain or higher, 11588 * or the first idle CPU in a SMT domain? 11589 */ 11590 return cpu == env->dst_cpu; 11591 } 11592 11593 /* Are we the first idle CPU with busy siblings? 
*/ 11594 if (idle_smt != -1) 11595 return idle_smt == env->dst_cpu; 11596 11597 /* Are we the first CPU of this group ? */ 11598 return group_balance_cpu(sg) == env->dst_cpu; 11599 } 11600 11601 /* 11602 * Check this_cpu to ensure it is balanced within domain. Attempt to move 11603 * tasks if there is an imbalance. 11604 */ 11605 static int sched_balance_rq(int this_cpu, struct rq *this_rq, 11606 struct sched_domain *sd, enum cpu_idle_type idle, 11607 int *continue_balancing) 11608 { 11609 int ld_moved, cur_ld_moved, active_balance = 0; 11610 struct sched_domain *sd_parent = sd->parent; 11611 struct sched_group *group; 11612 struct rq *busiest; 11613 struct rq_flags rf; 11614 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); 11615 struct lb_env env = { 11616 .sd = sd, 11617 .dst_cpu = this_cpu, 11618 .dst_rq = this_rq, 11619 .dst_grpmask = group_balance_mask(sd->groups), 11620 .idle = idle, 11621 .loop_break = SCHED_NR_MIGRATE_BREAK, 11622 .cpus = cpus, 11623 .fbq_type = all, 11624 .tasks = LIST_HEAD_INIT(env.tasks), 11625 }; 11626 11627 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); 11628 11629 schedstat_inc(sd->lb_count[idle]); 11630 11631 redo: 11632 if (!should_we_balance(&env)) { 11633 *continue_balancing = 0; 11634 goto out_balanced; 11635 } 11636 11637 group = sched_balance_find_src_group(&env); 11638 if (!group) { 11639 schedstat_inc(sd->lb_nobusyg[idle]); 11640 goto out_balanced; 11641 } 11642 11643 busiest = sched_balance_find_src_rq(&env, group); 11644 if (!busiest) { 11645 schedstat_inc(sd->lb_nobusyq[idle]); 11646 goto out_balanced; 11647 } 11648 11649 WARN_ON_ONCE(busiest == env.dst_rq); 11650 11651 schedstat_add(sd->lb_imbalance[idle], env.imbalance); 11652 11653 env.src_cpu = busiest->cpu; 11654 env.src_rq = busiest; 11655 11656 ld_moved = 0; 11657 /* Clear this flag as soon as we find a pullable task */ 11658 env.flags |= LBF_ALL_PINNED; 11659 if (busiest->nr_running > 1) { 11660 /* 11661 * Attempt to move tasks. If sched_balance_find_src_group has found 11662 * an imbalance but busiest->nr_running <= 1, the group is 11663 * still unbalanced. ld_moved simply stays zero, so it is 11664 * correctly treated as an imbalance. 11665 */ 11666 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 11667 11668 more_balance: 11669 rq_lock_irqsave(busiest, &rf); 11670 update_rq_clock(busiest); 11671 11672 /* 11673 * cur_ld_moved - load moved in current iteration 11674 * ld_moved - cumulative load moved across iterations 11675 */ 11676 cur_ld_moved = detach_tasks(&env); 11677 11678 /* 11679 * We've detached some tasks from busiest_rq. Every 11680 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely 11681 * unlock busiest->lock, and we are able to be sure 11682 * that nobody can manipulate the tasks in parallel. 11683 * See task_rq_lock() family for the details. 11684 */ 11685 11686 rq_unlock(busiest, &rf); 11687 11688 if (cur_ld_moved) { 11689 attach_tasks(&env); 11690 ld_moved += cur_ld_moved; 11691 } 11692 11693 local_irq_restore(rf.flags); 11694 11695 if (env.flags & LBF_NEED_BREAK) { 11696 env.flags &= ~LBF_NEED_BREAK; 11697 goto more_balance; 11698 } 11699 11700 /* 11701 * Revisit (affine) tasks on src_cpu that couldn't be moved to 11702 * us and move them to an alternate dst_cpu in our sched_group 11703 * where they can run. The upper limit on how many times we 11704 * iterate on same src_cpu is dependent on number of CPUs in our 11705 * sched_group. 
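 *
 * Roughly: when a task on src_cpu is pinned away from dst_cpu but can
 * run on another CPU of our group, detach_tasks() sets LBF_DST_PINNED
 * and records that CPU in new_dst_cpu; the retry below then repeats the
 * pull on behalf of that alternate CPU.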
11706 * 11707 * This changes load balance semantics a bit on who can move 11708 * load to a given_cpu. In addition to the given_cpu itself 11709 * (or a ilb_cpu acting on its behalf where given_cpu is 11710 * nohz-idle), we now have balance_cpu in a position to move 11711 * load to given_cpu. In rare situations, this may cause 11712 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding 11713 * _independently_ and at _same_ time to move some load to 11714 * given_cpu) causing excess load to be moved to given_cpu. 11715 * This however should not happen so much in practice and 11716 * moreover subsequent load balance cycles should correct the 11717 * excess load moved. 11718 */ 11719 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { 11720 11721 /* Prevent to re-select dst_cpu via env's CPUs */ 11722 __cpumask_clear_cpu(env.dst_cpu, env.cpus); 11723 11724 env.dst_rq = cpu_rq(env.new_dst_cpu); 11725 env.dst_cpu = env.new_dst_cpu; 11726 env.flags &= ~LBF_DST_PINNED; 11727 env.loop = 0; 11728 env.loop_break = SCHED_NR_MIGRATE_BREAK; 11729 11730 /* 11731 * Go back to "more_balance" rather than "redo" since we 11732 * need to continue with same src_cpu. 11733 */ 11734 goto more_balance; 11735 } 11736 11737 /* 11738 * We failed to reach balance because of affinity. 11739 */ 11740 if (sd_parent) { 11741 int *group_imbalance = &sd_parent->groups->sgc->imbalance; 11742 11743 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) 11744 *group_imbalance = 1; 11745 } 11746 11747 /* All tasks on this runqueue were pinned by CPU affinity */ 11748 if (unlikely(env.flags & LBF_ALL_PINNED)) { 11749 __cpumask_clear_cpu(cpu_of(busiest), cpus); 11750 /* 11751 * Attempting to continue load balancing at the current 11752 * sched_domain level only makes sense if there are 11753 * active CPUs remaining as possible busiest CPUs to 11754 * pull load from which are not contained within the 11755 * destination group that is receiving any migrated 11756 * load. 11757 */ 11758 if (!cpumask_subset(cpus, env.dst_grpmask)) { 11759 env.loop = 0; 11760 env.loop_break = SCHED_NR_MIGRATE_BREAK; 11761 goto redo; 11762 } 11763 goto out_all_pinned; 11764 } 11765 } 11766 11767 if (!ld_moved) { 11768 schedstat_inc(sd->lb_failed[idle]); 11769 /* 11770 * Increment the failure counter only on periodic balance. 11771 * We do not want newidle balance, which can be very 11772 * frequent, pollute the failure counter causing 11773 * excessive cache_hot migrations and active balances. 11774 * 11775 * Similarly for migration_misfit which is not related to 11776 * load/util migration, don't pollute nr_balance_failed. 11777 */ 11778 if (idle != CPU_NEWLY_IDLE && 11779 env.migration_type != migrate_misfit) 11780 sd->nr_balance_failed++; 11781 11782 if (need_active_balance(&env)) { 11783 unsigned long flags; 11784 11785 raw_spin_rq_lock_irqsave(busiest, flags); 11786 11787 /* 11788 * Don't kick the active_load_balance_cpu_stop, 11789 * if the curr task on busiest CPU can't be 11790 * moved to this_cpu: 11791 */ 11792 if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { 11793 raw_spin_rq_unlock_irqrestore(busiest, flags); 11794 goto out_one_pinned; 11795 } 11796 11797 /* Record that we found at least one task that could run on this_cpu */ 11798 env.flags &= ~LBF_ALL_PINNED; 11799 11800 /* 11801 * ->active_balance synchronizes accesses to 11802 * ->active_balance_work. Once set, it's cleared 11803 * only after active load balance is finished. 
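 *
 * It is cleared again by active_load_balance_cpu_stop(), under the
 * busiest rq lock, once the push attempt has completed.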
11804 */ 11805 if (!busiest->active_balance) { 11806 busiest->active_balance = 1; 11807 busiest->push_cpu = this_cpu; 11808 active_balance = 1; 11809 } 11810 11811 preempt_disable(); 11812 raw_spin_rq_unlock_irqrestore(busiest, flags); 11813 if (active_balance) { 11814 stop_one_cpu_nowait(cpu_of(busiest), 11815 active_load_balance_cpu_stop, busiest, 11816 &busiest->active_balance_work); 11817 } 11818 preempt_enable(); 11819 } 11820 } else { 11821 sd->nr_balance_failed = 0; 11822 } 11823 11824 if (likely(!active_balance) || need_active_balance(&env)) { 11825 /* We were unbalanced, so reset the balancing interval */ 11826 sd->balance_interval = sd->min_interval; 11827 } 11828 11829 goto out; 11830 11831 out_balanced: 11832 /* 11833 * We reach balance although we may have faced some affinity 11834 * constraints. Clear the imbalance flag only if other tasks got 11835 * a chance to move and fix the imbalance. 11836 */ 11837 if (sd_parent && !(env.flags & LBF_ALL_PINNED)) { 11838 int *group_imbalance = &sd_parent->groups->sgc->imbalance; 11839 11840 if (*group_imbalance) 11841 *group_imbalance = 0; 11842 } 11843 11844 out_all_pinned: 11845 /* 11846 * We reach balance because all tasks are pinned at this level so 11847 * we can't migrate them. Let the imbalance flag set so parent level 11848 * can try to migrate them. 11849 */ 11850 schedstat_inc(sd->lb_balanced[idle]); 11851 11852 sd->nr_balance_failed = 0; 11853 11854 out_one_pinned: 11855 ld_moved = 0; 11856 11857 /* 11858 * sched_balance_newidle() disregards balance intervals, so we could 11859 * repeatedly reach this code, which would lead to balance_interval 11860 * skyrocketing in a short amount of time. Skip the balance_interval 11861 * increase logic to avoid that. 11862 * 11863 * Similarly misfit migration which is not necessarily an indication of 11864 * the system being busy and requires lb to backoff to let it settle 11865 * down. 11866 */ 11867 if (env.idle == CPU_NEWLY_IDLE || 11868 env.migration_type == migrate_misfit) 11869 goto out; 11870 11871 /* tune up the balancing interval */ 11872 if ((env.flags & LBF_ALL_PINNED && 11873 sd->balance_interval < MAX_PINNED_INTERVAL) || 11874 sd->balance_interval < sd->max_interval) 11875 sd->balance_interval *= 2; 11876 out: 11877 return ld_moved; 11878 } 11879 11880 static inline unsigned long 11881 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) 11882 { 11883 unsigned long interval = sd->balance_interval; 11884 11885 if (cpu_busy) 11886 interval *= sd->busy_factor; 11887 11888 /* scale ms to jiffies */ 11889 interval = msecs_to_jiffies(interval); 11890 11891 /* 11892 * Reduce likelihood of busy balancing at higher domains racing with 11893 * balancing at lower domains by preventing their balancing periods 11894 * from being multiples of each other. 11895 */ 11896 if (cpu_busy) 11897 interval -= 1; 11898 11899 interval = clamp(interval, 1UL, max_load_balance_interval); 11900 11901 return interval; 11902 } 11903 11904 static inline void 11905 update_next_balance(struct sched_domain *sd, unsigned long *next_balance) 11906 { 11907 unsigned long interval, next; 11908 11909 /* used by idle balance, so cpu_busy = 0 */ 11910 interval = get_sd_balance_interval(sd, 0); 11911 next = sd->last_balance + interval; 11912 11913 if (time_after(*next_balance, next)) 11914 *next_balance = next; 11915 } 11916 11917 /* 11918 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes 11919 * running tasks off the busiest CPU onto idle CPUs. 
It requires at 11920 * least 1 task to be running on each physical CPU where possible, and 11921 * avoids physical / logical imbalances. 11922 */ 11923 static int active_load_balance_cpu_stop(void *data) 11924 { 11925 struct rq *busiest_rq = data; 11926 int busiest_cpu = cpu_of(busiest_rq); 11927 int target_cpu = busiest_rq->push_cpu; 11928 struct rq *target_rq = cpu_rq(target_cpu); 11929 struct sched_domain *sd; 11930 struct task_struct *p = NULL; 11931 struct rq_flags rf; 11932 11933 rq_lock_irq(busiest_rq, &rf); 11934 /* 11935 * Between queueing the stop-work and running it is a hole in which 11936 * CPUs can become inactive. We should not move tasks from or to 11937 * inactive CPUs. 11938 */ 11939 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) 11940 goto out_unlock; 11941 11942 /* Make sure the requested CPU hasn't gone down in the meantime: */ 11943 if (unlikely(busiest_cpu != smp_processor_id() || 11944 !busiest_rq->active_balance)) 11945 goto out_unlock; 11946 11947 /* Is there any task to move? */ 11948 if (busiest_rq->nr_running <= 1) 11949 goto out_unlock; 11950 11951 /* 11952 * This condition is "impossible", if it occurs 11953 * we need to fix it. Originally reported by 11954 * Bjorn Helgaas on a 128-CPU setup. 11955 */ 11956 WARN_ON_ONCE(busiest_rq == target_rq); 11957 11958 /* Search for an sd spanning us and the target CPU. */ 11959 rcu_read_lock(); 11960 for_each_domain(target_cpu, sd) { 11961 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) 11962 break; 11963 } 11964 11965 if (likely(sd)) { 11966 struct lb_env env = { 11967 .sd = sd, 11968 .dst_cpu = target_cpu, 11969 .dst_rq = target_rq, 11970 .src_cpu = busiest_rq->cpu, 11971 .src_rq = busiest_rq, 11972 .idle = CPU_IDLE, 11973 .flags = LBF_ACTIVE_LB, 11974 }; 11975 11976 schedstat_inc(sd->alb_count); 11977 update_rq_clock(busiest_rq); 11978 11979 p = detach_one_task(&env); 11980 if (p) { 11981 schedstat_inc(sd->alb_pushed); 11982 /* Active balancing done, reset the failure counter. */ 11983 sd->nr_balance_failed = 0; 11984 } else { 11985 schedstat_inc(sd->alb_failed); 11986 } 11987 } 11988 rcu_read_unlock(); 11989 out_unlock: 11990 busiest_rq->active_balance = 0; 11991 rq_unlock(busiest_rq, &rf); 11992 11993 if (p) 11994 attach_one_task(target_rq, p); 11995 11996 local_irq_enable(); 11997 11998 return 0; 11999 } 12000 12001 /* 12002 * This flag serializes load-balancing passes over large domains 12003 * (above the NODE topology level) - only one load-balancing instance 12004 * may run at a time, to reduce overhead on very large systems with 12005 * lots of CPUs and large NUMA distances. 12006 * 12007 * - Note that load-balancing passes triggered while another one 12008 * is executing are skipped and not re-tried. 12009 * 12010 * - Also note that this does not serialize rebalance_domains() 12011 * execution, as non-SD_SERIALIZE domains will still be 12012 * load-balanced in parallel. 12013 */ 12014 static atomic_t sched_balance_running = ATOMIC_INIT(0); 12015 12016 /* 12017 * Scale the max sched_balance_rq interval with the number of CPUs in the system. 12018 * This trades load-balance latency on larger machines for less cross talk. 12019 */ 12020 void update_max_interval(void) 12021 { 12022 max_load_balance_interval = HZ*num_online_cpus()/10; 12023 } 12024 12025 static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) 12026 { 12027 if (cost > sd->max_newidle_lb_cost) { 12028 /* 12029 * Track max cost of a domain to make sure to not delay the 12030 * next wakeup on the CPU. 
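 *
 * sched_balance_newidle() compares this value (and the accumulated
 * per-domain costs) against rq->avg_idle before doing any work, so a
 * stale, overly large maximum would needlessly suppress newidle
 * balancing.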
12031 */ 12032 sd->max_newidle_lb_cost = cost; 12033 sd->last_decay_max_lb_cost = jiffies; 12034 } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { 12035 /* 12036 * Decay the newidle max times by ~1% per second to ensure that 12037 * it is not outdated and the current max cost is actually 12038 * shorter. 12039 */ 12040 sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; 12041 sd->last_decay_max_lb_cost = jiffies; 12042 12043 return true; 12044 } 12045 12046 return false; 12047 } 12048 12049 /* 12050 * It checks each scheduling domain to see if it is due to be balanced, 12051 * and initiates a balancing operation if so. 12052 * 12053 * Balancing parameters are set up in init_sched_domains. 12054 */ 12055 static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) 12056 { 12057 int continue_balancing = 1; 12058 int cpu = rq->cpu; 12059 int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); 12060 unsigned long interval; 12061 struct sched_domain *sd; 12062 /* Earliest time when we have to do rebalance again */ 12063 unsigned long next_balance = jiffies + 60*HZ; 12064 int update_next_balance = 0; 12065 int need_serialize, need_decay = 0; 12066 u64 max_cost = 0; 12067 12068 rcu_read_lock(); 12069 for_each_domain(cpu, sd) { 12070 /* 12071 * Decay the newidle max times here because this is a regular 12072 * visit to all the domains. 12073 */ 12074 need_decay = update_newidle_cost(sd, 0); 12075 max_cost += sd->max_newidle_lb_cost; 12076 12077 /* 12078 * Stop the load balance at this level. There is another 12079 * CPU in our sched group which is doing load balancing more 12080 * actively. 12081 */ 12082 if (!continue_balancing) { 12083 if (need_decay) 12084 continue; 12085 break; 12086 } 12087 12088 interval = get_sd_balance_interval(sd, busy); 12089 12090 need_serialize = sd->flags & SD_SERIALIZE; 12091 if (need_serialize) { 12092 if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) 12093 goto out; 12094 } 12095 12096 if (time_after_eq(jiffies, sd->last_balance + interval)) { 12097 if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { 12098 /* 12099 * The LBF_DST_PINNED logic could have changed 12100 * env->dst_cpu, so we can't know our idle 12101 * state even if we migrated tasks. Update it. 12102 */ 12103 idle = idle_cpu(cpu); 12104 busy = !idle && !sched_idle_cpu(cpu); 12105 } 12106 sd->last_balance = jiffies; 12107 interval = get_sd_balance_interval(sd, busy); 12108 } 12109 if (need_serialize) 12110 atomic_set_release(&sched_balance_running, 0); 12111 out: 12112 if (time_after(next_balance, sd->last_balance + interval)) { 12113 next_balance = sd->last_balance + interval; 12114 update_next_balance = 1; 12115 } 12116 } 12117 if (need_decay) { 12118 /* 12119 * Ensure the rq-wide value also decays but keep it at a 12120 * reasonable floor to avoid funnies with rq->avg_idle. 12121 */ 12122 rq->max_idle_balance_cost = 12123 max((u64)sysctl_sched_migration_cost, max_cost); 12124 } 12125 rcu_read_unlock(); 12126 12127 /* 12128 * next_balance will be updated only when there is a need. 12129 * When the cpu is attached to null domain for ex, it will not be 12130 * updated. 
12131 */ 12132 if (likely(update_next_balance)) 12133 rq->next_balance = next_balance; 12134 12135 } 12136 12137 static inline int on_null_domain(struct rq *rq) 12138 { 12139 return unlikely(!rcu_dereference_sched(rq->sd)); 12140 } 12141 12142 #ifdef CONFIG_NO_HZ_COMMON 12143 /* 12144 * NOHZ idle load balancing (ILB) details: 12145 * 12146 * - When one of the busy CPUs notices that there may be an idle rebalancing 12147 * needed, they will kick the idle load balancer, which then does idle 12148 * load balancing for all the idle CPUs. 12149 * 12150 * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set 12151 * anywhere yet. 12152 */ 12153 static inline int find_new_ilb(void) 12154 { 12155 const struct cpumask *hk_mask; 12156 int ilb_cpu; 12157 12158 hk_mask = housekeeping_cpumask(HK_TYPE_MISC); 12159 12160 for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { 12161 12162 if (ilb_cpu == smp_processor_id()) 12163 continue; 12164 12165 if (idle_cpu(ilb_cpu)) 12166 return ilb_cpu; 12167 } 12168 12169 return -1; 12170 } 12171 12172 /* 12173 * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU 12174 * SMP function call (IPI). 12175 * 12176 * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). 12177 */ 12178 static void kick_ilb(unsigned int flags) 12179 { 12180 int ilb_cpu; 12181 12182 /* 12183 * Increase nohz.next_balance only when if full ilb is triggered but 12184 * not if we only update stats. 12185 */ 12186 if (flags & NOHZ_BALANCE_KICK) 12187 nohz.next_balance = jiffies+1; 12188 12189 ilb_cpu = find_new_ilb(); 12190 if (ilb_cpu < 0) 12191 return; 12192 12193 /* 12194 * Don't bother if no new NOHZ balance work items for ilb_cpu, 12195 * i.e. all bits in flags are already set in ilb_cpu. 12196 */ 12197 if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags) 12198 return; 12199 12200 /* 12201 * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets 12202 * the first flag owns it; cleared by nohz_csd_func(). 12203 */ 12204 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); 12205 if (flags & NOHZ_KICK_MASK) 12206 return; 12207 12208 /* 12209 * This way we generate an IPI on the target CPU which 12210 * is idle, and the softirq performing NOHZ idle load balancing 12211 * will be run before returning from the IPI. 12212 */ 12213 smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); 12214 } 12215 12216 /* 12217 * Current decision point for kicking the idle load balancer in the presence 12218 * of idle CPUs in the system. 12219 */ 12220 static void nohz_balancer_kick(struct rq *rq) 12221 { 12222 unsigned long now = jiffies; 12223 struct sched_domain_shared *sds; 12224 struct sched_domain *sd; 12225 int nr_busy, i, cpu = rq->cpu; 12226 unsigned int flags = 0; 12227 12228 if (unlikely(rq->idle_balance)) 12229 return; 12230 12231 /* 12232 * We may be recently in ticked or tickless idle mode. At the first 12233 * busy tick after returning from idle, we will update the busy stats. 
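 *
 * nohz_balance_exit_idle() below removes this CPU from
 * nohz.idle_cpus_mask and marks its LLC busy again via nr_busy_cpus.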
12234 */ 12235 nohz_balance_exit_idle(rq); 12236 12237 /* 12238 * None are in tickless mode and hence no need for NOHZ idle load 12239 * balancing: 12240 */ 12241 if (likely(!atomic_read(&nohz.nr_cpus))) 12242 return; 12243 12244 if (READ_ONCE(nohz.has_blocked) && 12245 time_after(now, READ_ONCE(nohz.next_blocked))) 12246 flags = NOHZ_STATS_KICK; 12247 12248 if (time_before(now, nohz.next_balance)) 12249 goto out; 12250 12251 if (rq->nr_running >= 2) { 12252 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 12253 goto out; 12254 } 12255 12256 rcu_read_lock(); 12257 12258 sd = rcu_dereference(rq->sd); 12259 if (sd) { 12260 /* 12261 * If there's a runnable CFS task and the current CPU has reduced 12262 * capacity, kick the ILB to see if there's a better CPU to run on: 12263 */ 12264 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { 12265 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 12266 goto unlock; 12267 } 12268 } 12269 12270 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); 12271 if (sd) { 12272 /* 12273 * When ASYM_PACKING; see if there's a more preferred CPU 12274 * currently idle; in which case, kick the ILB to move tasks 12275 * around. 12276 * 12277 * When balancing between cores, all the SMT siblings of the 12278 * preferred CPU must be idle. 12279 */ 12280 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { 12281 if (sched_asym(sd, i, cpu)) { 12282 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 12283 goto unlock; 12284 } 12285 } 12286 } 12287 12288 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); 12289 if (sd) { 12290 /* 12291 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU 12292 * to run the misfit task on. 12293 */ 12294 if (check_misfit_status(rq)) { 12295 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 12296 goto unlock; 12297 } 12298 12299 /* 12300 * For asymmetric systems, we do not want to nicely balance 12301 * cache use, instead we want to embrace asymmetry and only 12302 * ensure tasks have enough CPU capacity. 12303 * 12304 * Skip the LLC logic because it's not relevant in that case. 12305 */ 12306 goto unlock; 12307 } 12308 12309 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 12310 if (sds) { 12311 /* 12312 * If there is an imbalance between LLC domains (IOW we could 12313 * increase the overall cache utilization), we need a less-loaded LLC 12314 * domain to pull some load from. Likewise, we may need to spread 12315 * load within the current LLC domain (e.g. packed SMT cores but 12316 * other CPUs are idle). We can't really know from here how busy 12317 * the others are - so just get a NOHZ balance going if it looks 12318 * like this LLC domain has tasks we could move. 
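 *
 * For example, two busy SMT siblings in this LLC while another LLC
 * sits idle gives nr_busy_cpus == 2, which is enough to kick the ILB
 * below.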
12319 */ 12320 nr_busy = atomic_read(&sds->nr_busy_cpus); 12321 if (nr_busy > 1) { 12322 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 12323 goto unlock; 12324 } 12325 } 12326 unlock: 12327 rcu_read_unlock(); 12328 out: 12329 if (READ_ONCE(nohz.needs_update)) 12330 flags |= NOHZ_NEXT_KICK; 12331 12332 if (flags) 12333 kick_ilb(flags); 12334 } 12335 12336 static void set_cpu_sd_state_busy(int cpu) 12337 { 12338 struct sched_domain *sd; 12339 12340 rcu_read_lock(); 12341 sd = rcu_dereference(per_cpu(sd_llc, cpu)); 12342 12343 if (!sd || !sd->nohz_idle) 12344 goto unlock; 12345 sd->nohz_idle = 0; 12346 12347 atomic_inc(&sd->shared->nr_busy_cpus); 12348 unlock: 12349 rcu_read_unlock(); 12350 } 12351 12352 void nohz_balance_exit_idle(struct rq *rq) 12353 { 12354 SCHED_WARN_ON(rq != this_rq()); 12355 12356 if (likely(!rq->nohz_tick_stopped)) 12357 return; 12358 12359 rq->nohz_tick_stopped = 0; 12360 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask); 12361 atomic_dec(&nohz.nr_cpus); 12362 12363 set_cpu_sd_state_busy(rq->cpu); 12364 } 12365 12366 static void set_cpu_sd_state_idle(int cpu) 12367 { 12368 struct sched_domain *sd; 12369 12370 rcu_read_lock(); 12371 sd = rcu_dereference(per_cpu(sd_llc, cpu)); 12372 12373 if (!sd || sd->nohz_idle) 12374 goto unlock; 12375 sd->nohz_idle = 1; 12376 12377 atomic_dec(&sd->shared->nr_busy_cpus); 12378 unlock: 12379 rcu_read_unlock(); 12380 } 12381 12382 /* 12383 * This routine will record that the CPU is going idle with tick stopped. 12384 * This info will be used in performing idle load balancing in the future. 12385 */ 12386 void nohz_balance_enter_idle(int cpu) 12387 { 12388 struct rq *rq = cpu_rq(cpu); 12389 12390 SCHED_WARN_ON(cpu != smp_processor_id()); 12391 12392 /* If this CPU is going down, then nothing needs to be done: */ 12393 if (!cpu_active(cpu)) 12394 return; 12395 12396 /* Spare idle load balancing on CPUs that don't want to be disturbed: */ 12397 if (!housekeeping_cpu(cpu, HK_TYPE_SCHED)) 12398 return; 12399 12400 /* 12401 * Can be set safely without rq->lock held 12402 * If a clear happens, it will have evaluated last additions because 12403 * rq->lock is held during the check and the clear 12404 */ 12405 rq->has_blocked_load = 1; 12406 12407 /* 12408 * The tick is still stopped but load could have been added in the 12409 * meantime. We set the nohz.has_blocked flag to trig a check of the 12410 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear 12411 * of nohz.has_blocked can only happen after checking the new load 12412 */ 12413 if (rq->nohz_tick_stopped) 12414 goto out; 12415 12416 /* If we're a completely isolated CPU, we don't play: */ 12417 if (on_null_domain(rq)) 12418 return; 12419 12420 rq->nohz_tick_stopped = 1; 12421 12422 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 12423 atomic_inc(&nohz.nr_cpus); 12424 12425 /* 12426 * Ensures that if nohz_idle_balance() fails to observe our 12427 * @idle_cpus_mask store, it must observe the @has_blocked 12428 * and @needs_update stores. 
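 *
 * This is intended to pair with the smp_mb() in _nohz_idle_balance()
 * issued before it re-reads nohz.idle_cpus_mask.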
12429 */ 12430 smp_mb__after_atomic(); 12431 12432 set_cpu_sd_state_idle(cpu); 12433 12434 WRITE_ONCE(nohz.needs_update, 1); 12435 out: 12436 /* 12437 * Each time a cpu enter idle, we assume that it has blocked load and 12438 * enable the periodic update of the load of idle CPUs 12439 */ 12440 WRITE_ONCE(nohz.has_blocked, 1); 12441 } 12442 12443 static bool update_nohz_stats(struct rq *rq) 12444 { 12445 unsigned int cpu = rq->cpu; 12446 12447 if (!rq->has_blocked_load) 12448 return false; 12449 12450 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) 12451 return false; 12452 12453 if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) 12454 return true; 12455 12456 sched_balance_update_blocked_averages(cpu); 12457 12458 return rq->has_blocked_load; 12459 } 12460 12461 /* 12462 * Internal function that runs load balance for all idle CPUs. The load balance 12463 * can be a simple update of blocked load or a complete load balance with 12464 * tasks movement depending of flags. 12465 */ 12466 static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) 12467 { 12468 /* Earliest time when we have to do rebalance again */ 12469 unsigned long now = jiffies; 12470 unsigned long next_balance = now + 60*HZ; 12471 bool has_blocked_load = false; 12472 int update_next_balance = 0; 12473 int this_cpu = this_rq->cpu; 12474 int balance_cpu; 12475 struct rq *rq; 12476 12477 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); 12478 12479 /* 12480 * We assume there will be no idle load after this update and clear 12481 * the has_blocked flag. If a cpu enters idle in the mean time, it will 12482 * set the has_blocked flag and trigger another update of idle load. 12483 * Because a cpu that becomes idle, is added to idle_cpus_mask before 12484 * setting the flag, we are sure to not clear the state and not 12485 * check the load of an idle cpu. 12486 * 12487 * Same applies to idle_cpus_mask vs needs_update. 12488 */ 12489 if (flags & NOHZ_STATS_KICK) 12490 WRITE_ONCE(nohz.has_blocked, 0); 12491 if (flags & NOHZ_NEXT_KICK) 12492 WRITE_ONCE(nohz.needs_update, 0); 12493 12494 /* 12495 * Ensures that if we miss the CPU, we must see the has_blocked 12496 * store from nohz_balance_enter_idle(). 12497 */ 12498 smp_mb(); 12499 12500 /* 12501 * Start with the next CPU after this_cpu so we will end with this_cpu and let a 12502 * chance for other idle cpu to pull load. 12503 */ 12504 for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) { 12505 if (!idle_cpu(balance_cpu)) 12506 continue; 12507 12508 /* 12509 * If this CPU gets work to do, stop the load balancing 12510 * work being done for other CPUs. Next load 12511 * balancing owner will pick it up. 12512 */ 12513 if (!idle_cpu(this_cpu) && need_resched()) { 12514 if (flags & NOHZ_STATS_KICK) 12515 has_blocked_load = true; 12516 if (flags & NOHZ_NEXT_KICK) 12517 WRITE_ONCE(nohz.needs_update, 1); 12518 goto abort; 12519 } 12520 12521 rq = cpu_rq(balance_cpu); 12522 12523 if (flags & NOHZ_STATS_KICK) 12524 has_blocked_load |= update_nohz_stats(rq); 12525 12526 /* 12527 * If time for next balance is due, 12528 * do the balance. 
12529 */ 12530 if (time_after_eq(jiffies, rq->next_balance)) { 12531 struct rq_flags rf; 12532 12533 rq_lock_irqsave(rq, &rf); 12534 update_rq_clock(rq); 12535 rq_unlock_irqrestore(rq, &rf); 12536 12537 if (flags & NOHZ_BALANCE_KICK) 12538 sched_balance_domains(rq, CPU_IDLE); 12539 } 12540 12541 if (time_after(next_balance, rq->next_balance)) { 12542 next_balance = rq->next_balance; 12543 update_next_balance = 1; 12544 } 12545 } 12546 12547 /* 12548 * next_balance will be updated only when there is a need. 12549 * When the CPU is attached to null domain for ex, it will not be 12550 * updated. 12551 */ 12552 if (likely(update_next_balance)) 12553 nohz.next_balance = next_balance; 12554 12555 if (flags & NOHZ_STATS_KICK) 12556 WRITE_ONCE(nohz.next_blocked, 12557 now + msecs_to_jiffies(LOAD_AVG_PERIOD)); 12558 12559 abort: 12560 /* There is still blocked load, enable periodic update */ 12561 if (has_blocked_load) 12562 WRITE_ONCE(nohz.has_blocked, 1); 12563 } 12564 12565 /* 12566 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 12567 * rebalancing for all the CPUs for whom scheduler ticks are stopped. 12568 */ 12569 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) 12570 { 12571 unsigned int flags = this_rq->nohz_idle_balance; 12572 12573 if (!flags) 12574 return false; 12575 12576 this_rq->nohz_idle_balance = 0; 12577 12578 if (idle != CPU_IDLE) 12579 return false; 12580 12581 _nohz_idle_balance(this_rq, flags); 12582 12583 return true; 12584 } 12585 12586 /* 12587 * Check if we need to directly run the ILB for updating blocked load before 12588 * entering idle state. Here we run ILB directly without issuing IPIs. 12589 * 12590 * Note that when this function is called, the tick may not yet be stopped on 12591 * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and 12592 * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates 12593 * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle 12594 * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is 12595 * called from this function on (this) CPU that's not yet in the mask. That's 12596 * OK because the goal of nohz_run_idle_balance() is to run ILB only for 12597 * updating the blocked load of already idle CPUs without waking up one of 12598 * those idle CPUs and outside the preempt disable / IRQ off phase of the local 12599 * cpu about to enter idle, because it can take a long time. 12600 */ 12601 void nohz_run_idle_balance(int cpu) 12602 { 12603 unsigned int flags; 12604 12605 flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu)); 12606 12607 /* 12608 * Update the blocked load only if no SCHED_SOFTIRQ is about to happen 12609 * (i.e. NOHZ_STATS_KICK set) and will do the same. 12610 */ 12611 if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) 12612 _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); 12613 } 12614 12615 static void nohz_newidle_balance(struct rq *this_rq) 12616 { 12617 int this_cpu = this_rq->cpu; 12618 12619 /* 12620 * This CPU doesn't want to be disturbed by scheduler 12621 * housekeeping 12622 */ 12623 if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED)) 12624 return; 12625 12626 /* Will wake up very soon. 
No time for doing anything else*/ 12627 if (this_rq->avg_idle < sysctl_sched_migration_cost) 12628 return; 12629 12630 /* Don't need to update blocked load of idle CPUs*/ 12631 if (!READ_ONCE(nohz.has_blocked) || 12632 time_before(jiffies, READ_ONCE(nohz.next_blocked))) 12633 return; 12634 12635 /* 12636 * Set the need to trigger ILB in order to update blocked load 12637 * before entering idle state. 12638 */ 12639 atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu)); 12640 } 12641 12642 #else /* !CONFIG_NO_HZ_COMMON */ 12643 static inline void nohz_balancer_kick(struct rq *rq) { } 12644 12645 static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) 12646 { 12647 return false; 12648 } 12649 12650 static inline void nohz_newidle_balance(struct rq *this_rq) { } 12651 #endif /* CONFIG_NO_HZ_COMMON */ 12652 12653 /* 12654 * sched_balance_newidle is called by schedule() if this_cpu is about to become 12655 * idle. Attempts to pull tasks from other CPUs. 12656 * 12657 * Returns: 12658 * < 0 - we released the lock and there are !fair tasks present 12659 * 0 - failed, no new tasks 12660 * > 0 - success, new (fair) tasks present 12661 */ 12662 static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) 12663 { 12664 unsigned long next_balance = jiffies + HZ; 12665 int this_cpu = this_rq->cpu; 12666 int continue_balancing = 1; 12667 u64 t0, t1, curr_cost = 0; 12668 struct sched_domain *sd; 12669 int pulled_task = 0; 12670 12671 update_misfit_status(NULL, this_rq); 12672 12673 /* 12674 * There is a task waiting to run. No need to search for one. 12675 * Return 0; the task will be enqueued when switching to idle. 12676 */ 12677 if (this_rq->ttwu_pending) 12678 return 0; 12679 12680 /* 12681 * We must set idle_stamp _before_ calling sched_balance_rq() 12682 * for CPU_NEWLY_IDLE, such that we measure the this duration 12683 * as idle time. 12684 */ 12685 this_rq->idle_stamp = rq_clock(this_rq); 12686 12687 /* 12688 * Do not pull tasks towards !active CPUs... 12689 */ 12690 if (!cpu_active(this_cpu)) 12691 return 0; 12692 12693 /* 12694 * This is OK, because current is on_cpu, which avoids it being picked 12695 * for load-balance and preemption/IRQs are still disabled avoiding 12696 * further scheduler activity on it and we're being very careful to 12697 * re-start the picking loop. 12698 */ 12699 rq_unpin_lock(this_rq, rf); 12700 12701 rcu_read_lock(); 12702 sd = rcu_dereference_check_sched_domain(this_rq->sd); 12703 12704 if (!get_rd_overloaded(this_rq->rd) || 12705 (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { 12706 12707 if (sd) 12708 update_next_balance(sd, &next_balance); 12709 rcu_read_unlock(); 12710 12711 goto out; 12712 } 12713 rcu_read_unlock(); 12714 12715 raw_spin_rq_unlock(this_rq); 12716 12717 t0 = sched_clock_cpu(this_cpu); 12718 sched_balance_update_blocked_averages(this_cpu); 12719 12720 rcu_read_lock(); 12721 for_each_domain(this_cpu, sd) { 12722 u64 domain_cost; 12723 12724 update_next_balance(sd, &next_balance); 12725 12726 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) 12727 break; 12728 12729 if (sd->flags & SD_BALANCE_NEWIDLE) { 12730 12731 pulled_task = sched_balance_rq(this_cpu, this_rq, 12732 sd, CPU_NEWLY_IDLE, 12733 &continue_balancing); 12734 12735 t1 = sched_clock_cpu(this_cpu); 12736 domain_cost = t1 - t0; 12737 update_newidle_cost(sd, domain_cost); 12738 12739 curr_cost += domain_cost; 12740 t0 = t1; 12741 } 12742 12743 /* 12744 * Stop searching for tasks to pull if there are 12745 * now runnable tasks on this rq. 
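 *
 * continue_balancing is cleared by sched_balance_rq() when
 * should_we_balance() decides that another CPU of the group should run
 * this balance instead.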
12746 */ 12747 if (pulled_task || !continue_balancing) 12748 break; 12749 } 12750 rcu_read_unlock(); 12751 12752 raw_spin_rq_lock(this_rq); 12753 12754 if (curr_cost > this_rq->max_idle_balance_cost) 12755 this_rq->max_idle_balance_cost = curr_cost; 12756 12757 /* 12758 * While browsing the domains, we released the rq lock, a task could 12759 * have been enqueued in the meantime. Since we're not going idle, 12760 * pretend we pulled a task. 12761 */ 12762 if (this_rq->cfs.h_nr_running && !pulled_task) 12763 pulled_task = 1; 12764 12765 /* Is there a task of a high priority class? */ 12766 if (this_rq->nr_running != this_rq->cfs.h_nr_running) 12767 pulled_task = -1; 12768 12769 out: 12770 /* Move the next balance forward */ 12771 if (time_after(this_rq->next_balance, next_balance)) 12772 this_rq->next_balance = next_balance; 12773 12774 if (pulled_task) 12775 this_rq->idle_stamp = 0; 12776 else 12777 nohz_newidle_balance(this_rq); 12778 12779 rq_repin_lock(this_rq, rf); 12780 12781 return pulled_task; 12782 } 12783 12784 /* 12785 * This softirq handler is triggered via SCHED_SOFTIRQ from two places: 12786 * 12787 * - directly from the local scheduler_tick() for periodic load balancing 12788 * 12789 * - indirectly from a remote scheduler_tick() for NOHZ idle balancing 12790 * through the SMP cross-call nohz_csd_func() 12791 */ 12792 static __latent_entropy void sched_balance_softirq(void) 12793 { 12794 struct rq *this_rq = this_rq(); 12795 enum cpu_idle_type idle = this_rq->idle_balance; 12796 /* 12797 * If this CPU has a pending NOHZ_BALANCE_KICK, then do the 12798 * balancing on behalf of the other idle CPUs whose ticks are 12799 * stopped. Do nohz_idle_balance *before* sched_balance_domains to 12800 * give the idle CPUs a chance to load balance. Else we may 12801 * load balance only within the local sched_domain hierarchy 12802 * and abort nohz_idle_balance altogether if we pull some load. 12803 */ 12804 if (nohz_idle_balance(this_rq, idle)) 12805 return; 12806 12807 /* normal load balance */ 12808 sched_balance_update_blocked_averages(this_rq->cpu); 12809 sched_balance_domains(this_rq, idle); 12810 } 12811 12812 /* 12813 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 
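 *
 * Called from the scheduler tick path; the raised softirq is handled by
 * sched_balance_softirq() above.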
12814 */ 12815 void sched_balance_trigger(struct rq *rq) 12816 { 12817 /* 12818 * Don't need to rebalance while attached to NULL domain or 12819 * runqueue CPU is not active 12820 */ 12821 if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq)))) 12822 return; 12823 12824 if (time_after_eq(jiffies, rq->next_balance)) 12825 raise_softirq(SCHED_SOFTIRQ); 12826 12827 nohz_balancer_kick(rq); 12828 } 12829 12830 static void rq_online_fair(struct rq *rq) 12831 { 12832 update_sysctl(); 12833 12834 update_runtime_enabled(rq); 12835 } 12836 12837 static void rq_offline_fair(struct rq *rq) 12838 { 12839 update_sysctl(); 12840 12841 /* Ensure any throttled groups are reachable by pick_next_task */ 12842 unthrottle_offline_cfs_rqs(rq); 12843 12844 /* Ensure that we remove rq contribution to group share: */ 12845 clear_tg_offline_cfs_rqs(rq); 12846 } 12847 12848 #endif /* CONFIG_SMP */ 12849 12850 #ifdef CONFIG_SCHED_CORE 12851 static inline bool 12852 __entity_slice_used(struct sched_entity *se, int min_nr_tasks) 12853 { 12854 u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime; 12855 u64 slice = se->slice; 12856 12857 return (rtime * min_nr_tasks > slice); 12858 } 12859 12860 #define MIN_NR_TASKS_DURING_FORCEIDLE 2 12861 static inline void task_tick_core(struct rq *rq, struct task_struct *curr) 12862 { 12863 if (!sched_core_enabled(rq)) 12864 return; 12865 12866 /* 12867 * If runqueue has only one task which used up its slice and 12868 * if the sibling is forced idle, then trigger schedule to 12869 * give forced idle task a chance. 12870 * 12871 * sched_slice() considers only this active rq and it gets the 12872 * whole slice. But during force idle, we have siblings acting 12873 * like a single runqueue and hence we need to consider runnable 12874 * tasks on this CPU and the forced idle CPU. Ideally, we should 12875 * go through the forced idle rq, but that would be a perf hit. 12876 * We can assume that the forced idle CPU has at least 12877 * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check 12878 * if we need to give up the CPU. 12879 */ 12880 if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 && 12881 __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) 12882 resched_curr(rq); 12883 } 12884 12885 /* 12886 * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. 
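 *
 * Walks @se's hierarchy towards the root; in the force-idle case it
 * stops early once a cfs_rq already carries the current forceidle
 * sequence number.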
12887 */ 12888 static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, 12889 bool forceidle) 12890 { 12891 for_each_sched_entity(se) { 12892 struct cfs_rq *cfs_rq = cfs_rq_of(se); 12893 12894 if (forceidle) { 12895 if (cfs_rq->forceidle_seq == fi_seq) 12896 break; 12897 cfs_rq->forceidle_seq = fi_seq; 12898 } 12899 12900 cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime; 12901 } 12902 } 12903 12904 void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi) 12905 { 12906 struct sched_entity *se = &p->se; 12907 12908 if (p->sched_class != &fair_sched_class) 12909 return; 12910 12911 se_fi_update(se, rq->core->core_forceidle_seq, in_fi); 12912 } 12913 12914 bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, 12915 bool in_fi) 12916 { 12917 struct rq *rq = task_rq(a); 12918 const struct sched_entity *sea = &a->se; 12919 const struct sched_entity *seb = &b->se; 12920 struct cfs_rq *cfs_rqa; 12921 struct cfs_rq *cfs_rqb; 12922 s64 delta; 12923 12924 SCHED_WARN_ON(task_rq(b)->core != rq->core); 12925 12926 #ifdef CONFIG_FAIR_GROUP_SCHED 12927 /* 12928 * Find an se in the hierarchy for tasks a and b, such that the se's 12929 * are immediate siblings. 12930 */ 12931 while (sea->cfs_rq->tg != seb->cfs_rq->tg) { 12932 int sea_depth = sea->depth; 12933 int seb_depth = seb->depth; 12934 12935 if (sea_depth >= seb_depth) 12936 sea = parent_entity(sea); 12937 if (sea_depth <= seb_depth) 12938 seb = parent_entity(seb); 12939 } 12940 12941 se_fi_update(sea, rq->core->core_forceidle_seq, in_fi); 12942 se_fi_update(seb, rq->core->core_forceidle_seq, in_fi); 12943 12944 cfs_rqa = sea->cfs_rq; 12945 cfs_rqb = seb->cfs_rq; 12946 #else 12947 cfs_rqa = &task_rq(a)->cfs; 12948 cfs_rqb = &task_rq(b)->cfs; 12949 #endif 12950 12951 /* 12952 * Find delta after normalizing se's vruntime with its cfs_rq's 12953 * min_vruntime_fi, which would have been updated in prior calls 12954 * to se_fi_update(). 12955 */ 12956 delta = (s64)(sea->vruntime - seb->vruntime) + 12957 (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); 12958 12959 return delta > 0; 12960 } 12961 12962 static int task_is_throttled_fair(struct task_struct *p, int cpu) 12963 { 12964 struct cfs_rq *cfs_rq; 12965 12966 #ifdef CONFIG_FAIR_GROUP_SCHED 12967 cfs_rq = task_group(p)->cfs_rq[cpu]; 12968 #else 12969 cfs_rq = &cpu_rq(cpu)->cfs; 12970 #endif 12971 return throttled_hierarchy(cfs_rq); 12972 } 12973 #else 12974 static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} 12975 #endif 12976 12977 /* 12978 * scheduler tick hitting a task of our scheduling class. 12979 * 12980 * NOTE: This function can be called remotely by the tick offload that 12981 * goes along full dynticks. Therefore no local assumption can be made 12982 * and everything must be accessed through the @rq and @curr passed in 12983 * parameters. 
12984 */ 12985 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) 12986 { 12987 struct cfs_rq *cfs_rq; 12988 struct sched_entity *se = &curr->se; 12989 12990 for_each_sched_entity(se) { 12991 cfs_rq = cfs_rq_of(se); 12992 entity_tick(cfs_rq, se, queued); 12993 } 12994 12995 if (static_branch_unlikely(&sched_numa_balancing)) 12996 task_tick_numa(rq, curr); 12997 12998 update_misfit_status(curr, rq); 12999 check_update_overutilized_status(task_rq(curr)); 13000 13001 task_tick_core(rq, curr); 13002 } 13003 13004 /* 13005 * called on fork with the child task as argument from the parent's context 13006 * - child not yet on the tasklist 13007 * - preemption disabled 13008 */ 13009 static void task_fork_fair(struct task_struct *p) 13010 { 13011 set_task_max_allowed_capacity(p); 13012 } 13013 13014 /* 13015 * Priority of the task has changed. Check to see if we preempt 13016 * the current task. 13017 */ 13018 static void 13019 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) 13020 { 13021 if (!task_on_rq_queued(p)) 13022 return; 13023 13024 if (rq->cfs.nr_running == 1) 13025 return; 13026 13027 /* 13028 * Reschedule if we are currently running on this runqueue and 13029 * our priority decreased, or if we are not currently running on 13030 * this runqueue and our priority is higher than the current's 13031 */ 13032 if (task_current_donor(rq, p)) { 13033 if (p->prio > oldprio) 13034 resched_curr(rq); 13035 } else 13036 wakeup_preempt(rq, p, 0); 13037 } 13038 13039 #ifdef CONFIG_FAIR_GROUP_SCHED 13040 /* 13041 * Propagate the changes of the sched_entity across the tg tree to make it 13042 * visible to the root 13043 */ 13044 static void propagate_entity_cfs_rq(struct sched_entity *se) 13045 { 13046 struct cfs_rq *cfs_rq = cfs_rq_of(se); 13047 13048 if (cfs_rq_throttled(cfs_rq)) 13049 return; 13050 13051 if (!throttled_hierarchy(cfs_rq)) 13052 list_add_leaf_cfs_rq(cfs_rq); 13053 13054 /* Start to propagate at parent */ 13055 se = se->parent; 13056 13057 for_each_sched_entity(se) { 13058 cfs_rq = cfs_rq_of(se); 13059 13060 update_load_avg(cfs_rq, se, UPDATE_TG); 13061 13062 if (cfs_rq_throttled(cfs_rq)) 13063 break; 13064 13065 if (!throttled_hierarchy(cfs_rq)) 13066 list_add_leaf_cfs_rq(cfs_rq); 13067 } 13068 } 13069 #else 13070 static void propagate_entity_cfs_rq(struct sched_entity *se) { } 13071 #endif 13072 13073 static void detach_entity_cfs_rq(struct sched_entity *se) 13074 { 13075 struct cfs_rq *cfs_rq = cfs_rq_of(se); 13076 13077 #ifdef CONFIG_SMP 13078 /* 13079 * In case the task sched_avg hasn't been attached: 13080 * - A forked task which hasn't been woken up by wake_up_new_task(). 13081 * - A task which has been woken up by try_to_wake_up() but is 13082 * waiting for actually being woken up by sched_ttwu_pending(). 13083 */ 13084 if (!se->avg.last_update_time) 13085 return; 13086 #endif 13087 13088 /* Catch up with the cfs_rq and remove our load when we leave */ 13089 update_load_avg(cfs_rq, se, 0); 13090 detach_entity_load_avg(cfs_rq, se); 13091 update_tg_load_avg(cfs_rq); 13092 propagate_entity_cfs_rq(se); 13093 } 13094 13095 static void attach_entity_cfs_rq(struct sched_entity *se) 13096 { 13097 struct cfs_rq *cfs_rq = cfs_rq_of(se); 13098 13099 /* Synchronize entity with its cfs_rq */ 13100 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 
0 : SKIP_AGE_LOAD); 13101 attach_entity_load_avg(cfs_rq, se); 13102 update_tg_load_avg(cfs_rq); 13103 propagate_entity_cfs_rq(se); 13104 } 13105 13106 static void detach_task_cfs_rq(struct task_struct *p) 13107 { 13108 struct sched_entity *se = &p->se; 13109 13110 detach_entity_cfs_rq(se); 13111 } 13112 13113 static void attach_task_cfs_rq(struct task_struct *p) 13114 { 13115 struct sched_entity *se = &p->se; 13116 13117 attach_entity_cfs_rq(se); 13118 } 13119 13120 static void switched_from_fair(struct rq *rq, struct task_struct *p) 13121 { 13122 detach_task_cfs_rq(p); 13123 } 13124 13125 static void switched_to_fair(struct rq *rq, struct task_struct *p) 13126 { 13127 SCHED_WARN_ON(p->se.sched_delayed); 13128 13129 attach_task_cfs_rq(p); 13130 13131 set_task_max_allowed_capacity(p); 13132 13133 if (task_on_rq_queued(p)) { 13134 /* 13135 * We were most likely switched from sched_rt, so 13136 * kick off the schedule if running, otherwise just see 13137 * if we can still preempt the current task. 13138 */ 13139 if (task_current_donor(rq, p)) 13140 resched_curr(rq); 13141 else 13142 wakeup_preempt(rq, p, 0); 13143 } 13144 } 13145 13146 static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) 13147 { 13148 struct sched_entity *se = &p->se; 13149 13150 #ifdef CONFIG_SMP 13151 if (task_on_rq_queued(p)) { 13152 /* 13153 * Move the next running task to the front of the list, so our 13154 * cfs_tasks list becomes MRU one. 13155 */ 13156 list_move(&se->group_node, &rq->cfs_tasks); 13157 } 13158 #endif 13159 if (!first) 13160 return; 13161 13162 SCHED_WARN_ON(se->sched_delayed); 13163 13164 if (hrtick_enabled_fair(rq)) 13165 hrtick_start_fair(rq, p); 13166 13167 update_misfit_status(p, rq); 13168 sched_fair_update_stop_tick(rq, p); 13169 } 13170 13171 /* 13172 * Account for a task changing its policy or group. 13173 * 13174 * This routine is mostly called to set cfs_rq->curr field when a task 13175 * migrates between groups/classes. 13176 */ 13177 static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) 13178 { 13179 struct sched_entity *se = &p->se; 13180 13181 for_each_sched_entity(se) { 13182 struct cfs_rq *cfs_rq = cfs_rq_of(se); 13183 13184 set_next_entity(cfs_rq, se); 13185 /* ensure bandwidth has been allocated on our new cfs_rq */ 13186 account_cfs_rq_runtime(cfs_rq, 0); 13187 } 13188 13189 __set_next_task_fair(rq, p, first); 13190 } 13191 13192 void init_cfs_rq(struct cfs_rq *cfs_rq) 13193 { 13194 cfs_rq->tasks_timeline = RB_ROOT_CACHED; 13195 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 13196 #ifdef CONFIG_SMP 13197 raw_spin_lock_init(&cfs_rq->removed.lock); 13198 #endif 13199 } 13200 13201 #ifdef CONFIG_FAIR_GROUP_SCHED 13202 static void task_change_group_fair(struct task_struct *p) 13203 { 13204 /* 13205 * We couldn't detach or attach a forked task which 13206 * hasn't been woken up by wake_up_new_task(). 
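 *
 * Such a task has no sched_avg attached yet; detach_entity_cfs_rq()
 * checks se->avg.last_update_time for the same reason.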
13207 */ 13208 if (READ_ONCE(p->__state) == TASK_NEW) 13209 return; 13210 13211 detach_task_cfs_rq(p); 13212 13213 #ifdef CONFIG_SMP 13214 /* Tell se's cfs_rq has been changed -- migrated */ 13215 p->se.avg.last_update_time = 0; 13216 #endif 13217 set_task_rq(p, task_cpu(p)); 13218 attach_task_cfs_rq(p); 13219 } 13220 13221 void free_fair_sched_group(struct task_group *tg) 13222 { 13223 int i; 13224 13225 for_each_possible_cpu(i) { 13226 if (tg->cfs_rq) 13227 kfree(tg->cfs_rq[i]); 13228 if (tg->se) 13229 kfree(tg->se[i]); 13230 } 13231 13232 kfree(tg->cfs_rq); 13233 kfree(tg->se); 13234 } 13235 13236 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 13237 { 13238 struct sched_entity *se; 13239 struct cfs_rq *cfs_rq; 13240 int i; 13241 13242 tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL); 13243 if (!tg->cfs_rq) 13244 goto err; 13245 tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL); 13246 if (!tg->se) 13247 goto err; 13248 13249 tg->shares = NICE_0_LOAD; 13250 13251 init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent)); 13252 13253 for_each_possible_cpu(i) { 13254 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 13255 GFP_KERNEL, cpu_to_node(i)); 13256 if (!cfs_rq) 13257 goto err; 13258 13259 se = kzalloc_node(sizeof(struct sched_entity_stats), 13260 GFP_KERNEL, cpu_to_node(i)); 13261 if (!se) 13262 goto err_free_rq; 13263 13264 init_cfs_rq(cfs_rq); 13265 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 13266 init_entity_runnable_average(se); 13267 } 13268 13269 return 1; 13270 13271 err_free_rq: 13272 kfree(cfs_rq); 13273 err: 13274 return 0; 13275 } 13276 13277 void online_fair_sched_group(struct task_group *tg) 13278 { 13279 struct sched_entity *se; 13280 struct rq_flags rf; 13281 struct rq *rq; 13282 int i; 13283 13284 for_each_possible_cpu(i) { 13285 rq = cpu_rq(i); 13286 se = tg->se[i]; 13287 rq_lock_irq(rq, &rf); 13288 update_rq_clock(rq); 13289 attach_entity_cfs_rq(se); 13290 sync_throttle(tg, i); 13291 rq_unlock_irq(rq, &rf); 13292 } 13293 } 13294 13295 void unregister_fair_sched_group(struct task_group *tg) 13296 { 13297 int cpu; 13298 13299 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); 13300 13301 for_each_possible_cpu(cpu) { 13302 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; 13303 struct sched_entity *se = tg->se[cpu]; 13304 struct rq *rq = cpu_rq(cpu); 13305 13306 if (se) { 13307 if (se->sched_delayed) { 13308 guard(rq_lock_irqsave)(rq); 13309 if (se->sched_delayed) { 13310 update_rq_clock(rq); 13311 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 13312 } 13313 list_del_leaf_cfs_rq(cfs_rq); 13314 } 13315 remove_entity_load_avg(se); 13316 } 13317 13318 /* 13319 * Only empty task groups can be destroyed; so we can speculatively 13320 * check on_list without danger of it being re-added. 

void unregister_fair_sched_group(struct task_group *tg)
{
	int cpu;

	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(cpu) {
		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
		struct sched_entity *se = tg->se[cpu];
		struct rq *rq = cpu_rq(cpu);

		if (se) {
			if (se->sched_delayed) {
				guard(rq_lock_irqsave)(rq);
				if (se->sched_delayed) {
					update_rq_clock(rq);
					dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
				}
				list_del_leaf_cfs_rq(cfs_rq);
			}
			remove_entity_load_avg(se);
		}

		/*
		 * Only empty task groups can be destroyed; so we can speculatively
		 * check on_list without danger of it being re-added.
		 */
		if (cfs_rq->on_list) {
			guard(rq_lock_irqsave)(rq);
			list_del_leaf_cfs_rq(cfs_rq);
		}
	}
}

void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
		       struct sched_entity *se, int cpu,
		       struct sched_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	cfs_rq->tg = tg;
	cfs_rq->rq = rq;
	init_cfs_rq_runtime(cfs_rq);

	tg->cfs_rq[cpu] = cfs_rq;
	tg->se[cpu] = se;

	/* se could be NULL for root_task_group */
	if (!se)
		return;

	if (!parent) {
		se->cfs_rq = &rq->cfs;
		se->depth = 0;
	} else {
		se->cfs_rq = parent->my_q;
		se->depth = parent->depth + 1;
	}

	se->my_q = cfs_rq;
	/* guarantee group entities always have weight */
	update_load_set(&se->load, NICE_0_LOAD);
	se->parent = parent;
}

static DEFINE_MUTEX(shares_mutex);

static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
	int i;

	lockdep_assert_held(&shares_mutex);

	/*
	 * We can't change the weight of the root cgroup.
	 */
	if (!tg->se[0])
		return -EINVAL;

	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

	if (tg->shares == shares)
		return 0;

	tg->shares = shares;
	for_each_possible_cpu(i) {
		struct rq *rq = cpu_rq(i);
		struct sched_entity *se = tg->se[i];
		struct rq_flags rf;

		/* Propagate contribution to hierarchy */
		rq_lock_irqsave(rq, &rf);
		update_rq_clock(rq);
		for_each_sched_entity(se) {
			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
			update_cfs_group(se);
		}
		rq_unlock_irqrestore(rq, &rf);
	}

	return 0;
}

int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
	int ret;

	mutex_lock(&shares_mutex);
	if (tg_is_idle(tg))
		ret = -EINVAL;
	else
		ret = __sched_group_set_shares(tg, shares);
	mutex_unlock(&shares_mutex);

	return ret;
}
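
/*
 * Note: sched_group_set_shares() refuses to change the weight of a group that
 * has been marked idle (tg_is_idle()); idle groups are pinned to the low
 * WEIGHT_IDLEPRIO weight by sched_group_set_idle() below. For regular groups
 * the request is clamped to [MIN_SHARES, MAX_SHARES] and, on change,
 * __sched_group_set_shares() walks every CPU's hierarchy so the new weight is
 * reflected in the group entities' load averages.
 *
 * For illustration only (assuming a CONFIG_64BIT build, where scale_load()
 * shifts by SCHED_FIXEDPOINT_SHIFT for extra fixed-point precision):
 *
 *	sched_group_set_shares(tg, scale_load(1024));	// nice-0 weight
 *	sched_group_set_shares(tg, scale_load(2048));	// ~2x the CPU share
 *							// under contention
 */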

int sched_group_set_idle(struct task_group *tg, long idle)
{
	int i;

	if (tg == &root_task_group)
		return -EINVAL;

	if (idle < 0 || idle > 1)
		return -EINVAL;

	mutex_lock(&shares_mutex);

	if (tg->idle == idle) {
		mutex_unlock(&shares_mutex);
		return 0;
	}

	tg->idle = idle;

	for_each_possible_cpu(i) {
		struct rq *rq = cpu_rq(i);
		struct sched_entity *se = tg->se[i];
		struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
		bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
		long idle_task_delta;
		struct rq_flags rf;

		rq_lock_irqsave(rq, &rf);

		grp_cfs_rq->idle = idle;
		if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
			goto next_cpu;

		if (se->on_rq) {
			parent_cfs_rq = cfs_rq_of(se);
			if (cfs_rq_is_idle(grp_cfs_rq))
				parent_cfs_rq->idle_nr_running++;
			else
				parent_cfs_rq->idle_nr_running--;
		}

		idle_task_delta = grp_cfs_rq->h_nr_running -
				  grp_cfs_rq->idle_h_nr_running;
		if (!cfs_rq_is_idle(grp_cfs_rq))
			idle_task_delta *= -1;

		for_each_sched_entity(se) {
			struct cfs_rq *cfs_rq = cfs_rq_of(se);

			if (!se->on_rq)
				break;

			cfs_rq->idle_h_nr_running += idle_task_delta;

			/* Already accounted at parent level and above. */
			if (cfs_rq_is_idle(cfs_rq))
				break;
		}

next_cpu:
		rq_unlock_irqrestore(rq, &rf);
	}

	/* Idle groups have minimum weight. */
	if (tg_is_idle(tg))
		__sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
	else
		__sched_group_set_shares(tg, NICE_0_LOAD);

	mutex_unlock(&shares_mutex);
	return 0;
}

#endif /* CONFIG_FAIR_GROUP_SCHED */


static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
	struct sched_entity *se = &task->se;
	unsigned int rr_interval = 0;

	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue:
	 */
	if (rq->cfs.load.weight)
		rr_interval = NS_TO_JIFFIES(se->slice);

	return rr_interval;
}

/*
 * All the scheduling class methods:
 */
DEFINE_SCHED_CLASS(fair) = {

	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,
	.yield_to_task		= yield_to_task_fair,

	.wakeup_preempt		= check_preempt_wakeup_fair,

	.pick_task		= pick_task_fair,
	.pick_next_task		= __pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,
	.set_next_task		= set_next_task_fair,

#ifdef CONFIG_SMP
	.balance		= balance_fair,
	.select_task_rq		= select_task_rq_fair,
	.migrate_task_rq	= migrate_task_rq_fair,

	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_dead		= task_dead_fair,
	.set_cpus_allowed	= set_cpus_allowed_fair,
#endif

	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.reweight_task		= reweight_task_fair,
	.prio_changed		= prio_changed_fair,
	.switched_from		= switched_from_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

	.update_curr		= update_curr_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_change_group	= task_change_group_fair,
#endif

#ifdef CONFIG_SCHED_CORE
	.task_is_throttled	= task_is_throttled_fair,
#endif

#ifdef CONFIG_UCLAMP_TASK
	.uclamp_enabled		= 1,
#endif
};

#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *cfs_rq, *pos;

	rcu_read_lock();
	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
		print_cfs_rq(m, cpu, cfs_rq);
	rcu_read_unlock();
}

#ifdef CONFIG_NUMA_BALANCING
void show_numa_stats(struct task_struct *p, struct seq_file *m)
{
	int node;
	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
	struct numa_group *ng;

	rcu_read_lock();
	ng = rcu_dereference(p->numa_group);
	for_each_online_node(node) {
		if (p->numa_faults) {
			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
		}
		if (ng) {
			gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
			gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
		}
		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
	}
	rcu_read_unlock();
}
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
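
/*
 * Note: DEFINE_SCHED_CLASS(fair) above is what ties this file into the core
 * scheduler; the core never calls the fair-class functions directly but
 * dispatches through this method table, e.g. p->sched_class->enqueue_task()
 * from the core enqueue path. As a rough sketch of one round trip through the
 * table, a call to the sched_rr_get_interval() system call ends up in
 * get_rr_interval_fair() and reports (approximately) the task's current
 * se.slice, converted from jiffies to a timespec by the syscall code:
 *
 *	struct timespec ts;
 *	sched_rr_get_interval(0, &ts);	-- ts now reflects the fair time slice
 */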

__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	int i;

	for_each_possible_cpu(i) {
		zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
		zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
		zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
					GFP_KERNEL, cpu_to_node(i));

#ifdef CONFIG_CFS_BANDWIDTH
		INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
		INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
#endif
	}

	open_softirq(SCHED_SOFTIRQ, sched_balance_softirq);

#ifdef CONFIG_NO_HZ_COMMON
	nohz.next_balance = jiffies;
	nohz.next_blocked = jiffies;
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */

}
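
/*
 * Note: init_sched_fair_class() runs once during early boot. On SMP it
 * allocates the per-CPU cpumasks used by load balancing and wakeup CPU
 * selection, sets up the per-CPU CSD used to unthrottle CFS bandwidth on
 * remote CPUs, registers sched_balance_softirq() as the SCHED_SOFTIRQ handler
 * (the softirq is raised from the scheduler-tick path when a rebalance is
 * due), and initializes the NO_HZ idle-balancing bookkeeping.
 */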