1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 *
5 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 *
7 * Interactivity improvements by Mike Galbraith
8 * (C) 2007 Mike Galbraith <efault@gmx.de>
9 *
10 * Various enhancements by Dmitry Adamushko.
11 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 *
13 * Group scheduling enhancements by Srivatsa Vaddagiri
14 * Copyright IBM Corporation, 2007
15 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 *
17 * Scaled math optimizations by Thomas Gleixner
18 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
19 *
20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
22 */
23 #include <linux/energy_model.h>
24 #include <linux/mmap_lock.h>
25 #include <linux/hugetlb_inline.h>
26 #include <linux/jiffies.h>
27 #include <linux/mm_api.h>
28 #include <linux/highmem.h>
29 #include <linux/spinlock_api.h>
30 #include <linux/cpumask_api.h>
31 #include <linux/lockdep_api.h>
32 #include <linux/softirq.h>
33 #include <linux/refcount_api.h>
34 #include <linux/topology.h>
35 #include <linux/sched/clock.h>
36 #include <linux/sched/cond_resched.h>
37 #include <linux/sched/cputime.h>
38 #include <linux/sched/isolation.h>
39 #include <linux/sched/nohz.h>
40 #include <linux/sched/prio.h>
41
42 #include <linux/cpuidle.h>
43 #include <linux/interrupt.h>
44 #include <linux/memory-tiers.h>
45 #include <linux/mempolicy.h>
46 #include <linux/mutex_api.h>
47 #include <linux/profile.h>
48 #include <linux/psi.h>
49 #include <linux/ratelimit.h>
50 #include <linux/task_work.h>
51 #include <linux/rbtree_augmented.h>
52
53 #include <asm/switch_to.h>
54
55 #include <uapi/linux/sched/types.h>
56
57 #include "sched.h"
58 #include "stats.h"
59 #include "autogroup.h"
60
61 /*
62 * The initial- and re-scaling of tunables is configurable
63 *
64 * Options are:
65 *
66 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
67 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
68 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
69 *
70 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
71 */
72 unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
73
74 /*
75 * Minimal preemption granularity for CPU-bound tasks:
76 *
77 * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds)
78 */
79 unsigned int sysctl_sched_base_slice = 700000ULL;
80 static unsigned int normalized_sysctl_sched_base_slice = 700000ULL;
81
82 __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL;
83
84 static int __init setup_sched_thermal_decay_shift(char *str)
85 {
86 pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
87 return 1;
88 }
89 __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
90
91 /*
92 * For asym packing, by default the lower numbered CPU has higher priority.
93 */
94 int __weak arch_asym_cpu_priority(int cpu)
95 {
96 return -cpu;
97 }
98
99 /*
100 * The margin used when comparing utilization with CPU capacity.
101 *
102 * (default: ~20%)
103 */
104 #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
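/*
 * Illustrative worked example (not part of the original source): with
 * max == 1024 the cutoff is cap < 1024 * 1024 / 1280 = 819.2, so
 * fits_capacity(800, 1024) is true (800 * 1280 = 1024000 < 1048576)
 * while fits_capacity(820, 1024) is false (820 * 1280 = 1049600),
 * i.e. utilization must stay roughly 20% below capacity to "fit".
 */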
105
106 /*
107 * The margin used when comparing CPU capacities.
108 * is 'cap1' noticeably greater than 'cap2'
109 *
110 * (default: ~5%)
111 */
112 #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
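/*
 * Illustrative worked example (not part of the original source): with
 * cap2 == 1024 the cutoff is cap1 > 1024 * 1078 / 1024 = 1078, so
 * capacity_greater(1080, 1024) is true while capacity_greater(1050, 1024)
 * is not: cap1 must exceed cap2 by a bit more than 5% to count as
 * "noticeably greater".
 */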
113
114 #ifdef CONFIG_CFS_BANDWIDTH
115 /*
116 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
117 * each time a cfs_rq requests quota.
118 *
119 * Note: in the case that the slice exceeds the runtime remaining (either due
120 * to consumption or the quota being specified to be smaller than the slice)
121 * we will always only issue the remaining available time.
122 *
123 * (default: 5 msec, units: microseconds)
124 */
125 static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
126 #endif
127
128 #ifdef CONFIG_NUMA_BALANCING
129 /* Restrict the NUMA promotion throughput (MB/s) for each target node. */
130 static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
131 #endif
132
133 #ifdef CONFIG_SYSCTL
134 static const struct ctl_table sched_fair_sysctls[] = {
135 #ifdef CONFIG_CFS_BANDWIDTH
136 {
137 .procname = "sched_cfs_bandwidth_slice_us",
138 .data = &sysctl_sched_cfs_bandwidth_slice,
139 .maxlen = sizeof(unsigned int),
140 .mode = 0644,
141 .proc_handler = proc_dointvec_minmax,
142 .extra1 = SYSCTL_ONE,
143 },
144 #endif
145 #ifdef CONFIG_NUMA_BALANCING
146 {
147 .procname = "numa_balancing_promote_rate_limit_MBps",
148 .data = &sysctl_numa_balancing_promote_rate_limit,
149 .maxlen = sizeof(unsigned int),
150 .mode = 0644,
151 .proc_handler = proc_dointvec_minmax,
152 .extra1 = SYSCTL_ZERO,
153 },
154 #endif /* CONFIG_NUMA_BALANCING */
155 };
156
157 static int __init sched_fair_sysctl_init(void)
158 {
159 register_sysctl_init("kernel", sched_fair_sysctls);
160 return 0;
161 }
162 late_initcall(sched_fair_sysctl_init);
163 #endif /* CONFIG_SYSCTL */
164
165 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
166 {
167 lw->weight += inc;
168 lw->inv_weight = 0;
169 }
170
171 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
172 {
173 lw->weight -= dec;
174 lw->inv_weight = 0;
175 }
176
177 static inline void update_load_set(struct load_weight *lw, unsigned long w)
178 {
179 lw->weight = w;
180 lw->inv_weight = 0;
181 }
182
183 /*
184 * Increase the granularity value when there are more CPUs,
185 * because with more CPUs the 'effective latency' as visible
186 * to users decreases. But the relationship is not linear,
187 * so pick a second-best guess by going with the log2 of the
188 * number of CPUs.
189 *
190 * This idea comes from the SD scheduler of Con Kolivas:
191 */
192 static unsigned int get_update_sysctl_factor(void)
193 {
194 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
195 unsigned int factor;
196
197 switch (sysctl_sched_tunable_scaling) {
198 case SCHED_TUNABLESCALING_NONE:
199 factor = 1;
200 break;
201 case SCHED_TUNABLESCALING_LINEAR:
202 factor = cpus;
203 break;
204 case SCHED_TUNABLESCALING_LOG:
205 default:
206 factor = 1 + ilog2(cpus);
207 break;
208 }
209
210 return factor;
211 }
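/*
 * Illustrative example (not part of the original source): with the
 * default SCHED_TUNABLESCALING_LOG policy, a machine with 8 or more
 * online CPUs (cpus is clamped to 8 above) gets factor = 1 + ilog2(8) = 4,
 * so update_sysctl() below scales the 0.70 msec normalized base slice to
 * about 2.8 msec; a 2-CPU machine gets factor 2, i.e. about 1.4 msec.
 */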
212
213 static void update_sysctl(void)
214 {
215 unsigned int factor = get_update_sysctl_factor();
216
217 #define SET_SYSCTL(name) \
218 (sysctl_##name = (factor) * normalized_sysctl_##name)
219 SET_SYSCTL(sched_base_slice);
220 #undef SET_SYSCTL
221 }
222
223 void __init sched_init_granularity(void)
224 {
225 update_sysctl();
226 }
227
228 #define WMULT_CONST (~0U)
229 #define WMULT_SHIFT 32
230
231 static void __update_inv_weight(struct load_weight *lw)
232 {
233 unsigned long w;
234
235 if (likely(lw->inv_weight))
236 return;
237
238 w = scale_load_down(lw->weight);
239
240 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
241 lw->inv_weight = 1;
242 else if (unlikely(!w))
243 lw->inv_weight = WMULT_CONST;
244 else
245 lw->inv_weight = WMULT_CONST / w;
246 }
247
248 /*
249 * delta_exec * weight / lw.weight
250 * OR
251 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
252 *
253 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
254 * we're guaranteed shift stays positive because inv_weight is guaranteed to
255 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
256 *
257 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
258 * weight/lw.weight <= 1, and therefore our shift will also be positive.
259 */
260 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
261 {
262 u64 fact = scale_load_down(weight);
263 u32 fact_hi = (u32)(fact >> 32);
264 int shift = WMULT_SHIFT;
265 int fs;
266
267 __update_inv_weight(lw);
268
269 if (unlikely(fact_hi)) {
270 fs = fls(fact_hi);
271 shift -= fs;
272 fact >>= fs;
273 }
274
275 fact = mul_u32_u32(fact, lw->inv_weight);
276
277 fact_hi = (u32)(fact >> 32);
278 if (fact_hi) {
279 fs = fls(fact_hi);
280 shift -= fs;
281 fact >>= fs;
282 }
283
284 return mul_u64_u32_shr(delta_exec, fact, shift);
285 }
286
287 /*
288 * delta /= w
289 */
290 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
291 {
292 if (unlikely(se->load.weight != NICE_0_LOAD))
293 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
294
295 return delta;
296 }
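/*
 * Illustrative example (not part of the original source): for an entity
 * whose scale_load_down()'d weight is 2048 (twice the nice-0 weight of
 * 1024), lw->inv_weight is ~2^32 / 2048, so in __calc_delta() fact
 * becomes ~2^31 and mul_u64_u32_shr(delta_exec, fact, 32) returns about
 * delta_exec / 2: a heavier entity's vruntime advances more slowly.
 */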
297
298 const struct sched_class fair_sched_class;
299
300 /**************************************************************
301 * CFS operations on generic schedulable entities:
302 */
303
304 #ifdef CONFIG_FAIR_GROUP_SCHED
305
306 /* Walk up scheduling entities hierarchy */
307 #define for_each_sched_entity(se) \
308 for (; se; se = se->parent)
309
310 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
311 {
312 struct rq *rq = rq_of(cfs_rq);
313 int cpu = cpu_of(rq);
314
315 if (cfs_rq->on_list)
316 return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
317
318 cfs_rq->on_list = 1;
319
320 /*
321 * Ensure we either appear before our parent (if already
322 * enqueued) or force our parent to appear after us when it is
323 * enqueued. The fact that we always enqueue bottom-up
324 * reduces this to two cases and a special case for the root
325 * cfs_rq. Furthermore, it also means that we will always reset
326 * tmp_alone_branch either when the branch is connected
327 * to a tree or when we reach the top of the tree
328 */
329 if (cfs_rq->tg->parent &&
330 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
331 /*
332 * If parent is already on the list, we add the child
333 * just before. Thanks to circular linked property of
334 * the list, this means to put the child at the tail
335 * of the list that starts by parent.
336 */
337 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
338 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
339 /*
340 * The branch is now connected to its tree so we can
341 * reset tmp_alone_branch to the beginning of the
342 * list.
343 */
344 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
345 return true;
346 }
347
348 if (!cfs_rq->tg->parent) {
349 /*
350 * cfs rq without parent should be put
351 * at the tail of the list.
352 */
353 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
354 &rq->leaf_cfs_rq_list);
355 /*
356 * We have reached the top of the tree so we can reset
357 * tmp_alone_branch to the beginning of the list.
358 */
359 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
360 return true;
361 }
362
363 /*
364 * The parent has not already been added so we want to
365 * make sure that it will be put after us.
366 * tmp_alone_branch points to the beginning of the branch
367 * where we will add the parent.
368 */
369 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
370 /*
371 * Update tmp_alone_branch to point to the new beginning
372 * of the branch.
373 */
374 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
375 return false;
376 }
377
378 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
379 {
380 if (cfs_rq->on_list) {
381 struct rq *rq = rq_of(cfs_rq);
382
383 /*
384 * With cfs_rq being unthrottled/throttled during an enqueue,
385 * it can happen the tmp_alone_branch points to the leaf that
386 * we finally want to delete. In this case, tmp_alone_branch moves
387 * to the prev element but it will point to rq->leaf_cfs_rq_list
388 * at the end of the enqueue.
389 */
390 if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
391 rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
392
393 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
394 cfs_rq->on_list = 0;
395 }
396 }
397
398 static inline void assert_list_leaf_cfs_rq(struct rq *rq)
399 {
400 WARN_ON_ONCE(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
401 }
402
403 /* Iterate through all leaf cfs_rq's on a runqueue */
404 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
405 list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
406 leaf_cfs_rq_list)
407
408 /* Do the two (enqueued) entities belong to the same group ? */
409 static inline struct cfs_rq *
410 is_same_group(struct sched_entity *se, struct sched_entity *pse)
411 {
412 if (se->cfs_rq == pse->cfs_rq)
413 return se->cfs_rq;
414
415 return NULL;
416 }
417
418 static inline struct sched_entity *parent_entity(const struct sched_entity *se)
419 {
420 return se->parent;
421 }
422
423 static void
424 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
425 {
426 int se_depth, pse_depth;
427
428 /*
429 * A preemption test can only be made between sibling entities that are in
430 * the same cfs_rq, i.e. that have a common parent. Walk up the hierarchy of
431 * both tasks until we find their ancestors that are siblings of a common
432 * parent.
433 */
434
435 /* First walk up until both entities are at same depth */
436 se_depth = (*se)->depth;
437 pse_depth = (*pse)->depth;
438
439 while (se_depth > pse_depth) {
440 se_depth--;
441 *se = parent_entity(*se);
442 }
443
444 while (pse_depth > se_depth) {
445 pse_depth--;
446 *pse = parent_entity(*pse);
447 }
448
449 while (!is_same_group(*se, *pse)) {
450 *se = parent_entity(*se);
451 *pse = parent_entity(*pse);
452 }
453 }
454
455 static int tg_is_idle(struct task_group *tg)
456 {
457 return tg->idle > 0;
458 }
459
460 static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
461 {
462 return cfs_rq->idle > 0;
463 }
464
465 static int se_is_idle(struct sched_entity *se)
466 {
467 if (entity_is_task(se))
468 return task_has_idle_policy(task_of(se));
469 return cfs_rq_is_idle(group_cfs_rq(se));
470 }
471
472 #else /* !CONFIG_FAIR_GROUP_SCHED: */
473
474 #define for_each_sched_entity(se) \
475 for (; se; se = NULL)
476
477 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
478 {
479 return true;
480 }
481
482 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
483 {
484 }
485
486 static inline void assert_list_leaf_cfs_rq(struct rq *rq)
487 {
488 }
489
490 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
491 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
492
493 static inline struct sched_entity *parent_entity(struct sched_entity *se)
494 {
495 return NULL;
496 }
497
498 static inline void
499 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
500 {
501 }
502
503 static inline int tg_is_idle(struct task_group *tg)
504 {
505 return 0;
506 }
507
508 static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
509 {
510 return 0;
511 }
512
513 static int se_is_idle(struct sched_entity *se)
514 {
515 return task_has_idle_policy(task_of(se));
516 }
517
518 #endif /* !CONFIG_FAIR_GROUP_SCHED */
519
520 static __always_inline
521 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
522
523 /**************************************************************
524 * Scheduling class tree data structure manipulation methods:
525 */
526
527 static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
528 {
529 s64 delta = (s64)(vruntime - max_vruntime);
530 if (delta > 0)
531 max_vruntime = vruntime;
532
533 return max_vruntime;
534 }
535
536 static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime)
537 {
538 s64 delta = (s64)(vruntime - min_vruntime);
539 if (delta < 0)
540 min_vruntime = vruntime;
541
542 return min_vruntime;
543 }
544
545 static inline bool entity_before(const struct sched_entity *a,
546 const struct sched_entity *b)
547 {
548 /*
549 * Tiebreak on vruntime seems unnecessary since it can
550 * hardly happen.
551 */
552 return (s64)(a->deadline - b->deadline) < 0;
553 }
554
555 static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
556 {
557 return (s64)(se->vruntime - cfs_rq->min_vruntime);
558 }
559
560 #define __node_2_se(node) \
561 rb_entry((node), struct sched_entity, run_node)
562
563 /*
564 * Compute virtual time from the per-task service numbers:
565 *
566 * Fair schedulers conserve lag:
567 *
568 * \Sum lag_i = 0
569 *
570 * Where lag_i is given by:
571 *
572 * lag_i = S - s_i = w_i * (V - v_i)
573 *
574 * Where S is the ideal service time and V is its virtual time counterpart.
575 * Therefore:
576 *
577 * \Sum lag_i = 0
578 * \Sum w_i * (V - v_i) = 0
579 * \Sum w_i * V - w_i * v_i = 0
580 *
581 * From which we can solve an expression for V in v_i (which we have in
582 * se->vruntime):
583 *
584 * \Sum v_i * w_i \Sum v_i * w_i
585 * V = -------------- = --------------
586 * \Sum w_i W
587 *
588 * Specifically, this is the weighted average of all entity virtual runtimes.
589 *
590 * [[ NOTE: this is only equal to the ideal scheduler under the condition
591 * that join/leave operations happen at lag_i = 0, otherwise the
592 * virtual time has non-contiguous motion equivalent to:
593 *
594 * V +-= lag_i / W
595 *
596 * Also see the comment in place_entity() that deals with this. ]]
597 *
598 * However, since v_i is u64, and the multiplication could easily overflow
599 * transform it into a relative form that uses smaller quantities:
600 *
601 * Substitute: v_i == (v_i - v0) + v0
602 *
603 * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i
604 * V = ---------------------------- = --------------------- + v0
605 * W W
606 *
607 * Which we track using:
608 *
609 * v0 := cfs_rq->min_vruntime
610 * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
611 * \Sum w_i := cfs_rq->avg_load
612 *
613 * Since min_vruntime is a monotonically increasing variable that closely tracks
614 * the per-task service, these deltas (v_i - v0) will be on the order of the
615 * maximal (virtual) lag induced in the system due to quantisation.
616 *
617 * Also, we use scale_load_down() to reduce the size.
618 *
619 * As measured, the max (key * weight) value was ~44 bits for a kernel build.
620 */
621 static void
622 avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
623 {
624 unsigned long weight = scale_load_down(se->load.weight);
625 s64 key = entity_key(cfs_rq, se);
626
627 cfs_rq->avg_vruntime += key * weight;
628 cfs_rq->avg_load += weight;
629 }
630
631 static void
632 avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 {
634 unsigned long weight = scale_load_down(se->load.weight);
635 s64 key = entity_key(cfs_rq, se);
636
637 cfs_rq->avg_vruntime -= key * weight;
638 cfs_rq->avg_load -= weight;
639 }
640
641 static inline
642 void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
643 {
644 /*
645 * v' = v + d ==> avg_vruntime' = avg_vruntime - d*avg_load
646 */
647 cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
648 }
649
650 /*
651 * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
652 * For this to be so, the result of this function must have a left bias.
653 */
654 u64 avg_vruntime(struct cfs_rq *cfs_rq)
655 {
656 struct sched_entity *curr = cfs_rq->curr;
657 s64 avg = cfs_rq->avg_vruntime;
658 long load = cfs_rq->avg_load;
659
660 if (curr && curr->on_rq) {
661 unsigned long weight = scale_load_down(curr->load.weight);
662
663 avg += entity_key(cfs_rq, curr) * weight;
664 load += weight;
665 }
666
667 if (load) {
668 /* sign flips effective floor / ceiling */
669 if (avg < 0)
670 avg -= (load - 1);
671 avg = div_s64(avg, load);
672 }
673
674 return cfs_rq->min_vruntime + avg;
675 }
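/*
 * Illustrative example (not part of the original source): with
 * min_vruntime = 100, an entity A (weight 1024, vruntime 110, key +10)
 * and an entity B (weight 2048, vruntime 95, key -5) give
 * avg_vruntime = 10*1024 - 5*2048 = 0 and avg_load = 3072, so
 * V = 100 + 0/3072 = 100. The lags w_i*(V - v_i) are -10240 for A and
 * +10240 for B, which indeed sum to zero as required by \Sum lag_i = 0.
 */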
676
677 /*
678 * lag_i = S - s_i = w_i * (V - v_i)
679 *
680 * However, since V is approximated by the weighted average of all entities it
681 * is possible -- by addition/removal/reweight to the tree -- to move V around
682 * and end up with a larger lag than we started with.
683 *
684 * Limit this to double the slice length, with a minimum of TICK_NSEC,
685 * since that is the timing granularity.
686 *
687 * EEVDF gives the following limit for a steady state system:
688 *
689 * -r_max < lag < max(r_max, q)
690 *
691 * XXX could add max_slice to the augmented data to track this.
692 */
693 static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
694 {
695 s64 vlag, limit;
696
697 WARN_ON_ONCE(!se->on_rq);
698
699 vlag = avg_vruntime(cfs_rq) - se->vruntime;
700 limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
701
702 se->vlag = clamp(vlag, -limit, limit);
703 }
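/*
 * Illustrative example (not part of the original source, assumes HZ=1000
 * so TICK_NSEC is 1 msec): for a nice-0 task with the default 0.70 msec
 * slice the limit is calc_delta_fair(max(1.4ms, 1ms)) = 1.4 msec, so the
 * stored vlag is clamped to the range [-1.4ms, +1.4ms].
 */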
704
705 /*
706 * An entity is eligible once it has received less service than it ought to
707 * have, i.e. lag >= 0.
708 *
709 * lag_i = S - s_i = w_i*(V - v_i)
710 *
711 * lag_i >= 0 -> V >= v_i
712 *
713 * \Sum (v_i - v)*w_i
714 * V = ------------------ + v
715 * \Sum w_i
716 *
717 * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
718 *
719 * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
720 * to the loss in precision caused by the division.
721 */
722 static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
723 {
724 struct sched_entity *curr = cfs_rq->curr;
725 s64 avg = cfs_rq->avg_vruntime;
726 long load = cfs_rq->avg_load;
727
728 if (curr && curr->on_rq) {
729 unsigned long weight = scale_load_down(curr->load.weight);
730
731 avg += entity_key(cfs_rq, curr) * weight;
732 load += weight;
733 }
734
735 return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
736 }
737
738 int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
739 {
740 return vruntime_eligible(cfs_rq, se->vruntime);
741 }
742
743 static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
744 {
745 u64 min_vruntime = cfs_rq->min_vruntime;
746 /*
747 * open coded max_vruntime() to allow updating avg_vruntime
748 */
749 s64 delta = (s64)(vruntime - min_vruntime);
750 if (delta > 0) {
751 avg_vruntime_update(cfs_rq, delta);
752 min_vruntime = vruntime;
753 }
754 return min_vruntime;
755 }
756
757 static void update_min_vruntime(struct cfs_rq *cfs_rq)
758 {
759 struct sched_entity *se = __pick_root_entity(cfs_rq);
760 struct sched_entity *curr = cfs_rq->curr;
761 u64 vruntime = cfs_rq->min_vruntime;
762
763 if (curr) {
764 if (curr->on_rq)
765 vruntime = curr->vruntime;
766 else
767 curr = NULL;
768 }
769
770 if (se) {
771 if (!curr)
772 vruntime = se->min_vruntime;
773 else
774 vruntime = min_vruntime(vruntime, se->min_vruntime);
775 }
776
777 /* ensure we never gain time by being placed backwards. */
778 cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
779 }
780
781 static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
782 {
783 struct sched_entity *root = __pick_root_entity(cfs_rq);
784 struct sched_entity *curr = cfs_rq->curr;
785 u64 min_slice = ~0ULL;
786
787 if (curr && curr->on_rq)
788 min_slice = curr->slice;
789
790 if (root)
791 min_slice = min(min_slice, root->min_slice);
792
793 return min_slice;
794 }
795
796 static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
797 {
798 return entity_before(__node_2_se(a), __node_2_se(b));
799 }
800
801 #define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
802
803 static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
804 {
805 if (node) {
806 struct sched_entity *rse = __node_2_se(node);
807 if (vruntime_gt(min_vruntime, se, rse))
808 se->min_vruntime = rse->min_vruntime;
809 }
810 }
811
812 static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node)
813 {
814 if (node) {
815 struct sched_entity *rse = __node_2_se(node);
816 if (rse->min_slice < se->min_slice)
817 se->min_slice = rse->min_slice;
818 }
819 }
820
821 /*
822 * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
823 */
824 static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
825 {
826 u64 old_min_vruntime = se->min_vruntime;
827 u64 old_min_slice = se->min_slice;
828 struct rb_node *node = &se->run_node;
829
830 se->min_vruntime = se->vruntime;
831 __min_vruntime_update(se, node->rb_right);
832 __min_vruntime_update(se, node->rb_left);
833
834 se->min_slice = se->slice;
835 __min_slice_update(se, node->rb_right);
836 __min_slice_update(se, node->rb_left);
837
838 return se->min_vruntime == old_min_vruntime &&
839 se->min_slice == old_min_slice;
840 }
841
842 RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
843 run_node, min_vruntime, min_vruntime_update);
844
845 /*
846 * Enqueue an entity into the rb-tree:
847 */
848 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
849 {
850 avg_vruntime_add(cfs_rq, se);
851 se->min_vruntime = se->vruntime;
852 se->min_slice = se->slice;
853 rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
854 __entity_less, &min_vruntime_cb);
855 }
856
857 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
858 {
859 rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
860 &min_vruntime_cb);
861 avg_vruntime_sub(cfs_rq, se);
862 }
863
864 struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
865 {
866 struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;
867
868 if (!root)
869 return NULL;
870
871 return __node_2_se(root);
872 }
873
874 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
875 {
876 struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
877
878 if (!left)
879 return NULL;
880
881 return __node_2_se(left);
882 }
883
884 /*
885 * Set the vruntime up to which an entity can run before looking
886 * for another entity to pick.
887 * In case of run to parity, we use the shortest slice of the enqueued
888 * entities to set the protected period.
889 * When run to parity is disabled, we give a minimum quantum to the running
890 * entity to ensure progress.
891 */
892 static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
893 {
894 u64 slice = normalized_sysctl_sched_base_slice;
895 u64 vprot = se->deadline;
896
897 if (sched_feat(RUN_TO_PARITY))
898 slice = cfs_rq_min_slice(cfs_rq);
899
900 slice = min(slice, se->slice);
901 if (slice != se->slice)
902 vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se));
903
904 se->vprot = vprot;
905 }
906
907 static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
908 {
909 u64 slice = cfs_rq_min_slice(cfs_rq);
910
911 se->vprot = min_vruntime(se->vprot, se->vruntime + calc_delta_fair(slice, se));
912 }
913
914 static inline bool protect_slice(struct sched_entity *se)
915 {
916 return ((s64)(se->vprot - se->vruntime) > 0);
917 }
918
919 static inline void cancel_protect_slice(struct sched_entity *se)
920 {
921 if (protect_slice(se))
922 se->vprot = se->vruntime;
923 }
924
925 /*
926 * Earliest Eligible Virtual Deadline First
927 *
928 * In order to provide latency guarantees for different request sizes
929 * EEVDF selects the best runnable task from two criteria:
930 *
931 * 1) the task must be eligible (must be owed service)
932 *
933 * 2) from those tasks that meet 1), we select the one
934 * with the earliest virtual deadline.
935 *
936 * We can do this in O(log n) time due to an augmented RB-tree. The
937 * tree keeps the entries sorted on deadline, but also functions as a
938 * heap based on the vruntime by keeping:
939 *
940 * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
941 *
942 * Which allows tree pruning through eligibility.
943 */
944 static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
945 {
946 struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
947 struct sched_entity *se = __pick_first_entity(cfs_rq);
948 struct sched_entity *curr = cfs_rq->curr;
949 struct sched_entity *best = NULL;
950
951 /*
952 * We can safely skip eligibility check if there is only one entity
953 * in this cfs_rq, saving some cycles.
954 */
955 if (cfs_rq->nr_queued == 1)
956 return curr && curr->on_rq ? curr : se;
957
958 if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
959 curr = NULL;
960
961 if (curr && protect && protect_slice(curr))
962 return curr;
963
964 /* Pick the leftmost entity if it's eligible */
965 if (se && entity_eligible(cfs_rq, se)) {
966 best = se;
967 goto found;
968 }
969
970 /* Heap search for the entity with the earliest eligible virtual deadline */
971 while (node) {
972 struct rb_node *left = node->rb_left;
973
974 /*
975 * Eligible entities in left subtree are always better
976 * choices, since they have earlier deadlines.
977 */
978 if (left && vruntime_eligible(cfs_rq,
979 __node_2_se(left)->min_vruntime)) {
980 node = left;
981 continue;
982 }
983
984 se = __node_2_se(node);
985
986 /*
987 * The left subtree either is empty or has no eligible
988 * entity, so check the current node since it is the one
989 * with earliest deadline that might be eligible.
990 */
991 if (entity_eligible(cfs_rq, se)) {
992 best = se;
993 break;
994 }
995
996 node = node->rb_right;
997 }
998 found:
999 if (!best || (curr && entity_before(curr, best)))
1000 best = curr;
1001
1002 return best;
1003 }
1004
1005 static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
1006 {
1007 return __pick_eevdf(cfs_rq, true);
1008 }
1009
1010 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
1011 {
1012 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
1013
1014 if (!last)
1015 return NULL;
1016
1017 return __node_2_se(last);
1018 }
1019
1020 /**************************************************************
1021 * Scheduling class statistics methods:
1022 */
1023 int sched_update_scaling(void)
1024 {
1025 unsigned int factor = get_update_sysctl_factor();
1026
1027 #define WRT_SYSCTL(name) \
1028 (normalized_sysctl_##name = sysctl_##name / (factor))
1029 WRT_SYSCTL(sched_base_slice);
1030 #undef WRT_SYSCTL
1031
1032 return 0;
1033 }
1034
1035 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
1036
1037 /*
1038 * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
1039 * this is probably good enough.
1040 */
1041 static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
1042 {
1043 if ((s64)(se->vruntime - se->deadline) < 0)
1044 return false;
1045
1046 /*
1047 * For EEVDF the virtual time slope is determined by w_i (iow.
1048 * nice) while the request time r_i is determined by
1049 * sysctl_sched_base_slice.
1050 */
1051 if (!se->custom_slice)
1052 se->slice = sysctl_sched_base_slice;
1053
1054 /*
1055 * EEVDF: vd_i = ve_i + r_i / w_i
1056 */
1057 se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
1058
1059 /*
1060 * The task has consumed its request, reschedule.
1061 */
1062 return true;
1063 }
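/*
 * Illustrative example (not part of the original source): for a nice-0
 * entity calc_delta_fair() is an identity, so with the default 0.70 msec
 * base slice the new deadline is vruntime + 0.70 msec. An entity with
 * twice the weight gets vruntime + 0.35 msec of virtual time, which still
 * corresponds to ~0.70 msec of wall-clock service because its vruntime
 * also advances at half speed.
 */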
1064
1065 #include "pelt.h"
1066
1067 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
1068 static unsigned long task_h_load(struct task_struct *p);
1069 static unsigned long capacity_of(int cpu);
1070
1071 /* Give a new sched_entity initial runnable values so its load looks heavy while it is young */
1072 void init_entity_runnable_average(struct sched_entity *se)
1073 {
1074 struct sched_avg *sa = &se->avg;
1075
1076 memset(sa, 0, sizeof(*sa));
1077
1078 /*
1079 * Tasks are initialized with full load to be seen as heavy tasks until
1080 * they get a chance to stabilize to their real load level.
1081 * Group entities are initialized with zero load to reflect the fact that
1082 * nothing has been attached to the task group yet.
1083 */
1084 if (entity_is_task(se))
1085 sa->load_avg = scale_load_down(se->load.weight);
1086
1087 /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
1088 }
1089
1090 /*
1091 * With new tasks being created, their initial util_avgs are extrapolated
1092 * based on the cfs_rq's current util_avg:
1093 *
1094 * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1)
1095 * * se_weight(se)
1096 *
1097 * However, in many cases, the above util_avg does not give a desired
1098 * value. Moreover, the sum of the util_avgs may be divergent, such
1099 * as when the series is a harmonic series.
1100 *
1101 * To solve this problem, we also cap the util_avg of successive tasks to
1102 * only 1/2 of the left utilization budget:
1103 *
1104 * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
1105 *
1106 * where n denotes the nth task and cpu_scale the CPU capacity.
1107 *
1108 * For example, for a CPU with 1024 of capacity, a simplest series from
1109 * the beginning would be like:
1110 *
1111 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
1112 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
1113 *
1114 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
1115 * if util_avg > util_avg_cap.
1116 */
1117 void post_init_entity_util_avg(struct task_struct *p)
1118 {
1119 struct sched_entity *se = &p->se;
1120 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1121 struct sched_avg *sa = &se->avg;
1122 long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
1123 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
1124
1125 if (p->sched_class != &fair_sched_class) {
1126 /*
1127 * For !fair tasks do:
1128 *
1129 update_cfs_rq_load_avg(now, cfs_rq);
1130 attach_entity_load_avg(cfs_rq, se);
1131 switched_from_fair(rq, p);
1132 *
1133 * such that the next switched_to_fair() has the
1134 * expected state.
1135 */
1136 se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
1137 return;
1138 }
1139
1140 if (cap > 0) {
1141 if (cfs_rq->avg.util_avg != 0) {
1142 sa->util_avg = cfs_rq->avg.util_avg * se_weight(se);
1143 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
1144
1145 if (sa->util_avg > cap)
1146 sa->util_avg = cap;
1147 } else {
1148 sa->util_avg = cap;
1149 }
1150 }
1151
1152 sa->runnable_avg = sa->util_avg;
1153 }
1154
1155 static s64 update_se(struct rq *rq, struct sched_entity *se)
1156 {
1157 u64 now = rq_clock_task(rq);
1158 s64 delta_exec;
1159
1160 delta_exec = now - se->exec_start;
1161 if (unlikely(delta_exec <= 0))
1162 return delta_exec;
1163
1164 se->exec_start = now;
1165 if (entity_is_task(se)) {
1166 struct task_struct *donor = task_of(se);
1167 struct task_struct *running = rq->curr;
1168 /*
1169 * If se is a task, we account the time against the running
1170 * task, as w/ proxy-exec they may not be the same.
1171 */
1172 running->se.exec_start = now;
1173 running->se.sum_exec_runtime += delta_exec;
1174
1175 trace_sched_stat_runtime(running, delta_exec);
1176 account_group_exec_runtime(running, delta_exec);
1177
1178 /* cgroup time is always accounted against the donor */
1179 cgroup_account_cputime(donor, delta_exec);
1180 } else {
1181 /* If not task, account the time against donor se */
1182 se->sum_exec_runtime += delta_exec;
1183 }
1184
1185 if (schedstat_enabled()) {
1186 struct sched_statistics *stats;
1187
1188 stats = __schedstats_from_se(se);
1189 __schedstat_set(stats->exec_max,
1190 max(delta_exec, stats->exec_max));
1191 }
1192
1193 return delta_exec;
1194 }
1195
1196 /*
1197 * Used by other classes to account runtime.
1198 */
1199 s64 update_curr_common(struct rq *rq)
1200 {
1201 return update_se(rq, &rq->donor->se);
1202 }
1203
1204 /*
1205 * Update the current task's runtime statistics.
1206 */
1207 static void update_curr(struct cfs_rq *cfs_rq)
1208 {
1209 /*
1210 * Note: cfs_rq->curr corresponds to the task picked to
1211 * run (ie: rq->donor.se) which due to proxy-exec may
1212 * not necessarily be the actual task running
1213 * (rq->curr.se). This is easy to confuse!
1214 */
1215 struct sched_entity *curr = cfs_rq->curr;
1216 struct rq *rq = rq_of(cfs_rq);
1217 s64 delta_exec;
1218 bool resched;
1219
1220 if (unlikely(!curr))
1221 return;
1222
1223 delta_exec = update_se(rq, curr);
1224 if (unlikely(delta_exec <= 0))
1225 return;
1226
1227 curr->vruntime += calc_delta_fair(delta_exec, curr);
1228 resched = update_deadline(cfs_rq, curr);
1229 update_min_vruntime(cfs_rq);
1230
1231 if (entity_is_task(curr)) {
1232 /*
1233 * If the fair_server is active, we need to account for the
1234 * fair_server time whether or not the task is running on
1235 * behalf of fair_server or not:
1236 * - If the task is running on behalf of fair_server, we need
1237 * to limit its time based on the assigned runtime.
1238 * - Fair task that runs outside of fair_server should account
1239 * against fair_server such that it can account for this time
1240 * and possibly avoid running this period.
1241 */
1242 if (dl_server_active(&rq->fair_server))
1243 dl_server_update(&rq->fair_server, delta_exec);
1244 }
1245
1246 account_cfs_rq_runtime(cfs_rq, delta_exec);
1247
1248 if (cfs_rq->nr_queued == 1)
1249 return;
1250
1251 if (resched || !protect_slice(curr)) {
1252 resched_curr_lazy(rq);
1253 clear_buddies(cfs_rq, curr);
1254 }
1255 }
1256
1257 static void update_curr_fair(struct rq *rq)
1258 {
1259 update_curr(cfs_rq_of(&rq->donor->se));
1260 }
1261
1262 static inline void
1263 update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1264 {
1265 struct sched_statistics *stats;
1266 struct task_struct *p = NULL;
1267
1268 if (!schedstat_enabled())
1269 return;
1270
1271 stats = __schedstats_from_se(se);
1272
1273 if (entity_is_task(se))
1274 p = task_of(se);
1275
1276 __update_stats_wait_start(rq_of(cfs_rq), p, stats);
1277 }
1278
1279 static inline void
1280 update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1281 {
1282 struct sched_statistics *stats;
1283 struct task_struct *p = NULL;
1284
1285 if (!schedstat_enabled())
1286 return;
1287
1288 stats = __schedstats_from_se(se);
1289
1290 /*
1291 * When sched_schedstat changes from 0 to 1, some sched entities
1292 * may already be in the runqueue with se->statistics.wait_start
1293 * still 0, which would make the computed delta wrong. We need to
1294 * avoid this scenario.
1295 */
1296 if (unlikely(!schedstat_val(stats->wait_start)))
1297 return;
1298
1299 if (entity_is_task(se))
1300 p = task_of(se);
1301
1302 __update_stats_wait_end(rq_of(cfs_rq), p, stats);
1303 }
1304
1305 static inline void
1306 update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1307 {
1308 struct sched_statistics *stats;
1309 struct task_struct *tsk = NULL;
1310
1311 if (!schedstat_enabled())
1312 return;
1313
1314 stats = __schedstats_from_se(se);
1315
1316 if (entity_is_task(se))
1317 tsk = task_of(se);
1318
1319 __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
1320 }
1321
1322 /*
1323 * Task is being enqueued - update stats:
1324 */
1325 static inline void
1326 update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1327 {
1328 if (!schedstat_enabled())
1329 return;
1330
1331 /*
1332 * Are we enqueueing a waiting task? (for current tasks
1333 * a dequeue/enqueue event is a NOP)
1334 */
1335 if (se != cfs_rq->curr)
1336 update_stats_wait_start_fair(cfs_rq, se);
1337
1338 if (flags & ENQUEUE_WAKEUP)
1339 update_stats_enqueue_sleeper_fair(cfs_rq, se);
1340 }
1341
1342 static inline void
1343 update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1344 {
1345
1346 if (!schedstat_enabled())
1347 return;
1348
1349 /*
1350 * Mark the end of the wait period if dequeueing a
1351 * waiting task:
1352 */
1353 if (se != cfs_rq->curr)
1354 update_stats_wait_end_fair(cfs_rq, se);
1355
1356 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1357 struct task_struct *tsk = task_of(se);
1358 unsigned int state;
1359
1360 /* XXX racy against TTWU */
1361 state = READ_ONCE(tsk->__state);
1362 if (state & TASK_INTERRUPTIBLE)
1363 __schedstat_set(tsk->stats.sleep_start,
1364 rq_clock(rq_of(cfs_rq)));
1365 if (state & TASK_UNINTERRUPTIBLE)
1366 __schedstat_set(tsk->stats.block_start,
1367 rq_clock(rq_of(cfs_rq)));
1368 }
1369 }
1370
1371 /*
1372 * We are picking a new current task - update its stats:
1373 */
1374 static inline void
1375 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1376 {
1377 /*
1378 * We are starting a new run period:
1379 */
1380 se->exec_start = rq_clock_task(rq_of(cfs_rq));
1381 }
1382
1383 /**************************************************
1384 * Scheduling class queueing methods:
1385 */
1386
1387 static inline bool is_core_idle(int cpu)
1388 {
1389 #ifdef CONFIG_SCHED_SMT
1390 int sibling;
1391
1392 for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1393 if (cpu == sibling)
1394 continue;
1395
1396 if (!idle_cpu(sibling))
1397 return false;
1398 }
1399 #endif
1400
1401 return true;
1402 }
1403
1404 #ifdef CONFIG_NUMA
1405 #define NUMA_IMBALANCE_MIN 2
1406
1407 static inline long
1408 adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
1409 {
1410 /*
1411 * Allow a NUMA imbalance if the number of busy CPUs is below the maximum
1412 * threshold. Above this threshold, individual tasks may be contending
1413 * for both memory bandwidth and any shared HT resources. This is an
1414 * approximation as the number of running tasks may not be related to
1415 * the number of busy CPUs due to sched_setaffinity.
1416 */
1417 if (dst_running > imb_numa_nr)
1418 return imbalance;
1419
1420 /*
1421 * Allow a small imbalance based on a simple pair of communicating
1422 * tasks that remain local when the destination is lightly loaded.
1423 */
1424 if (imbalance <= NUMA_IMBALANCE_MIN)
1425 return 0;
1426
1427 return imbalance;
1428 }
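/*
 * Illustrative example (not part of the original source): with
 * imb_numa_nr = 4, a destination running 6 tasks returns the imbalance
 * unchanged; a lightly loaded destination (dst_running = 2) with an
 * imbalance of 2 (<= NUMA_IMBALANCE_MIN) returns 0, i.e. the small
 * imbalance from a pair of communicating tasks is tolerated, while an
 * imbalance of 5 is still reported as 5.
 */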
1429 #endif /* CONFIG_NUMA */
1430
1431 #ifdef CONFIG_NUMA_BALANCING
1432 /*
1433 * Approximate time to scan a full NUMA task in ms. The task scan period is
1434 * calculated based on the task's virtual memory size and
1435 * numa_balancing_scan_size.
1436 */
1437 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1438 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1439
1440 /* Portion of address space to scan in MB */
1441 unsigned int sysctl_numa_balancing_scan_size = 256;
1442
1443 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1444 unsigned int sysctl_numa_balancing_scan_delay = 1000;
1445
1446 /* The page with hint page fault latency < threshold in ms is considered hot */
1447 unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
1448
1449 struct numa_group {
1450 refcount_t refcount;
1451
1452 spinlock_t lock; /* nr_tasks, tasks */
1453 int nr_tasks;
1454 pid_t gid;
1455 int active_nodes;
1456
1457 struct rcu_head rcu;
1458 unsigned long total_faults;
1459 unsigned long max_faults_cpu;
1460 /*
1461 * faults[] array is split into two regions: faults_mem and faults_cpu.
1462 *
1463 * Faults_cpu is used to decide whether memory should move
1464 * towards the CPU. As a consequence, these stats are weighted
1465 * more by CPU use than by memory faults.
1466 */
1467 unsigned long faults[];
1468 };
1469
1470 /*
1471 * For functions that can be called in multiple contexts that permit reading
1472 * ->numa_group (see struct task_struct for locking rules).
1473 */
1474 static struct numa_group *deref_task_numa_group(struct task_struct *p)
1475 {
1476 return rcu_dereference_check(p->numa_group, p == current ||
1477 (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
1478 }
1479
1480 static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1481 {
1482 return rcu_dereference_protected(p->numa_group, p == current);
1483 }
1484
1485 static inline unsigned long group_faults_priv(struct numa_group *ng);
1486 static inline unsigned long group_faults_shared(struct numa_group *ng);
1487
1488 static unsigned int task_nr_scan_windows(struct task_struct *p)
1489 {
1490 unsigned long rss = 0;
1491 unsigned long nr_scan_pages;
1492
1493 /*
1494 * Calculations based on RSS as non-present and empty pages are skipped
1495 * by the PTE scanner and NUMA hinting faults should be trapped based
1496 * on resident pages
1497 */
1498 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1499 rss = get_mm_rss(p->mm);
1500 if (!rss)
1501 rss = nr_scan_pages;
1502
1503 rss = round_up(rss, nr_scan_pages);
1504 return rss / nr_scan_pages;
1505 }
1506
1507 /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1508 #define MAX_SCAN_WINDOW 2560
1509
1510 static unsigned int task_scan_min(struct task_struct *p)
1511 {
1512 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1513 unsigned int scan, floor;
1514 unsigned int windows = 1;
1515
1516 if (scan_size < MAX_SCAN_WINDOW)
1517 windows = MAX_SCAN_WINDOW / scan_size;
1518 floor = 1000 / windows;
1519
1520 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1521 return max_t(unsigned int, floor, scan);
1522 }
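/*
 * Illustrative example (not part of the original source, assumes 4KB
 * pages and the defaults above): scan_size = 256MB gives nr_scan_pages
 * = 65536, so a task with 4GB of RSS needs 16 windows and
 * scan_period_min / 16 = 62 msec; the floor is 1000 / (2560 / 256) =
 * 100 msec, so task_scan_min() returns 100 msec.
 */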
1523
1524 static unsigned int task_scan_start(struct task_struct *p)
1525 {
1526 unsigned long smin = task_scan_min(p);
1527 unsigned long period = smin;
1528 struct numa_group *ng;
1529
1530 /* Scale the maximum scan period with the amount of shared memory. */
1531 rcu_read_lock();
1532 ng = rcu_dereference(p->numa_group);
1533 if (ng) {
1534 unsigned long shared = group_faults_shared(ng);
1535 unsigned long private = group_faults_priv(ng);
1536
1537 period *= refcount_read(&ng->refcount);
1538 period *= shared + 1;
1539 period /= private + shared + 1;
1540 }
1541 rcu_read_unlock();
1542
1543 return max(smin, period);
1544 }
1545
1546 static unsigned int task_scan_max(struct task_struct *p)
1547 {
1548 unsigned long smin = task_scan_min(p);
1549 unsigned long smax;
1550 struct numa_group *ng;
1551
1552 /* Watch for min being lower than max due to floor calculations */
1553 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1554
1555 /* Scale the maximum scan period with the amount of shared memory. */
1556 ng = deref_curr_numa_group(p);
1557 if (ng) {
1558 unsigned long shared = group_faults_shared(ng);
1559 unsigned long private = group_faults_priv(ng);
1560 unsigned long period = smax;
1561
1562 period *= refcount_read(&ng->refcount);
1563 period *= shared + 1;
1564 period /= private + shared + 1;
1565
1566 smax = max(smax, period);
1567 }
1568
1569 return max(smin, smax);
1570 }
1571
1572 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1573 {
1574 rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1575 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1576 }
1577
1578 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1579 {
1580 rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1581 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1582 }
1583
1584 /* Shared or private faults. */
1585 #define NR_NUMA_HINT_FAULT_TYPES 2
1586
1587 /* Memory and CPU locality */
1588 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1589
1590 /* Averaged statistics, and temporary buffers. */
1591 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1592
1593 pid_t task_numa_group_id(struct task_struct *p)
1594 {
1595 struct numa_group *ng;
1596 pid_t gid = 0;
1597
1598 rcu_read_lock();
1599 ng = rcu_dereference(p->numa_group);
1600 if (ng)
1601 gid = ng->gid;
1602 rcu_read_unlock();
1603
1604 return gid;
1605 }
1606
1607 /*
1608 * The averaged statistics, shared & private, memory & CPU,
1609 * occupy the first half of the array. The second half of the
1610 * array is for current counters, which are averaged into the
1611 * first set by task_numa_placement.
1612 */
1613 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1614 {
1615 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1616 }
1617
1618 static inline unsigned long task_faults(struct task_struct *p, int nid)
1619 {
1620 if (!p->numa_faults)
1621 return 0;
1622
1623 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1624 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1625 }
1626
1627 static inline unsigned long group_faults(struct task_struct *p, int nid)
1628 {
1629 struct numa_group *ng = deref_task_numa_group(p);
1630
1631 if (!ng)
1632 return 0;
1633
1634 return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1635 ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1636 }
1637
1638 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1639 {
1640 return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] +
1641 group->faults[task_faults_idx(NUMA_CPU, nid, 1)];
1642 }
1643
1644 static inline unsigned long group_faults_priv(struct numa_group *ng)
1645 {
1646 unsigned long faults = 0;
1647 int node;
1648
1649 for_each_online_node(node) {
1650 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1651 }
1652
1653 return faults;
1654 }
1655
1656 static inline unsigned long group_faults_shared(struct numa_group *ng)
1657 {
1658 unsigned long faults = 0;
1659 int node;
1660
1661 for_each_online_node(node) {
1662 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1663 }
1664
1665 return faults;
1666 }
1667
1668 /*
1669 * A node triggering more than 1/3 as many NUMA faults as the maximum is
1670 * considered part of a numa group's pseudo-interleaving set. Migrations
1671 * between these nodes are slowed down, to allow things to settle down.
1672 */
1673 #define ACTIVE_NODE_FRACTION 3
1674
1675 static bool numa_is_active_node(int nid, struct numa_group *ng)
1676 {
1677 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1678 }
1679
1680 /* Handle placement on systems where not all nodes are directly connected. */
1681 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1682 int lim_dist, bool task)
1683 {
1684 unsigned long score = 0;
1685 int node, max_dist;
1686
1687 /*
1688 * All nodes are directly connected, and the same distance
1689 * from each other. No need for fancy placement algorithms.
1690 */
1691 if (sched_numa_topology_type == NUMA_DIRECT)
1692 return 0;
1693
1694 /* sched_max_numa_distance may be changed in parallel. */
1695 max_dist = READ_ONCE(sched_max_numa_distance);
1696 /*
1697 * This code is called for each node, introducing N^2 complexity,
1698 * which should be OK given the number of nodes rarely exceeds 8.
1699 */
1700 for_each_online_node(node) {
1701 unsigned long faults;
1702 int dist = node_distance(nid, node);
1703
1704 /*
1705 * The furthest away nodes in the system are not interesting
1706 * for placement; nid was already counted.
1707 */
1708 if (dist >= max_dist || node == nid)
1709 continue;
1710
1711 /*
1712 * On systems with a backplane NUMA topology, compare groups
1713 * of nodes, and move tasks towards the group with the most
1714 * memory accesses. When comparing two nodes at distance
1715 * "hoplimit", only nodes closer by than "hoplimit" are part
1716 * of each group. Skip other nodes.
1717 */
1718 if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
1719 continue;
1720
1721 /* Add up the faults from nearby nodes. */
1722 if (task)
1723 faults = task_faults(p, node);
1724 else
1725 faults = group_faults(p, node);
1726
1727 /*
1728 * On systems with a glueless mesh NUMA topology, there are
1729 * no fixed "groups of nodes". Instead, nodes that are not
1730 * directly connected bounce traffic through intermediate
1731 * nodes; a numa_group can occupy any set of nodes.
1732 * The further away a node is, the less the faults count.
1733 * This seems to result in good task placement.
1734 */
1735 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1736 faults *= (max_dist - dist);
1737 faults /= (max_dist - LOCAL_DISTANCE);
1738 }
1739
1740 score += faults;
1741 }
1742
1743 return score;
1744 }
1745
1746 /*
1747 * These return the fraction of accesses done by a particular task, or
1748 * task group, on a particular numa node. The group weight is given a
1749 * larger multiplier, in order to group tasks together that are almost
1750 * evenly spread out between numa nodes.
1751 */
1752 static inline unsigned long task_weight(struct task_struct *p, int nid,
1753 int dist)
1754 {
1755 unsigned long faults, total_faults;
1756
1757 if (!p->numa_faults)
1758 return 0;
1759
1760 total_faults = p->total_numa_faults;
1761
1762 if (!total_faults)
1763 return 0;
1764
1765 faults = task_faults(p, nid);
1766 faults += score_nearby_nodes(p, nid, dist, true);
1767
1768 return 1000 * faults / total_faults;
1769 }
1770
1771 static inline unsigned long group_weight(struct task_struct *p, int nid,
1772 int dist)
1773 {
1774 struct numa_group *ng = deref_task_numa_group(p);
1775 unsigned long faults, total_faults;
1776
1777 if (!ng)
1778 return 0;
1779
1780 total_faults = ng->total_faults;
1781
1782 if (!total_faults)
1783 return 0;
1784
1785 faults = group_faults(p, nid);
1786 faults += score_nearby_nodes(p, nid, dist, false);
1787
1788 return 1000 * faults / total_faults;
1789 }
1790
1791 /*
1792 * If memory tiering mode is enabled, the cpupid of a slow memory page is
1793 * used to record scan time instead of CPU and PID. When tiering mode
1794 * is disabled at run time, the scan time (stored in the cpupid) will be
1795 * interpreted as CPU and PID, so the CPU must be checked to avoid an
1796 * out-of-bounds array access.
1797 */
1798 static inline bool cpupid_valid(int cpupid)
1799 {
1800 return cpupid_to_cpu(cpupid) < nr_cpu_ids;
1801 }
1802
1803 /*
1804 * For memory tiering mode, if the fast memory node has enough free
1805 * pages (more than the "enough" watermark defined here), all recently
1806 * accessed slow memory pages will be migrated to the fast memory node
1807 * without considering the hot threshold, so as to take full advantage
1808 * of the fast memory capacity.
1809 */
1810 static bool pgdat_free_space_enough(struct pglist_data *pgdat)
1811 {
1812 int z;
1813 unsigned long enough_wmark;
1814
1815 enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
1816 pgdat->node_present_pages >> 4);
1817 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1818 struct zone *zone = pgdat->node_zones + z;
1819
1820 if (!populated_zone(zone))
1821 continue;
1822
1823 if (zone_watermark_ok(zone, 0,
1824 promo_wmark_pages(zone) + enough_wmark,
1825 ZONE_MOVABLE, 0))
1826 return true;
1827 }
1828 return false;
1829 }
1830
1831 /*
1832 * For memory tiering mode, when page tables are scanned, the scan
1833 * time is recorded in the struct page in addition to making the
1834 * slow memory page PROT_NONE. So when the page is accessed, the
1835 * hint page fault handler calculates the hint page fault latency
1836 * as,
1837 *
1838 * hint page fault latency = hint page fault time - scan time
1839 *
1840 * The smaller the hint page fault latency, the higher the possibility
1841 * for the page to be hot.
1842 */
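/*
 * Both times are stored in a reduced number of bits, so the difference is
 * masked with PAGE_ACCESS_TIME_MASK to stay sane when the stored time wraps.
 */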
1843 static int numa_hint_fault_latency(struct folio *folio)
1844 {
1845 int last_time, time;
1846
1847 time = jiffies_to_msecs(jiffies);
1848 last_time = folio_xchg_access_time(folio, time);
1849
1850 return (time - last_time) & PAGE_ACCESS_TIME_MASK;
1851 }
1852
1853 /*
1854 * For memory tiering mode, too high promotion/demotion throughput may
1855 * hurt application latency. So we provide a mechanism to rate limit
1856 * the number of pages that we try to promote.
1857 */
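/*
 * Illustration with made-up numbers: with a rate_limit of 65536 pages
 * (256MB of 4KB pages), at most roughly that many candidate pages are
 * admitted for promotion in each one second window tracked via
 * pgdat->nbp_rl_start and pgdat->nbp_rl_nr_cand.
 */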
1858 static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
1859 unsigned long rate_limit, int nr)
1860 {
1861 unsigned long nr_cand;
1862 unsigned int now, start;
1863
1864 now = jiffies_to_msecs(jiffies);
1865 mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
1866 nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
1867 start = pgdat->nbp_rl_start;
1868 if (now - start > MSEC_PER_SEC &&
1869 cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
1870 pgdat->nbp_rl_nr_cand = nr_cand;
1871 if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
1872 return true;
1873 return false;
1874 }
1875
1876 #define NUMA_MIGRATION_ADJUST_STEPS 16
1877
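/*
 * Every sysctl_numa_balancing_scan_period_max msecs, compare the number of
 * promotion candidates seen in the last period against the (period-scaled)
 * rate limit and nudge the hot threshold in steps of
 * ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS: lower it when candidates exceed
 * ~110% of the limit, raise it (capped at ref_th * 2) when they fall below ~90%.
 */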
1878 static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
1879 unsigned long rate_limit,
1880 unsigned int ref_th)
1881 {
1882 unsigned int now, start, th_period, unit_th, th;
1883 unsigned long nr_cand, ref_cand, diff_cand;
1884
1885 now = jiffies_to_msecs(jiffies);
1886 th_period = sysctl_numa_balancing_scan_period_max;
1887 start = pgdat->nbp_th_start;
1888 if (now - start > th_period &&
1889 cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
1890 ref_cand = rate_limit *
1891 sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
1892 nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
1893 diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
1894 unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
1895 th = pgdat->nbp_threshold ? : ref_th;
1896 if (diff_cand > ref_cand * 11 / 10)
1897 th = max(th - unit_th, unit_th);
1898 else if (diff_cand < ref_cand * 9 / 10)
1899 th = min(th + unit_th, ref_th * 2);
1900 pgdat->nbp_th_nr_cand = nr_cand;
1901 pgdat->nbp_threshold = th;
1902 }
1903 }
1904
1905 bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
1906 int src_nid, int dst_cpu)
1907 {
1908 struct numa_group *ng = deref_curr_numa_group(p);
1909 int dst_nid = cpu_to_node(dst_cpu);
1910 int last_cpupid, this_cpupid;
1911
1912 /*
1913 * Cannot migrate to memoryless nodes.
1914 */
1915 if (!node_state(dst_nid, N_MEMORY))
1916 return false;
1917
1918 /*
1919 * The pages in slow memory node should be migrated according
1920 * to hot/cold instead of private/shared.
1921 */
1922 if (folio_use_access_time(folio)) {
1923 struct pglist_data *pgdat;
1924 unsigned long rate_limit;
1925 unsigned int latency, th, def_th;
1926
1927 pgdat = NODE_DATA(dst_nid);
1928 if (pgdat_free_space_enough(pgdat)) {
1929 /* workload changed, reset hot threshold */
1930 pgdat->nbp_threshold = 0;
1931 return true;
1932 }
1933
1934 def_th = sysctl_numa_balancing_hot_threshold;
1935 rate_limit = sysctl_numa_balancing_promote_rate_limit << \
1936 (20 - PAGE_SHIFT);
1937 numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
1938
1939 th = pgdat->nbp_threshold ? : def_th;
1940 latency = numa_hint_fault_latency(folio);
1941 if (latency >= th)
1942 return false;
1943
1944 return !numa_promotion_rate_limit(pgdat, rate_limit,
1945 folio_nr_pages(folio));
1946 }
1947
1948 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1949 last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);
1950
1951 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
1952 !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
1953 return false;
1954
1955 /*
1956 * Allow first faults or private faults to migrate immediately early in
1957 * the lifetime of a task. The magic number 4 is based on waiting for
1958 * two full passes of the "multi-stage node selection" test that is
1959 * executed below.
1960 */
1961 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1962 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1963 return true;
1964
1965 /*
1966 * Multi-stage node selection is used in conjunction with a periodic
1967 * migration fault to build a temporal task<->page relation. By using
1968 * a two-stage filter we remove short/unlikely relations.
1969 *
1970 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1971 * a task's usage of a particular page (n_p) per total usage of this
1972 * page (n_t) (in a given time-span) to a probability.
1973 *
1974 * Our periodic faults will sample this probability and getting the
1975 * same result twice in a row, given these samples are fully
1976 * independent, is then given by P(n)^2, provided our sample period
1977 * is sufficiently short compared to the usage pattern.
1978 *
1979 * This quadratic squishes small probabilities, making it less likely we
1980 * act on an unlikely task<->page relation.
1981 */
1982 if (!cpupid_pid_unset(last_cpupid) &&
1983 cpupid_to_nid(last_cpupid) != dst_nid)
1984 return false;
1985
1986 /* Always allow migrate on private faults */
1987 if (cpupid_match_pid(p, last_cpupid))
1988 return true;
1989
1990 /* A shared fault, but p->numa_group has not been set up yet. */
1991 if (!ng)
1992 return true;
1993
1994 /*
1995 * Destination node is much more heavily used than the source
1996 * node? Allow migration.
1997 */
1998 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1999 ACTIVE_NODE_FRACTION)
2000 return true;
2001
2002 /*
2003 * Distribute memory according to CPU & memory use on each node,
2004 * with 3/4 hysteresis to avoid unnecessary memory migrations:
2005 *
2006 * faults_cpu(dst) 3 faults_cpu(src)
2007 * --------------- * - > ---------------
2008 * faults_mem(dst) 4 faults_mem(src)
2009 */
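/*
 * Worked example with made-up numbers: faults_cpu(dst) = 60,
 * faults_mem(dst) = 40, faults_cpu(src) = 30, faults_mem(src) = 40 gives
 * 60/40 * 3/4 = 1.125 > 30/40 = 0.75, so the migration is allowed.
 */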
2010 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
2011 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
2012 }
2013
2014 /*
2015 * 'numa_type' describes the node at the moment of load balancing.
2016 */
2017 enum numa_type {
2018 /* The node has spare capacity that can be used to run more tasks. */
2019 node_has_spare = 0,
2020 /*
2021 * The node is fully used and the tasks don't compete for more CPU
2022 * cycles. Nevertheless, some tasks might wait before running.
2023 */
2024 node_fully_busy,
2025 /*
2026 * The node is overloaded and can't provide expected CPU cycles to all
2027 * tasks.
2028 */
2029 node_overloaded
2030 };
2031
2032 /* Cached statistics for all CPUs within a node */
2033 struct numa_stats {
2034 unsigned long load;
2035 unsigned long runnable;
2036 unsigned long util;
2037 /* Total compute capacity of CPUs on a node */
2038 unsigned long compute_capacity;
2039 unsigned int nr_running;
2040 unsigned int weight;
2041 enum numa_type node_type;
2042 int idle_cpu;
2043 };
2044
2045 struct task_numa_env {
2046 struct task_struct *p;
2047
2048 int src_cpu, src_nid;
2049 int dst_cpu, dst_nid;
2050 int imb_numa_nr;
2051
2052 struct numa_stats src_stats, dst_stats;
2053
2054 int imbalance_pct;
2055 int dist;
2056
2057 struct task_struct *best_task;
2058 long best_imp;
2059 int best_cpu;
2060 };
2061
2062 static unsigned long cpu_load(struct rq *rq);
2063 static unsigned long cpu_runnable(struct rq *rq);
2064
2065 static inline enum
2066 numa_type numa_classify(unsigned int imbalance_pct,
2067 struct numa_stats *ns)
2068 {
2069 if ((ns->nr_running > ns->weight) &&
2070 (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
2071 ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
2072 return node_overloaded;
2073
2074 if ((ns->nr_running < ns->weight) ||
2075 (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
2076 ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
2077 return node_has_spare;
2078
2079 return node_fully_busy;
2080 }
2081
2082 #ifdef CONFIG_SCHED_SMT
2083 /* Forward declarations of select_idle_sibling helpers */
2084 static inline bool test_idle_cores(int cpu);
2085 static inline int numa_idle_core(int idle_core, int cpu)
2086 {
2087 if (!static_branch_likely(&sched_smt_present) ||
2088 idle_core >= 0 || !test_idle_cores(cpu))
2089 return idle_core;
2090
2091 /*
2092 * Prefer cores instead of packing HT siblings
2093 * and triggering future load balancing.
2094 */
2095 if (is_core_idle(cpu))
2096 idle_core = cpu;
2097
2098 return idle_core;
2099 }
2100 #else /* !CONFIG_SCHED_SMT: */
2101 static inline int numa_idle_core(int idle_core, int cpu)
2102 {
2103 return idle_core;
2104 }
2105 #endif /* !CONFIG_SCHED_SMT */
2106
2107 /*
2108 * Gather all necessary information to make NUMA balancing placement
2109 * decisions that are compatible with the standard load balancer. This
2110 * borrows code and logic from update_sg_lb_stats, but sharing a
2111 * common implementation is impractical.
2112 */
2113 static void update_numa_stats(struct task_numa_env *env,
2114 struct numa_stats *ns, int nid,
2115 bool find_idle)
2116 {
2117 int cpu, idle_core = -1;
2118
2119 memset(ns, 0, sizeof(*ns));
2120 ns->idle_cpu = -1;
2121
2122 rcu_read_lock();
2123 for_each_cpu(cpu, cpumask_of_node(nid)) {
2124 struct rq *rq = cpu_rq(cpu);
2125
2126 ns->load += cpu_load(rq);
2127 ns->runnable += cpu_runnable(rq);
2128 ns->util += cpu_util_cfs(cpu);
2129 ns->nr_running += rq->cfs.h_nr_runnable;
2130 ns->compute_capacity += capacity_of(cpu);
2131
2132 if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
2133 if (READ_ONCE(rq->numa_migrate_on) ||
2134 !cpumask_test_cpu(cpu, env->p->cpus_ptr))
2135 continue;
2136
2137 if (ns->idle_cpu == -1)
2138 ns->idle_cpu = cpu;
2139
2140 idle_core = numa_idle_core(idle_core, cpu);
2141 }
2142 }
2143 rcu_read_unlock();
2144
2145 ns->weight = cpumask_weight(cpumask_of_node(nid));
2146
2147 ns->node_type = numa_classify(env->imbalance_pct, ns);
2148
2149 if (idle_core >= 0)
2150 ns->idle_cpu = idle_core;
2151 }
2152
2153 static void task_numa_assign(struct task_numa_env *env,
2154 struct task_struct *p, long imp)
2155 {
2156 struct rq *rq = cpu_rq(env->dst_cpu);
2157
2158 /* Check if run-queue part of active NUMA balance. */
2159 if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
2160 int cpu;
2161 int start = env->dst_cpu;
2162
2163 /* Find alternative idle CPU. */
2164 for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) {
2165 if (cpu == env->best_cpu || !idle_cpu(cpu) ||
2166 !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
2167 continue;
2168 }
2169
2170 env->dst_cpu = cpu;
2171 rq = cpu_rq(env->dst_cpu);
2172 if (!xchg(&rq->numa_migrate_on, 1))
2173 goto assign;
2174 }
2175
2176 /* Failed to find an alternative idle CPU */
2177 return;
2178 }
2179
2180 assign:
2181 /*
2182 * Clear previous best_cpu/rq numa-migrate flag, since task now
2183 * found a better CPU to move/swap.
2184 */
2185 if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
2186 rq = cpu_rq(env->best_cpu);
2187 WRITE_ONCE(rq->numa_migrate_on, 0);
2188 }
2189
2190 if (env->best_task)
2191 put_task_struct(env->best_task);
2192 if (p)
2193 get_task_struct(p);
2194
2195 env->best_task = p;
2196 env->best_imp = imp;
2197 env->best_cpu = env->dst_cpu;
2198 }
2199
2200 static bool load_too_imbalanced(long src_load, long dst_load,
2201 struct task_numa_env *env)
2202 {
2203 long imb, old_imb;
2204 long orig_src_load, orig_dst_load;
2205 long src_capacity, dst_capacity;
2206
2207 /*
2208 * The load is corrected for the CPU capacity available on each node.
2209 *
2210 * src_load dst_load
2211 * ------------ vs ---------
2212 * src_capacity dst_capacity
2213 */
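/*
 * The comparison is done by cross-multiplying so no division is needed:
 * |dst_load * src_capacity - src_load * dst_capacity| grows with the
 * capacity-normalized imbalance.
 */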
2214 src_capacity = env->src_stats.compute_capacity;
2215 dst_capacity = env->dst_stats.compute_capacity;
2216
2217 imb = abs(dst_load * src_capacity - src_load * dst_capacity);
2218
2219 orig_src_load = env->src_stats.load;
2220 orig_dst_load = env->dst_stats.load;
2221
2222 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
2223
2224 /* Would this change make things worse? */
2225 return (imb > old_imb);
2226 }
2227
2228 /*
2229 * Maximum NUMA importance can be 1998 (2*999);
2230 * SMALLIMP @ 30 would be close to 1998/64.
2231 * Used to deter task migration.
2232 */
2233 #define SMALLIMP 30
2234
2235 /*
2236 * This checks if the overall compute and NUMA accesses of the system would
2237 * be improved if the source task was migrated to the target dst_cpu, taking
2238 * into account that it might be best if the task running on the dst_cpu
2239 * were exchanged with the source task.
2240 */
2241 static bool task_numa_compare(struct task_numa_env *env,
2242 long taskimp, long groupimp, bool maymove)
2243 {
2244 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
2245 struct rq *dst_rq = cpu_rq(env->dst_cpu);
2246 long imp = p_ng ? groupimp : taskimp;
2247 struct task_struct *cur;
2248 long src_load, dst_load;
2249 int dist = env->dist;
2250 long moveimp = imp;
2251 long load;
2252 bool stopsearch = false;
2253
2254 if (READ_ONCE(dst_rq->numa_migrate_on))
2255 return false;
2256
2257 rcu_read_lock();
2258 cur = rcu_dereference(dst_rq->curr);
2259 if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) ||
2260 !cur->mm))
2261 cur = NULL;
2262
2263 /*
2264 * Because we have preemption enabled we can get migrated around and
2265 * end up trying to select ourselves (current == env->p) as a swap candidate.
2266 */
2267 if (cur == env->p) {
2268 stopsearch = true;
2269 goto unlock;
2270 }
2271
2272 if (!cur) {
2273 if (maymove && moveimp >= env->best_imp)
2274 goto assign;
2275 else
2276 goto unlock;
2277 }
2278
2279 /* Skip this swap candidate if cannot move to the source cpu. */
2280 if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
2281 goto unlock;
2282
2283 /*
2284 * Skip this swap candidate if it is not moving to its preferred
2285 * node and the best task is.
2286 */
2287 if (env->best_task &&
2288 env->best_task->numa_preferred_nid == env->src_nid &&
2289 cur->numa_preferred_nid != env->src_nid) {
2290 goto unlock;
2291 }
2292
2293 /*
2294 * "imp" is the fault differential for the source task between the
2295 * source and destination node. Calculate the total differential for
2296 * the source task and potential destination task. The more negative
2297 * the value is, the more remote accesses that would be expected to
2298 * be incurred if the tasks were swapped.
2299 *
2300 * If dst and source tasks are in the same NUMA group, or not
2301 * in any group then look only at task weights.
2302 */
2303 cur_ng = rcu_dereference(cur->numa_group);
2304 if (cur_ng == p_ng) {
2305 /*
2306 * Do not swap within a group or between tasks that have
2307 * no group if there is spare capacity. Swapping does
2308 * not address the load imbalance and helps one task at
2309 * the cost of punishing another.
2310 */
2311 if (env->dst_stats.node_type == node_has_spare)
2312 goto unlock;
2313
2314 imp = taskimp + task_weight(cur, env->src_nid, dist) -
2315 task_weight(cur, env->dst_nid, dist);
2316 /*
2317 * Add some hysteresis to prevent swapping the
2318 * tasks within a group over tiny differences.
2319 */
2320 if (cur_ng)
2321 imp -= imp / 16;
2322 } else {
2323 /*
2324 * Compare the group weights. If a task is all by itself
2325 * (not part of a group), use the task weight instead.
2326 */
2327 if (cur_ng && p_ng)
2328 imp += group_weight(cur, env->src_nid, dist) -
2329 group_weight(cur, env->dst_nid, dist);
2330 else
2331 imp += task_weight(cur, env->src_nid, dist) -
2332 task_weight(cur, env->dst_nid, dist);
2333 }
2334
2335 /* Discourage picking a task already on its preferred node */
2336 if (cur->numa_preferred_nid == env->dst_nid)
2337 imp -= imp / 16;
2338
2339 /*
2340 * Encourage picking a task that moves to its preferred node.
2341 * This potentially makes imp larger than its maximum of
2342 * 1998 (see SMALLIMP and task_weight for why) but in this
2343 * case, it does not matter.
2344 */
2345 if (cur->numa_preferred_nid == env->src_nid)
2346 imp += imp / 8;
2347
2348 if (maymove && moveimp > imp && moveimp > env->best_imp) {
2349 imp = moveimp;
2350 cur = NULL;
2351 goto assign;
2352 }
2353
2354 /*
2355 * Prefer swapping with a task moving to its preferred node over a
2356 * task that is not.
2357 */
2358 if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
2359 env->best_task->numa_preferred_nid != env->src_nid) {
2360 goto assign;
2361 }
2362
2363 /*
2364 * If the NUMA importance is less than SMALLIMP,
2365 * task migration might only result in ping pong
2366 * of tasks and also hurt performance due to cache
2367 * misses.
2368 */
2369 if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
2370 goto unlock;
2371
2372 /*
2373 * In the overloaded case, try and keep the load balanced.
2374 */
2375 load = task_h_load(env->p) - task_h_load(cur);
2376 if (!load)
2377 goto assign;
2378
2379 dst_load = env->dst_stats.load + load;
2380 src_load = env->src_stats.load - load;
2381
2382 if (load_too_imbalanced(src_load, dst_load, env))
2383 goto unlock;
2384
2385 assign:
2386 /* Evaluate an idle CPU for a task numa move. */
2387 if (!cur) {
2388 int cpu = env->dst_stats.idle_cpu;
2389
2390 /* Nothing cached so current CPU went idle since the search. */
2391 if (cpu < 0)
2392 cpu = env->dst_cpu;
2393
2394 /*
2395 * If the CPU is no longer truly idle and the previous best CPU
2396 * is, keep using it.
2397 */
2398 if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
2399 idle_cpu(env->best_cpu)) {
2400 cpu = env->best_cpu;
2401 }
2402
2403 env->dst_cpu = cpu;
2404 }
2405
2406 task_numa_assign(env, cur, imp);
2407
2408 /*
2409 * If a move to idle is allowed because there is capacity or load
2410 * balance improves then stop the search. While a better swap
2411 * candidate may exist, a search is not free.
2412 */
2413 if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
2414 stopsearch = true;
2415
2416 /*
2417 * If a swap candidate must be identified and the current best task
2418 * moves its preferred node then stop the search.
2419 */
2420 if (!maymove && env->best_task &&
2421 env->best_task->numa_preferred_nid == env->src_nid) {
2422 stopsearch = true;
2423 }
2424 unlock:
2425 rcu_read_unlock();
2426
2427 return stopsearch;
2428 }
2429
2430 static void task_numa_find_cpu(struct task_numa_env *env,
2431 long taskimp, long groupimp)
2432 {
2433 bool maymove = false;
2434 int cpu;
2435
2436 /*
2437 * If dst node has spare capacity, then check if there is an
2438 * imbalance that would be overruled by the load balancer.
2439 */
2440 if (env->dst_stats.node_type == node_has_spare) {
2441 unsigned int imbalance;
2442 int src_running, dst_running;
2443
2444 /*
2445 * Would movement cause an imbalance? Note that if src has
2446 * more running tasks, the imbalance is ignored, as the
2447 * move improves the imbalance from the perspective of the
2448 * CPU load balancer.
2449 */
2450 src_running = env->src_stats.nr_running - 1;
2451 dst_running = env->dst_stats.nr_running + 1;
2452 imbalance = max(0, dst_running - src_running);
2453 imbalance = adjust_numa_imbalance(imbalance, dst_running,
2454 env->imb_numa_nr);
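/*
 * For example (illustrative numbers only): moving the task from a node
 * running 4 tasks to one running 2 gives src_running = 3 and
 * dst_running = 3, i.e. no new imbalance, so an idle destination CPU
 * can be used directly.
 */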
2455
2456 /* Use idle CPU if there is no imbalance */
2457 if (!imbalance) {
2458 maymove = true;
2459 if (env->dst_stats.idle_cpu >= 0) {
2460 env->dst_cpu = env->dst_stats.idle_cpu;
2461 task_numa_assign(env, NULL, 0);
2462 return;
2463 }
2464 }
2465 } else {
2466 long src_load, dst_load, load;
2467 /*
2468 * If the improvement from simply moving env->p to the destination
2469 * is better than swapping tasks around, check if a move is possible.
2470 */
2471 load = task_h_load(env->p);
2472 dst_load = env->dst_stats.load + load;
2473 src_load = env->src_stats.load - load;
2474 maymove = !load_too_imbalanced(src_load, dst_load, env);
2475 }
2476
2477 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
2478 /* Skip this CPU if the source task cannot migrate */
2479 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
2480 continue;
2481
2482 env->dst_cpu = cpu;
2483 if (task_numa_compare(env, taskimp, groupimp, maymove))
2484 break;
2485 }
2486 }
2487
2488 static int task_numa_migrate(struct task_struct *p)
2489 {
2490 struct task_numa_env env = {
2491 .p = p,
2492
2493 .src_cpu = task_cpu(p),
2494 .src_nid = task_node(p),
2495
2496 .imbalance_pct = 112,
2497
2498 .best_task = NULL,
2499 .best_imp = 0,
2500 .best_cpu = -1,
2501 };
2502 unsigned long taskweight, groupweight;
2503 struct sched_domain *sd;
2504 long taskimp, groupimp;
2505 struct numa_group *ng;
2506 struct rq *best_rq;
2507 int nid, ret, dist;
2508
2509 /*
2510 * Pick the lowest SD_NUMA domain, as that would have the smallest
2511 * imbalance and would be the first to start moving tasks about.
2512 *
2513 * And we want to avoid any moving of tasks about, as that would create
2514 * random movement of tasks -- countering the numa conditions we're
2515 * trying to satisfy here.
2516 */
2517 rcu_read_lock();
2518 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
2519 if (sd) {
2520 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
2521 env.imb_numa_nr = sd->imb_numa_nr;
2522 }
2523 rcu_read_unlock();
2524
2525 /*
2526 * Cpusets can break the scheduler domain tree into smaller
2527 * balance domains, some of which do not cross NUMA boundaries.
2528 * Tasks that are "trapped" in such domains cannot be migrated
2529 * elsewhere, so there is no point in (re)trying.
2530 */
2531 if (unlikely(!sd)) {
2532 sched_setnuma(p, task_node(p));
2533 return -EINVAL;
2534 }
2535
2536 env.dst_nid = p->numa_preferred_nid;
2537 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
2538 taskweight = task_weight(p, env.src_nid, dist);
2539 groupweight = group_weight(p, env.src_nid, dist);
2540 update_numa_stats(&env, &env.src_stats, env.src_nid, false);
2541 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
2542 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
2543 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2544
2545 /* Try to find a spot on the preferred nid. */
2546 task_numa_find_cpu(&env, taskimp, groupimp);
2547
2548 /*
2549 * Look at other nodes in these cases:
2550 * - there is no space available on the preferred_nid
2551 * - the task is part of a numa_group that is interleaved across
2552 * multiple NUMA nodes; in order to better consolidate the group,
2553 * we need to check other locations.
2554 */
2555 ng = deref_curr_numa_group(p);
2556 if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
2557 for_each_node_state(nid, N_CPU) {
2558 if (nid == env.src_nid || nid == p->numa_preferred_nid)
2559 continue;
2560
2561 dist = node_distance(env.src_nid, env.dst_nid);
2562 if (sched_numa_topology_type == NUMA_BACKPLANE &&
2563 dist != env.dist) {
2564 taskweight = task_weight(p, env.src_nid, dist);
2565 groupweight = group_weight(p, env.src_nid, dist);
2566 }
2567
2568 /* Only consider nodes where both task and groups benefit */
2569 taskimp = task_weight(p, nid, dist) - taskweight;
2570 groupimp = group_weight(p, nid, dist) - groupweight;
2571 if (taskimp < 0 && groupimp < 0)
2572 continue;
2573
2574 env.dist = dist;
2575 env.dst_nid = nid;
2576 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2577 task_numa_find_cpu(&env, taskimp, groupimp);
2578 }
2579 }
2580
2581 /*
2582 * If the task is part of a workload that spans multiple NUMA nodes,
2583 * and is migrating into one of the workload's active nodes, remember
2584 * this node as the task's preferred numa node, so the workload can
2585 * settle down.
2586 * A task that migrated to a second choice node will be better off
2587 * trying for a better one later. Do not set the preferred node here.
2588 */
2589 if (ng) {
2590 if (env.best_cpu == -1)
2591 nid = env.src_nid;
2592 else
2593 nid = cpu_to_node(env.best_cpu);
2594
2595 if (nid != p->numa_preferred_nid)
2596 sched_setnuma(p, nid);
2597 }
2598
2599 /* No better CPU than the current one was found. */
2600 if (env.best_cpu == -1) {
2601 trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
2602 return -EAGAIN;
2603 }
2604
2605 best_rq = cpu_rq(env.best_cpu);
2606 if (env.best_task == NULL) {
2607 ret = migrate_task_to(p, env.best_cpu);
2608 WRITE_ONCE(best_rq->numa_migrate_on, 0);
2609 if (ret != 0)
2610 trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
2611 return ret;
2612 }
2613
2614 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
2615 WRITE_ONCE(best_rq->numa_migrate_on, 0);
2616
2617 if (ret != 0)
2618 trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
2619 put_task_struct(env.best_task);
2620 return ret;
2621 }
2622
2623 /* Attempt to migrate a task to a CPU on the preferred node. */
2624 static void numa_migrate_preferred(struct task_struct *p)
2625 {
2626 unsigned long interval = HZ;
2627
2628 /* This task has no NUMA fault statistics yet */
2629 if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
2630 return;
2631
2632 /* Periodically retry migrating the task to the preferred node */
2633 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
2634 p->numa_migrate_retry = jiffies + interval;
2635
2636 /* Success if task is already running on preferred CPU */
2637 if (task_node(p) == p->numa_preferred_nid)
2638 return;
2639
2640 /* Otherwise, try migrate to a CPU on the preferred node */
2641 task_numa_migrate(p);
2642 }
2643
2644 /*
2645 * Find out how many nodes the workload is actively running on. Do this by
2646 * tracking the nodes from which NUMA hinting faults are triggered. This can
2647 * be different from the set of nodes where the workload's memory is currently
2648 * located.
2649 */
2650 static void numa_group_count_active_nodes(struct numa_group *numa_group)
2651 {
2652 unsigned long faults, max_faults = 0;
2653 int nid, active_nodes = 0;
2654
2655 for_each_node_state(nid, N_CPU) {
2656 faults = group_faults_cpu(numa_group, nid);
2657 if (faults > max_faults)
2658 max_faults = faults;
2659 }
2660
2661 for_each_node_state(nid, N_CPU) {
2662 faults = group_faults_cpu(numa_group, nid);
2663 if (faults * ACTIVE_NODE_FRACTION > max_faults)
2664 active_nodes++;
2665 }
2666
2667 numa_group->max_faults_cpu = max_faults;
2668 numa_group->active_nodes = active_nodes;
2669 }
2670
2671 /*
2672 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
2673 * increments. The more local the fault statistics are, the higher the scan
2674 * period will be for the next scan window. If the local/(local+remote)
2675 * ratio is below NUMA_PERIOD_THRESHOLD (where the ratio ranges over
2676 * 1..NUMA_PERIOD_SLOTS), the scan period will decrease. Aim for 70% local accesses.
2677 */
2678 #define NUMA_PERIOD_SLOTS 10
2679 #define NUMA_PERIOD_THRESHOLD 7
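/*
 * For example, with NUMA_PERIOD_SLOTS == 10 and NUMA_PERIOD_THRESHOLD == 7
 * the target is roughly 70% local accesses: update_task_scan_period() below
 * scales its adjustment by how far the local and private ratios sit from
 * 7 of the 10 slots.
 */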
2680
2681 /*
2682 * Increase the scan period (slow down scanning) if the majority of
2683 * our memory is already on our local node, or if the majority of
2684 * the page accesses are shared with other processes.
2685 * Otherwise, decrease the scan period.
2686 */
2687 static void update_task_scan_period(struct task_struct *p,
2688 unsigned long shared, unsigned long private)
2689 {
2690 unsigned int period_slot;
2691 int lr_ratio, ps_ratio;
2692 int diff;
2693
2694 unsigned long remote = p->numa_faults_locality[0];
2695 unsigned long local = p->numa_faults_locality[1];
2696
2697 /*
2698 * If there were no recorded hinting faults then either the task is
2699 * completely idle or all activity is in areas that are not of interest
2700 * to automatic numa balancing. Related to that, if there were failed
2701 * migrations then it implies we are migrating too quickly or the local
2702 * node is overloaded. In either case, scan slower.
2703 */
2704 if (local + shared == 0 || p->numa_faults_locality[2]) {
2705 p->numa_scan_period = min(p->numa_scan_period_max,
2706 p->numa_scan_period << 1);
2707
2708 p->mm->numa_next_scan = jiffies +
2709 msecs_to_jiffies(p->numa_scan_period);
2710
2711 return;
2712 }
2713
2714 /*
2715 * Prepare to scale scan period relative to the current period.
2716 * == NUMA_PERIOD_THRESHOLD scan period stays the same
2717 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
2718 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
2719 */
2720 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
2721 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
2722 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
2723
2724 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
2725 /*
2726 * Most memory accesses are local. There is no need to
2727 * do fast NUMA scanning, since memory is already local.
2728 */
2729 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2730 if (!slot)
2731 slot = 1;
2732 diff = slot * period_slot;
2733 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2734 /*
2735 * Most memory accesses are shared with other tasks.
2736 * There is no point in continuing fast NUMA scanning,
2737 * since other tasks may just move the memory elsewhere.
2738 */
2739 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
2740 if (!slot)
2741 slot = 1;
2742 diff = slot * period_slot;
2743 } else {
2744 /*
2745 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
2746 * yet they are not on the local NUMA node. Speed up
2747 * NUMA scanning to get the memory moved over.
2748 */
2749 int ratio = max(lr_ratio, ps_ratio);
2750 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2751 }
2752
2753 p->numa_scan_period = clamp(p->numa_scan_period + diff,
2754 task_scan_min(p), task_scan_max(p));
2755 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2756 }
2757
2758 /*
2759 * Get the fraction of time the task has been running since the last
2760 * NUMA placement cycle. The scheduler keeps similar statistics, but
2761 * decays those on a 32ms period, which is orders of magnitude off
2762 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2763 * stats only if the task is so new there are no NUMA statistics yet.
2764 */
2765 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2766 {
2767 u64 runtime, delta, now;
2768 /* Use the start of this time slice to avoid calculations. */
2769 now = p->se.exec_start;
2770 runtime = p->se.sum_exec_runtime;
2771
2772 if (p->last_task_numa_placement) {
2773 delta = runtime - p->last_sum_exec_runtime;
2774 *period = now - p->last_task_numa_placement;
2775
2776 /* Avoid time going backwards, prevent potential divide error: */
2777 if (unlikely((s64)*period < 0))
2778 *period = 0;
2779 } else {
2780 delta = p->se.avg.load_sum;
2781 *period = LOAD_AVG_MAX;
2782 }
2783
2784 p->last_sum_exec_runtime = runtime;
2785 p->last_task_numa_placement = now;
2786
2787 return delta;
2788 }
2789
2790 /*
2791 * Determine the preferred nid for a task in a numa_group. This needs to
2792 * be done in a way that produces consistent results with group_weight,
2793 * otherwise workloads might not converge.
2794 */
2795 static int preferred_group_nid(struct task_struct *p, int nid)
2796 {
2797 nodemask_t nodes;
2798 int dist;
2799
2800 /* Direct connections between all NUMA nodes. */
2801 if (sched_numa_topology_type == NUMA_DIRECT)
2802 return nid;
2803
2804 /*
2805 * On a system with glueless mesh NUMA topology, group_weight
2806 * scores nodes according to the number of NUMA hinting faults on
2807 * both the node itself, and on nearby nodes.
2808 */
2809 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2810 unsigned long score, max_score = 0;
2811 int node, max_node = nid;
2812
2813 dist = sched_max_numa_distance;
2814
2815 for_each_node_state(node, N_CPU) {
2816 score = group_weight(p, node, dist);
2817 if (score > max_score) {
2818 max_score = score;
2819 max_node = node;
2820 }
2821 }
2822 return max_node;
2823 }
2824
2825 /*
2826 * Finding the preferred nid in a system with NUMA backplane
2827 * interconnect topology is more involved. The goal is to locate
2828 * tasks from numa_groups near each other in the system, and
2829 * untangle workloads from different sides of the system. This requires
2830 * searching down the hierarchy of node groups, recursively searching
2831 * inside the highest scoring group of nodes. The nodemask tricks
2832 * keep the complexity of the search down.
2833 */
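/*
 * Sketch of the search below: starting from all CPU-bearing nodes, at each
 * distance (largest first) partition the remaining nodes into groups of
 * nodes closer to each other than that distance, keep only the group with
 * the most faults, and repeat with smaller distances; at the smallest
 * distance each group is a single node and the winner is the preferred nid.
 */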
2834 nodes = node_states[N_CPU];
2835 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2836 unsigned long max_faults = 0;
2837 nodemask_t max_group = NODE_MASK_NONE;
2838 int a, b;
2839
2840 /* Are there nodes at this distance from each other? */
2841 if (!find_numa_distance(dist))
2842 continue;
2843
2844 for_each_node_mask(a, nodes) {
2845 unsigned long faults = 0;
2846 nodemask_t this_group;
2847 nodes_clear(this_group);
2848
2849 /* Sum group's NUMA faults; includes a==b case. */
2850 for_each_node_mask(b, nodes) {
2851 if (node_distance(a, b) < dist) {
2852 faults += group_faults(p, b);
2853 node_set(b, this_group);
2854 node_clear(b, nodes);
2855 }
2856 }
2857
2858 /* Remember the top group. */
2859 if (faults > max_faults) {
2860 max_faults = faults;
2861 max_group = this_group;
2862 /*
2863 * subtle: at the smallest distance there is
2864 * just one node left in each "group", the
2865 * winner is the preferred nid.
2866 */
2867 nid = a;
2868 }
2869 }
2870 /* Next round, evaluate the nodes within max_group. */
2871 if (!max_faults)
2872 break;
2873 nodes = max_group;
2874 }
2875 return nid;
2876 }
2877
2878 static void task_numa_placement(struct task_struct *p)
2879 {
2880 int seq, nid, max_nid = NUMA_NO_NODE;
2881 unsigned long max_faults = 0;
2882 unsigned long fault_types[2] = { 0, 0 };
2883 unsigned long total_faults;
2884 u64 runtime, period;
2885 spinlock_t *group_lock = NULL;
2886 struct numa_group *ng;
2887
2888 /*
2889 * The p->mm->numa_scan_seq field gets updated without
2890 * exclusive access. Use READ_ONCE() here to ensure
2891 * that the field is read in a single access:
2892 */
2893 seq = READ_ONCE(p->mm->numa_scan_seq);
2894 if (p->numa_scan_seq == seq)
2895 return;
2896 p->numa_scan_seq = seq;
2897 p->numa_scan_period_max = task_scan_max(p);
2898
2899 total_faults = p->numa_faults_locality[0] +
2900 p->numa_faults_locality[1];
2901 runtime = numa_get_avg_runtime(p, &period);
2902
2903 /* If the task is part of a group prevent parallel updates to group stats */
2904 ng = deref_curr_numa_group(p);
2905 if (ng) {
2906 group_lock = &ng->lock;
2907 spin_lock_irq(group_lock);
2908 }
2909
2910 /* Find the node with the highest number of faults */
2911 for_each_online_node(nid) {
2912 /* Keep track of the offsets in numa_faults array */
2913 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2914 unsigned long faults = 0, group_faults = 0;
2915 int priv;
2916
2917 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2918 long diff, f_diff, f_weight;
2919
2920 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2921 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2922 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2923 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2924
2925 /* Decay existing window, copy faults since last scan */
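/*
 * This keeps a decayed running average: p->numa_faults[mem_idx] ends
 * up as half its previous value plus the freshly buffered faults in
 * p->numa_faults[membuf_idx].
 */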
2926 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2927 fault_types[priv] += p->numa_faults[membuf_idx];
2928 p->numa_faults[membuf_idx] = 0;
2929
2930 /*
2931 * Normalize the faults_from, so all tasks in a group
2932 * count according to CPU use, instead of by the raw
2933 * number of faults. Tasks with little runtime have
2934 * little over-all impact on throughput, and thus their
2935 * faults are less important.
2936 */
2937 f_weight = div64_u64(runtime << 16, period + 1);
2938 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2939 (total_faults + 1);
2940 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2941 p->numa_faults[cpubuf_idx] = 0;
2942
2943 p->numa_faults[mem_idx] += diff;
2944 p->numa_faults[cpu_idx] += f_diff;
2945 faults += p->numa_faults[mem_idx];
2946 p->total_numa_faults += diff;
2947 if (ng) {
2948 /*
2949 * safe because we can only change our own group
2950 *
2951 * mem_idx represents the offset for a given
2952 * nid and priv in a specific region because it
2953 * is at the beginning of the numa_faults array.
2954 */
2955 ng->faults[mem_idx] += diff;
2956 ng->faults[cpu_idx] += f_diff;
2957 ng->total_faults += diff;
2958 group_faults += ng->faults[mem_idx];
2959 }
2960 }
2961
2962 if (!ng) {
2963 if (faults > max_faults) {
2964 max_faults = faults;
2965 max_nid = nid;
2966 }
2967 } else if (group_faults > max_faults) {
2968 max_faults = group_faults;
2969 max_nid = nid;
2970 }
2971 }
2972
2973 /* Cannot migrate task to CPU-less node */
2974 max_nid = numa_nearest_node(max_nid, N_CPU);
2975
2976 if (ng) {
2977 numa_group_count_active_nodes(ng);
2978 spin_unlock_irq(group_lock);
2979 max_nid = preferred_group_nid(p, max_nid);
2980 }
2981
2982 if (max_faults) {
2983 /* Set the new preferred node */
2984 if (max_nid != p->numa_preferred_nid)
2985 sched_setnuma(p, max_nid);
2986 }
2987
2988 update_task_scan_period(p, fault_types[0], fault_types[1]);
2989 }
2990
2991 static inline int get_numa_group(struct numa_group *grp)
2992 {
2993 return refcount_inc_not_zero(&grp->refcount);
2994 }
2995
2996 static inline void put_numa_group(struct numa_group *grp)
2997 {
2998 if (refcount_dec_and_test(&grp->refcount))
2999 kfree_rcu(grp, rcu);
3000 }
3001
3002 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
3003 int *priv)
3004 {
3005 struct numa_group *grp, *my_grp;
3006 struct task_struct *tsk;
3007 bool join = false;
3008 int cpu = cpupid_to_cpu(cpupid);
3009 int i;
3010
3011 if (unlikely(!deref_curr_numa_group(p))) {
3012 unsigned int size = sizeof(struct numa_group) +
3013 NR_NUMA_HINT_FAULT_STATS *
3014 nr_node_ids * sizeof(unsigned long);
3015
3016 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
3017 if (!grp)
3018 return;
3019
3020 refcount_set(&grp->refcount, 1);
3021 grp->active_nodes = 1;
3022 grp->max_faults_cpu = 0;
3023 spin_lock_init(&grp->lock);
3024 grp->gid = p->pid;
3025
3026 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3027 grp->faults[i] = p->numa_faults[i];
3028
3029 grp->total_faults = p->total_numa_faults;
3030
3031 grp->nr_tasks++;
3032 rcu_assign_pointer(p->numa_group, grp);
3033 }
3034
3035 rcu_read_lock();
3036 tsk = READ_ONCE(cpu_rq(cpu)->curr);
3037
3038 if (!cpupid_match_pid(tsk, cpupid))
3039 goto no_join;
3040
3041 grp = rcu_dereference(tsk->numa_group);
3042 if (!grp)
3043 goto no_join;
3044
3045 my_grp = deref_curr_numa_group(p);
3046 if (grp == my_grp)
3047 goto no_join;
3048
3049 /*
3050 * Only join the other group if it's bigger; if we're the bigger group,
3051 * the other task will join us.
3052 */
3053 if (my_grp->nr_tasks > grp->nr_tasks)
3054 goto no_join;
3055
3056 /*
3057 * Tie-break on the grp address.
3058 */
3059 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
3060 goto no_join;
3061
3062 /* Always join threads in the same process. */
3063 if (tsk->mm == current->mm)
3064 join = true;
3065
3066 /* Simple filter to avoid false positives due to PID collisions */
3067 if (flags & TNF_SHARED)
3068 join = true;
3069
3070 /* Update priv based on whether false sharing was detected */
3071 *priv = !join;
3072
3073 if (join && !get_numa_group(grp))
3074 goto no_join;
3075
3076 rcu_read_unlock();
3077
3078 if (!join)
3079 return;
3080
3081 WARN_ON_ONCE(irqs_disabled());
3082 double_lock_irq(&my_grp->lock, &grp->lock);
3083
3084 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
3085 my_grp->faults[i] -= p->numa_faults[i];
3086 grp->faults[i] += p->numa_faults[i];
3087 }
3088 my_grp->total_faults -= p->total_numa_faults;
3089 grp->total_faults += p->total_numa_faults;
3090
3091 my_grp->nr_tasks--;
3092 grp->nr_tasks++;
3093
3094 spin_unlock(&my_grp->lock);
3095 spin_unlock_irq(&grp->lock);
3096
3097 rcu_assign_pointer(p->numa_group, grp);
3098
3099 put_numa_group(my_grp);
3100 return;
3101
3102 no_join:
3103 rcu_read_unlock();
3104 return;
3105 }
3106
3107 /*
3108 * Get rid of NUMA statistics associated with a task (either current or dead).
3109 * If @final is set, the task is dead and has reached refcount zero, so we can
3110 * safely free all relevant data structures. Otherwise, there might be
3111 * concurrent reads from places like load balancing and procfs, and we should
3112 * reset the data back to default state without freeing ->numa_faults.
3113 */
3114 void task_numa_free(struct task_struct *p, bool final)
3115 {
3116 /* safe: p either is current or is being freed by current */
3117 struct numa_group *grp = rcu_dereference_raw(p->numa_group);
3118 unsigned long *numa_faults = p->numa_faults;
3119 unsigned long flags;
3120 int i;
3121
3122 if (!numa_faults)
3123 return;
3124
3125 if (grp) {
3126 spin_lock_irqsave(&grp->lock, flags);
3127 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3128 grp->faults[i] -= p->numa_faults[i];
3129 grp->total_faults -= p->total_numa_faults;
3130
3131 grp->nr_tasks--;
3132 spin_unlock_irqrestore(&grp->lock, flags);
3133 RCU_INIT_POINTER(p->numa_group, NULL);
3134 put_numa_group(grp);
3135 }
3136
3137 if (final) {
3138 p->numa_faults = NULL;
3139 kfree(numa_faults);
3140 } else {
3141 p->total_numa_faults = 0;
3142 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3143 numa_faults[i] = 0;
3144 }
3145 }
3146
3147 /*
3148 * Got a PROT_NONE fault for a page on @node.
3149 */
3150 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
3151 {
3152 struct task_struct *p = current;
3153 bool migrated = flags & TNF_MIGRATED;
3154 int cpu_node = task_node(current);
3155 int local = !!(flags & TNF_FAULT_LOCAL);
3156 struct numa_group *ng;
3157 int priv;
3158
3159 if (!static_branch_likely(&sched_numa_balancing))
3160 return;
3161
3162 /* for example, ksmd faulting in a user's mm */
3163 if (!p->mm)
3164 return;
3165
3166 /*
3167 * NUMA fault statistics are unnecessary for the slow memory
3168 * node in memory tiering mode.
3169 */
3170 if (!node_is_toptier(mem_node) &&
3171 (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
3172 !cpupid_valid(last_cpupid)))
3173 return;
3174
3175 /* Allocate buffer to track faults on a per-node basis */
3176 if (unlikely(!p->numa_faults)) {
3177 int size = sizeof(*p->numa_faults) *
3178 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
3179
3180 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
3181 if (!p->numa_faults)
3182 return;
3183
3184 p->total_numa_faults = 0;
3185 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
3186 }
3187
3188 /*
3189 * First accesses are treated as private, otherwise consider accesses
3190 * to be private if the accessing pid has not changed
3191 */
3192 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
3193 priv = 1;
3194 } else {
3195 priv = cpupid_match_pid(p, last_cpupid);
3196 if (!priv && !(flags & TNF_NO_GROUP))
3197 task_numa_group(p, last_cpupid, flags, &priv);
3198 }
3199
3200 /*
3201 * If a workload spans multiple NUMA nodes, a shared fault that
3202 * occurs wholly within the set of nodes that the workload is
3203 * actively using should be counted as local. This allows the
3204 * scan rate to slow down when a workload has settled down.
3205 */
3206 ng = deref_curr_numa_group(p);
3207 if (!priv && !local && ng && ng->active_nodes > 1 &&
3208 numa_is_active_node(cpu_node, ng) &&
3209 numa_is_active_node(mem_node, ng))
3210 local = 1;
3211
3212 /*
3213 * Retry to migrate task to preferred node periodically, in case it
3214 * previously failed, or the scheduler moved us.
3215 */
3216 if (time_after(jiffies, p->numa_migrate_retry)) {
3217 task_numa_placement(p);
3218 numa_migrate_preferred(p);
3219 }
3220
3221 if (migrated)
3222 p->numa_pages_migrated += pages;
3223 if (flags & TNF_MIGRATE_FAIL)
3224 p->numa_faults_locality[2] += pages;
3225
3226 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
3227 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
3228 p->numa_faults_locality[local] += pages;
3229 }
3230
3231 static void reset_ptenuma_scan(struct task_struct *p)
3232 {
3233 /*
3234 * We only did a read acquisition of the mmap sem, so
3235 * p->mm->numa_scan_seq is written to without exclusive access
3236 * and the update is not guaranteed to be atomic. That's not
3237 * much of an issue though, since this is just used for
3238 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
3239 * expensive, to avoid any form of compiler optimizations:
3240 */
3241 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
3242 p->mm->numa_scan_offset = 0;
3243 }
3244
3245 static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
3246 {
3247 unsigned long pids;
3248 /*
3249 * Allow unconditional access for the first two scans, so that all the
3250 * pages of the VMA get prot_none faults introduced irrespective of
3251 * accesses. This is also done to avoid any side effect of task scanning
3252 * amplifying the unfairness of a disjoint set of VMA accesses.
3253 */
3254 if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
3255 return true;
3256
3257 pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
3258 if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
3259 return true;
3260
3261 /*
3262 * Complete a scan that has already started regardless of PID access, or
3263 * some VMAs may never be scanned in multi-threaded applications:
3264 */
3265 if (mm->numa_scan_offset > vma->vm_start) {
3266 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
3267 return true;
3268 }
3269
3270 /*
3271 * This vma has not been accessed for a while, and if the number
3272 * of threads in the same process is low, which means no other
3273 * threads can help scan this vma, force a vma scan.
3274 */
3275 if (READ_ONCE(mm->numa_scan_seq) >
3276 (vma->numab_state->prev_scan_seq + get_nr_threads(current)))
3277 return true;
3278
3279 return false;
3280 }
3281
3282 #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
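/*
 * With the default sysctl_numa_balancing_scan_delay of 1000 msecs this
 * resets the per-VMA access PID history roughly every 4 seconds.
 */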
3283
3284 /*
3285 * The expensive part of numa migration is done from task_work context.
3286 * Triggered from task_tick_numa().
3287 */
3288 static void task_numa_work(struct callback_head *work)
3289 {
3290 unsigned long migrate, next_scan, now = jiffies;
3291 struct task_struct *p = current;
3292 struct mm_struct *mm = p->mm;
3293 u64 runtime = p->se.sum_exec_runtime;
3294 struct vm_area_struct *vma;
3295 unsigned long start, end;
3296 unsigned long nr_pte_updates = 0;
3297 long pages, virtpages;
3298 struct vma_iterator vmi;
3299 bool vma_pids_skipped;
3300 bool vma_pids_forced = false;
3301
3302 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
3303
3304 work->next = work;
3305 /*
3306 * Who cares about NUMA placement when they're dying.
3307 *
3308 * NOTE: make sure not to dereference p->mm before this check,
3309 * exit_task_work() happens _after_ exit_mm() so we could be called
3310 * without p->mm even though we still had it when we enqueued this
3311 * work.
3312 */
3313 if (p->flags & PF_EXITING)
3314 return;
3315
3316 /*
3317 * Memory is pinned to only one NUMA node via cpuset.mems, so
3318 * naturally no page can be migrated.
3319 */
3320 if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) {
3321 trace_sched_skip_cpuset_numa(current, &cpuset_current_mems_allowed);
3322 return;
3323 }
3324
3325 if (!mm->numa_next_scan) {
3326 mm->numa_next_scan = now +
3327 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3328 }
3329
3330 /*
3331 * Enforce maximal scan/migration frequency..
3332 */
3333 migrate = mm->numa_next_scan;
3334 if (time_before(now, migrate))
3335 return;
3336
3337 if (p->numa_scan_period == 0) {
3338 p->numa_scan_period_max = task_scan_max(p);
3339 p->numa_scan_period = task_scan_start(p);
3340 }
3341
3342 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
3343 if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
3344 return;
3345
3346 /*
3347 * Delay this task enough that another task of this mm will likely win
3348 * the next time around.
3349 */
3350 p->node_stamp += 2 * TICK_NSEC;
3351
3352 pages = sysctl_numa_balancing_scan_size;
3353 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
3354 virtpages = pages * 8; /* Scan up to this much virtual space */
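/*
 * With the default sysctl_numa_balancing_scan_size of 256MB and 4KB pages
 * this is 65536 pages, and up to 8 times as much virtual address space when
 * the ranges scanned turn out to be empty or already marked.
 */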
3355 if (!pages)
3356 return;
3357
3358
3359 if (!mmap_read_trylock(mm))
3360 return;
3361
3362 /*
3363 * VMAs are skipped if the current PID has not trapped a fault within
3364 * the VMA recently. Allow scanning to be forced if there is no
3365 * suitable VMA remaining.
3366 */
3367 vma_pids_skipped = false;
3368
3369 retry_pids:
3370 start = mm->numa_scan_offset;
3371 vma_iter_init(&vmi, mm, start);
3372 vma = vma_next(&vmi);
3373 if (!vma) {
3374 reset_ptenuma_scan(p);
3375 start = 0;
3376 vma_iter_set(&vmi, start);
3377 vma = vma_next(&vmi);
3378 }
3379
3380 for (; vma; vma = vma_next(&vmi)) {
3381 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
3382 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
3383 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
3384 continue;
3385 }
3386
3387 /*
3388 * Shared library pages mapped by multiple processes are not
3389 * migrated as it is expected they are cache replicated. Avoid
3390 * hinting faults in read-only file-backed mappings or the vDSO
3391 * as migrating the pages will be of marginal benefit.
3392 */
3393 if (!vma->vm_mm ||
3394 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
3395 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
3396 continue;
3397 }
3398
3399 /*
3400 * Skip inaccessible VMAs to avoid any confusion between
3401 * PROT_NONE and NUMA hinting PTEs
3402 */
3403 if (!vma_is_accessible(vma)) {
3404 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
3405 continue;
3406 }
3407
3408 /* Initialise new per-VMA NUMAB state. */
3409 if (!vma->numab_state) {
3410 struct vma_numab_state *ptr;
3411
3412 ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
3413 if (!ptr)
3414 continue;
3415
3416 if (cmpxchg(&vma->numab_state, NULL, ptr)) {
3417 kfree(ptr);
3418 continue;
3419 }
3420
3421 vma->numab_state->start_scan_seq = mm->numa_scan_seq;
3422
3423 vma->numab_state->next_scan = now +
3424 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3425
3426 /* Reset happens after 4 times the scan delay from scan start */
3427 vma->numab_state->pids_active_reset = vma->numab_state->next_scan +
3428 msecs_to_jiffies(VMA_PID_RESET_PERIOD);
3429
3430 /*
3431 * Ensure prev_scan_seq does not match numa_scan_seq,
3432 * to prevent VMAs being skipped prematurely on the
3433 * first scan:
3434 */
3435 vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
3436 }
3437
3438 /*
3439 * Scanning the VMAs of short-lived tasks adds more overhead. So
3440 * delay the scan for new VMAs.
3441 */
3442 if (mm->numa_scan_seq && time_before(jiffies,
3443 vma->numab_state->next_scan)) {
3444 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
3445 continue;
3446 }
3447
3448 /* RESET access PIDs regularly for old VMAs. */
3449 if (mm->numa_scan_seq &&
3450 time_after(jiffies, vma->numab_state->pids_active_reset)) {
3451 vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
3452 msecs_to_jiffies(VMA_PID_RESET_PERIOD);
3453 vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
3454 vma->numab_state->pids_active[1] = 0;
3455 }
3456
3457 /* Do not rescan VMAs twice within the same sequence. */
3458 if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
3459 mm->numa_scan_offset = vma->vm_end;
3460 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
3461 continue;
3462 }
3463
3464 /*
3465 * Do not scan the VMA if the task has not accessed it, unless no
3466 * other VMA candidate exists.
3467 */
3468 if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
3469 vma_pids_skipped = true;
3470 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
3471 continue;
3472 }
3473
3474 do {
3475 start = max(start, vma->vm_start);
3476 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
3477 end = min(end, vma->vm_end);
3478 nr_pte_updates = change_prot_numa(vma, start, end);
3479
3480 /*
3481 * Try to scan sysctl_numa_balancing_scan_size worth of
3482 * hpages that have at least one present PTE that
3483 * is not already PTE-numa. If the VMA contains
3484 * areas that are unused or already full of prot_numa
3485 * PTEs, scan up to virtpages, to skip through those
3486 * areas faster.
3487 */
3488 if (nr_pte_updates)
3489 pages -= (end - start) >> PAGE_SHIFT;
3490 virtpages -= (end - start) >> PAGE_SHIFT;
3491
3492 start = end;
3493 if (pages <= 0 || virtpages <= 0)
3494 goto out;
3495
3496 cond_resched();
3497 } while (end != vma->vm_end);
3498
3499 /* VMA scan is complete, do not scan until next sequence. */
3500 vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
3501
3502 /*
3503 * Only force scan within one VMA at a time, to limit the
3504 * cost of scanning a potentially uninteresting VMA.
3505 */
3506 if (vma_pids_forced)
3507 break;
3508 }
3509
3510 /*
3511 * If no VMAs are remaining and VMAs were skipped due to the PID
3512 * not accessing the VMA previously, then force a scan to ensure
3513 * forward progress:
3514 */
3515 if (!vma && !vma_pids_forced && vma_pids_skipped) {
3516 vma_pids_forced = true;
3517 goto retry_pids;
3518 }
3519
3520 out:
3521 /*
3522 * It is possible to reach the end of the VMA list but the last few
3523 	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
3524 	 * would find the !migratable VMA on the next scan but not reset the
3525 	 * scanner to the start, so check it now.
3526 */
3527 if (vma)
3528 mm->numa_scan_offset = start;
3529 else
3530 reset_ptenuma_scan(p);
3531 mmap_read_unlock(mm);
3532
3533 /*
3534 * Make sure tasks use at least 32x as much time to run other code
3535 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
3536 * Usually update_task_scan_period slows down scanning enough; on an
3537 * overloaded system we need to limit overhead on a per task basis.
3538 */
3539 if (unlikely(p->se.sum_exec_runtime != runtime)) {
3540 u64 diff = p->se.sum_exec_runtime - runtime;
3541 p->node_stamp += 32 * diff;
3542 }
3543 }
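
/*
 * Illustrative, user-space sketch (not built into the kernel) of the ~3%
 * overhead bound enforced at the end of task_numa_work() above: if a scan
 * pass consumed 'diff' ns of runtime, pushing node_stamp forward by
 * 32 * diff defers the next pass until the task has spent 32x as long on
 * other work, i.e. roughly 1/(1 + 32) of runtime goes to scanning. All
 * names and values below are made up for this sketch.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long long scan_ns = 250000;		/* runtime burnt scanning */
	unsigned long long defer_ns = 32 * scan_ns;	/* extra runtime required before rescanning */
	double overhead = (double)scan_ns / (double)(scan_ns + defer_ns);

	printf("worst-case scan overhead: %.2f%%\n", overhead * 100.0);	/* ~3.03% */
	return 0;
}
#endif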
3544
3545 void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
3546 {
3547 int mm_users = 0;
3548 struct mm_struct *mm = p->mm;
3549
3550 if (mm) {
3551 mm_users = atomic_read(&mm->mm_users);
3552 if (mm_users == 1) {
3553 mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3554 mm->numa_scan_seq = 0;
3555 }
3556 }
3557 p->node_stamp = 0;
3558 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
3559 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
3560 p->numa_migrate_retry = 0;
3561 /* Protect against double add, see task_tick_numa and task_numa_work */
3562 p->numa_work.next = &p->numa_work;
3563 p->numa_faults = NULL;
3564 p->numa_pages_migrated = 0;
3565 p->total_numa_faults = 0;
3566 RCU_INIT_POINTER(p->numa_group, NULL);
3567 p->last_task_numa_placement = 0;
3568 p->last_sum_exec_runtime = 0;
3569
3570 init_task_work(&p->numa_work, task_numa_work);
3571
3572 /* New address space, reset the preferred nid */
3573 if (!(clone_flags & CLONE_VM)) {
3574 p->numa_preferred_nid = NUMA_NO_NODE;
3575 return;
3576 }
3577
3578 /*
3579 * New thread, keep existing numa_preferred_nid which should be copied
3580 * already by arch_dup_task_struct but stagger when scans start.
3581 */
3582 if (mm) {
3583 unsigned int delay;
3584
3585 delay = min_t(unsigned int, task_scan_max(current),
3586 current->numa_scan_period * mm_users * NSEC_PER_MSEC);
3587 delay += 2 * TICK_NSEC;
3588 p->node_stamp = delay;
3589 }
3590 }
3591
3592 /*
3593 * Drive the periodic memory faults..
3594 */
3595 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
3596 {
3597 struct callback_head *work = &curr->numa_work;
3598 u64 period, now;
3599
3600 /*
3601 * We don't care about NUMA placement if we don't have memory.
3602 */
3603 if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
3604 return;
3605
3606 /*
3607 * Using runtime rather than walltime has the dual advantage that
3608 * we (mostly) drive the selection from busy threads and that the
3609 * task needs to have done some actual work before we bother with
3610 * NUMA placement.
3611 */
3612 now = curr->se.sum_exec_runtime;
3613 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
3614
3615 if (now > curr->node_stamp + period) {
3616 if (!curr->node_stamp)
3617 curr->numa_scan_period = task_scan_start(curr);
3618 curr->node_stamp += period;
3619
3620 if (!time_before(jiffies, curr->mm->numa_next_scan))
3621 task_work_add(curr, work, TWA_RESUME);
3622 }
3623 }
3624
3625 static void update_scan_period(struct task_struct *p, int new_cpu)
3626 {
3627 int src_nid = cpu_to_node(task_cpu(p));
3628 int dst_nid = cpu_to_node(new_cpu);
3629
3630 if (!static_branch_likely(&sched_numa_balancing))
3631 return;
3632
3633 if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
3634 return;
3635
3636 if (src_nid == dst_nid)
3637 return;
3638
3639 /*
3640 * Allow resets if faults have been trapped before one scan
3641 * has completed. This is most likely due to a new task that
3642 * is pulled cross-node due to wakeups or load balancing.
3643 */
3644 if (p->numa_scan_seq) {
3645 /*
3646 * Avoid scan adjustments if moving to the preferred
3647 * node or if the task was not previously running on
3648 * the preferred node.
3649 */
3650 if (dst_nid == p->numa_preferred_nid ||
3651 (p->numa_preferred_nid != NUMA_NO_NODE &&
3652 src_nid != p->numa_preferred_nid))
3653 return;
3654 }
3655
3656 p->numa_scan_period = task_scan_start(p);
3657 }
3658
3659 #else /* !CONFIG_NUMA_BALANCING: */
3660
3661 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
3662 {
3663 }
3664
3665 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
3666 {
3667 }
3668
3669 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
3670 {
3671 }
3672
3673 static inline void update_scan_period(struct task_struct *p, int new_cpu)
3674 {
3675 }
3676
3677 #endif /* !CONFIG_NUMA_BALANCING */
3678
3679 static void
3680 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3681 {
3682 update_load_add(&cfs_rq->load, se->load.weight);
3683 if (entity_is_task(se)) {
3684 struct rq *rq = rq_of(cfs_rq);
3685
3686 account_numa_enqueue(rq, task_of(se));
3687 list_add(&se->group_node, &rq->cfs_tasks);
3688 }
3689 cfs_rq->nr_queued++;
3690 }
3691
3692 static void
3693 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3694 {
3695 update_load_sub(&cfs_rq->load, se->load.weight);
3696 if (entity_is_task(se)) {
3697 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
3698 list_del_init(&se->group_node);
3699 }
3700 cfs_rq->nr_queued--;
3701 }
3702
3703 /*
3704 * Signed add and clamp on underflow.
3705 *
3706 * Explicitly do a load-store to ensure the intermediate value never hits
3707 * memory. This allows lockless observations without ever seeing the negative
3708 * values.
3709 */
3710 #define add_positive(_ptr, _val) do { \
3711 typeof(_ptr) ptr = (_ptr); \
3712 typeof(_val) val = (_val); \
3713 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3714 \
3715 res = var + val; \
3716 \
3717 if (val < 0 && res > var) \
3718 res = 0; \
3719 \
3720 WRITE_ONCE(*ptr, res); \
3721 } while (0)
3722
3723 /*
3724 * Unsigned subtract and clamp on underflow.
3725 *
3726 * Explicitly do a load-store to ensure the intermediate value never hits
3727 * memory. This allows lockless observations without ever seeing the negative
3728 * values.
3729 */
3730 #define sub_positive(_ptr, _val) do { \
3731 typeof(_ptr) ptr = (_ptr); \
3732 typeof(*ptr) val = (_val); \
3733 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3734 res = var - val; \
3735 if (res > var) \
3736 res = 0; \
3737 WRITE_ONCE(*ptr, res); \
3738 } while (0)
3739
3740 /*
3741 * Remove and clamp on negative, from a local variable.
3742 *
3743 * A variant of sub_positive(), which does not use explicit load-store
3744 * and is thus optimized for local variable updates.
3745 */
3746 #define lsub_positive(_ptr, _val) do { \
3747 typeof(_ptr) ptr = (_ptr); \
3748 *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3749 } while (0)
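
/*
 * Illustrative, user-space sketch (not built into the kernel) of the
 * clamp-on-underflow semantics of sub_positive() above, with
 * READ_ONCE()/WRITE_ONCE() replaced by plain accesses. The demo_* names
 * are made up for this sketch.
 */
#if 0
#include <stdio.h>

static void demo_sub_positive(unsigned long *ptr, unsigned long val)
{
	unsigned long var = *ptr;
	unsigned long res = var - val;

	/* Unsigned subtraction wraps on underflow; detect that and clamp to 0. */
	if (res > var)
		res = 0;
	*ptr = res;
}

int main(void)
{
	unsigned long load = 100;

	demo_sub_positive(&load, 30);	/* 100 - 30 = 70 */
	demo_sub_positive(&load, 500);	/* would wrap; clamped to 0 instead */
	printf("load = %lu\n", load);
	return 0;
}
#endif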
3750
3751 static inline void
3752 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3753 {
3754 cfs_rq->avg.load_avg += se->avg.load_avg;
3755 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
3756 }
3757
3758 static inline void
3759 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3760 {
3761 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3762 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
3763 /* See update_cfs_rq_load_avg() */
3764 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
3765 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
3766 }
3767
3768 static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
3769
3770 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
3771 unsigned long weight)
3772 {
3773 bool curr = cfs_rq->curr == se;
3774
3775 if (se->on_rq) {
3776 /* commit outstanding execution time */
3777 update_curr(cfs_rq);
3778 update_entity_lag(cfs_rq, se);
3779 se->deadline -= se->vruntime;
3780 se->rel_deadline = 1;
3781 cfs_rq->nr_queued--;
3782 if (!curr)
3783 __dequeue_entity(cfs_rq, se);
3784 update_load_sub(&cfs_rq->load, se->load.weight);
3785 }
3786 dequeue_load_avg(cfs_rq, se);
3787
3788 /*
3789 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
3790 * we need to scale se->vlag when w_i changes.
3791 */
3792 se->vlag = div_s64(se->vlag * se->load.weight, weight);
3793 if (se->rel_deadline)
3794 se->deadline = div_s64(se->deadline * se->load.weight, weight);
3795
3796 update_load_set(&se->load, weight);
3797
3798 do {
3799 u32 divider = get_pelt_divider(&se->avg);
3800
3801 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
3802 } while (0);
3803
3804 enqueue_load_avg(cfs_rq, se);
3805 if (se->on_rq) {
3806 place_entity(cfs_rq, se, 0);
3807 update_load_add(&cfs_rq->load, se->load.weight);
3808 if (!curr)
3809 __enqueue_entity(cfs_rq, se);
3810 cfs_rq->nr_queued++;
3811
3812 /*
3813 * The entity's vruntime has been adjusted, so let's check
3814 		 * whether the rq-wide min_vruntime needs to be updated too. Since
3815 		 * the calculations above require a stable min_vruntime rather
3816 		 * than an up-to-date one, we do the update at the end of the
3817 * reweight process.
3818 */
3819 update_min_vruntime(cfs_rq);
3820 }
3821 }
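
/*
 * Illustrative, user-space sketch (not built into the kernel) of the vlag
 * rescaling in reweight_entity() above: since lag_i = w_i * vl_i, keeping
 * the weighted lag constant across a weight change means scaling the
 * virtual lag by old_weight / new_weight. Names and values are made up for
 * this sketch.
 */
#if 0
#include <stdio.h>

int main(void)
{
	long long vlag = 4000;		/* V - v_i before the reweight */
	long long old_w = 1024;		/* se->load.weight before the change */
	long long new_w = 2048;		/* newly requested weight */

	long long new_vlag = vlag * old_w / new_w;	/* 2000 */

	/* The weighted lag is preserved: old_w * vlag == new_w * new_vlag. */
	printf("%lld == %lld\n", old_w * vlag, new_w * new_vlag);
	return 0;
}
#endif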
3822
3823 static void reweight_task_fair(struct rq *rq, struct task_struct *p,
3824 const struct load_weight *lw)
3825 {
3826 struct sched_entity *se = &p->se;
3827 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3828 struct load_weight *load = &se->load;
3829
3830 reweight_entity(cfs_rq, se, lw->weight);
3831 load->inv_weight = lw->inv_weight;
3832 }
3833
3834 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3835
3836 #ifdef CONFIG_FAIR_GROUP_SCHED
3837 /*
3838 * All this does is approximate the hierarchical proportion which includes that
3839 * global sum we all love to hate.
3840 *
3841 * That is, the weight of a group entity, is the proportional share of the
3842 * group weight based on the group runqueue weights. That is:
3843 *
3844 * tg->weight * grq->load.weight
3845 * ge->load.weight = ----------------------------- (1)
3846 * \Sum grq->load.weight
3847 *
3848 * Now, because that sum is prohibitively expensive to compute (been
3849 * there, done that) we approximate it with this average stuff. The average
3850 * moves slower and therefore the approximation is cheaper and more stable.
3851 *
3852 * So instead of the above, we substitute:
3853 *
3854 * grq->load.weight -> grq->avg.load_avg (2)
3855 *
3856 * which yields the following:
3857 *
3858 * tg->weight * grq->avg.load_avg
3859 * ge->load.weight = ------------------------------ (3)
3860 * tg->load_avg
3861 *
3862 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
3863 *
3864 * That is shares_avg, and it is right (given the approximation (2)).
3865 *
3866 * The problem with it is that because the average is slow -- it was designed
3867 * to be exactly that of course -- this leads to transients in boundary
3868 * conditions. In particular, the case where the group was idle and we start
3869 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
3870 * yielding bad latency etc..
3871 *
3872 * Now, in that special case (1) reduces to:
3873 *
3874 * tg->weight * grq->load.weight
3875 * ge->load.weight = ----------------------------- = tg->weight (4)
3876 *                         grq->load.weight
3877 *
3878 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
3879 *
3880 * So what we do is modify our approximation (3) to approach (4) in the (near)
3881 * UP case, like:
3882 *
3883 * ge->load.weight =
3884 *
3885 * tg->weight * grq->load.weight
3886 * --------------------------------------------------- (5)
3887 * tg->load_avg - grq->avg.load_avg + grq->load.weight
3888 *
3889 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
3890 * we need to use grq->avg.load_avg as its lower bound, which then gives:
3891 *
3892 *
3893 * tg->weight * grq->load.weight
3894 * ge->load.weight = ----------------------------- (6)
3895 * tg_load_avg'
3896 *
3897 * Where:
3898 *
3899 * tg_load_avg' = tg->load_avg - grq->avg.load_avg +
3900 * max(grq->load.weight, grq->avg.load_avg)
3901 *
3902 * And that is shares_weight and is icky. In the (near) UP case it approaches
3903 * (4) while in the normal case it approaches (3). It consistently
3904 * overestimates the ge->load.weight and therefore:
3905 *
3906 * \Sum ge->load.weight >= tg->weight
3907 *
3908 * hence icky!
3909 */
3910 static long calc_group_shares(struct cfs_rq *cfs_rq)
3911 {
3912 long tg_weight, tg_shares, load, shares;
3913 struct task_group *tg = cfs_rq->tg;
3914
3915 tg_shares = READ_ONCE(tg->shares);
3916
3917 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3918
3919 tg_weight = atomic_long_read(&tg->load_avg);
3920
3921 /* Ensure tg_weight >= load */
3922 tg_weight -= cfs_rq->tg_load_avg_contrib;
3923 tg_weight += load;
3924
3925 shares = (tg_shares * load);
3926 if (tg_weight)
3927 shares /= tg_weight;
3928
3929 /*
3930 * MIN_SHARES has to be unscaled here to support per-CPU partitioning
3931 * of a group with small tg->shares value. It is a floor value which is
3932 * assigned as a minimum load.weight to the sched_entity representing
3933 * the group on a CPU.
3934 *
3935 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
3936 * on an 8-core system with 8 tasks each runnable on one CPU shares has
3937 * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
3938 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
3939 * instead of 0.
3940 */
3941 return clamp_t(long, shares, MIN_SHARES, tg_shares);
3942 }
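
/*
 * Illustrative, user-space sketch (not built into the kernel) reproducing
 * the numeric example from the comment in calc_group_shares() above:
 * tg->shares = 15*1024 spread over 8 CPUs that each carry one equally
 * loaded task. The simplified load model and names are made up for this
 * sketch.
 */
#if 0
#include <stdio.h>

int main(void)
{
	long tg_shares = 15 * 1024;	/* READ_ONCE(tg->shares) */
	long cpu_load = 1024;		/* this CPU's runqueue contribution */
	long tg_weight = 8 * 1024;	/* sum over all CPUs */
	long min_shares = 2;		/* MIN_SHARES, unscaled */

	long shares = tg_shares * cpu_load / tg_weight;	/* 15*1024/8 = 1920 */

	/* Clamp to [MIN_SHARES, tg_shares], as the function above does. */
	if (shares < min_shares)
		shares = min_shares;
	if (shares > tg_shares)
		shares = tg_shares;

	printf("per-CPU group shares: %ld\n", shares);
	return 0;
}
#endif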
3943
3944 /*
3945 * Recomputes the group entity based on the current state of its group
3946 * runqueue.
3947 */
3948 static void update_cfs_group(struct sched_entity *se)
3949 {
3950 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3951 long shares;
3952
3953 /*
3954 * When a group becomes empty, preserve its weight. This matters for
3955 * DELAY_DEQUEUE.
3956 */
3957 if (!gcfs_rq || !gcfs_rq->load.weight)
3958 return;
3959
3960 if (throttled_hierarchy(gcfs_rq))
3961 return;
3962
3963 shares = calc_group_shares(gcfs_rq);
3964 if (unlikely(se->load.weight != shares))
3965 reweight_entity(cfs_rq_of(se), se, shares);
3966 }
3967
3968 #else /* !CONFIG_FAIR_GROUP_SCHED: */
3969 static inline void update_cfs_group(struct sched_entity *se)
3970 {
3971 }
3972 #endif /* !CONFIG_FAIR_GROUP_SCHED */
3973
3974 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3975 {
3976 struct rq *rq = rq_of(cfs_rq);
3977
3978 if (&rq->cfs == cfs_rq) {
3979 /*
3980 * There are a few boundary cases this might miss but it should
3981 * get called often enough that that should (hopefully) not be
3982 * a real problem.
3983 *
3984 * It will not get called when we go idle, because the idle
3985 * thread is a different class (!fair), nor will the utilization
3986 * number include things like RT tasks.
3987 *
3988 * As is, the util number is not freq-invariant (we'd have to
3989 * implement arch_scale_freq_capacity() for that).
3990 *
3991 * See cpu_util_cfs().
3992 */
3993 cpufreq_update_util(rq, flags);
3994 }
3995 }
3996
3997 static inline bool load_avg_is_decayed(struct sched_avg *sa)
3998 {
3999 if (sa->load_sum)
4000 return false;
4001
4002 if (sa->util_sum)
4003 return false;
4004
4005 if (sa->runnable_sum)
4006 return false;
4007
4008 /*
4009 	 * _avg must be null when _sum are null because _avg = _sum / divider.
4010 * Make sure that rounding and/or propagation of PELT values never
4011 * break this.
4012 */
4013 WARN_ON_ONCE(sa->load_avg ||
4014 sa->util_avg ||
4015 sa->runnable_avg);
4016
4017 return true;
4018 }
4019
4020 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4021 {
4022 return u64_u32_load_copy(cfs_rq->avg.last_update_time,
4023 cfs_rq->last_update_time_copy);
4024 }
4025 #ifdef CONFIG_FAIR_GROUP_SCHED
4026 /*
4027 * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
4028 * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
4029 * bottom-up, we only have to test whether the cfs_rq before us on the list
4030 * is our child.
4031 * If cfs_rq is not on the list, test whether a child needs to be added to
4032 * connect a branch to the tree (see list_add_leaf_cfs_rq() for details).
4033 */
4034 static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
4035 {
4036 struct cfs_rq *prev_cfs_rq;
4037 struct list_head *prev;
4038 struct rq *rq = rq_of(cfs_rq);
4039
4040 if (cfs_rq->on_list) {
4041 prev = cfs_rq->leaf_cfs_rq_list.prev;
4042 } else {
4043 prev = rq->tmp_alone_branch;
4044 }
4045
4046 if (prev == &rq->leaf_cfs_rq_list)
4047 return false;
4048
4049 prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
4050
4051 return (prev_cfs_rq->tg->parent == cfs_rq->tg);
4052 }
4053
4054 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
4055 {
4056 if (cfs_rq->load.weight)
4057 return false;
4058
4059 if (!load_avg_is_decayed(&cfs_rq->avg))
4060 return false;
4061
4062 if (child_cfs_rq_on_list(cfs_rq))
4063 return false;
4064
4065 return true;
4066 }
4067
4068 /**
4069 * update_tg_load_avg - update the tg's load avg
4070 * @cfs_rq: the cfs_rq whose avg changed
4071 *
4072 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
4073 * However, because tg->load_avg is a global value there are performance
4074 * considerations.
4075 *
4076 * In order to avoid having to look at the other cfs_rq's, we use a
4077 * differential update where we store the last value we propagated. This in
4078 * turn allows skipping updates if the differential is 'small'.
4079 *
4080 * Updating tg's load_avg is necessary before update_cfs_share().
4081 */
4082 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
4083 {
4084 long delta;
4085 u64 now;
4086
4087 /*
4088 * No need to update load_avg for root_task_group as it is not used.
4089 */
4090 if (cfs_rq->tg == &root_task_group)
4091 return;
4092
4093 /* rq has been offline and doesn't contribute to the share anymore: */
4094 if (!cpu_active(cpu_of(rq_of(cfs_rq))))
4095 return;
4096
4097 /*
4098 * For migration heavy workloads, access to tg->load_avg can be
4099 * unbound. Limit the update rate to at most once per ms.
4100 */
4101 now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
4102 if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
4103 return;
4104
4105 delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
4106 if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
4107 atomic_long_add(delta, &cfs_rq->tg->load_avg);
4108 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
4109 cfs_rq->last_update_tg_load_avg = now;
4110 }
4111 }
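
/*
 * Illustrative, user-space sketch (not built into the kernel) of the
 * filtering done by update_tg_load_avg() above: the global tg->load_avg is
 * only written when the local delta exceeds ~1/64th of the last propagated
 * value, and at most once per millisecond. The demo_* names are made up
 * for this sketch.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_NSEC_PER_MSEC	1000000ULL

static bool demo_should_propagate(long load_avg, long contrib,
				  unsigned long long now, unsigned long long last)
{
	if (now - last < DEMO_NSEC_PER_MSEC)
		return false;				/* rate limited */

	return labs(load_avg - contrib) > contrib / 64;	/* ignore tiny deltas */
}

int main(void)
{
	printf("%d\n", demo_should_propagate(1000, 990, 2000000, 0));	/* 0: delta too small */
	printf("%d\n", demo_should_propagate(1200, 990, 2000000, 0));	/* 1: worth propagating */
	printf("%d\n", demo_should_propagate(1200, 990, 500000, 0));	/* 0: updated too recently */
	return 0;
}
#endif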
4112
4113 static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
4114 {
4115 long delta;
4116 u64 now;
4117
4118 /*
4119 * No need to update load_avg for root_task_group, as it is not used.
4120 */
4121 if (cfs_rq->tg == &root_task_group)
4122 return;
4123
4124 now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
4125 delta = 0 - cfs_rq->tg_load_avg_contrib;
4126 atomic_long_add(delta, &cfs_rq->tg->load_avg);
4127 cfs_rq->tg_load_avg_contrib = 0;
4128 cfs_rq->last_update_tg_load_avg = now;
4129 }
4130
4131 /* CPU offline callback: */
4132 static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
4133 {
4134 struct task_group *tg;
4135
4136 lockdep_assert_rq_held(rq);
4137
4138 /*
4139 * The rq clock has already been updated in
4140 * set_rq_offline(), so we should skip updating
4141 * the rq clock again in unthrottle_cfs_rq().
4142 */
4143 rq_clock_start_loop_update(rq);
4144
4145 rcu_read_lock();
4146 list_for_each_entry_rcu(tg, &task_groups, list) {
4147 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4148
4149 clear_tg_load_avg(cfs_rq);
4150 }
4151 rcu_read_unlock();
4152
4153 rq_clock_stop_loop_update(rq);
4154 }
4155
4156 /*
4157 * Called within set_task_rq() right before setting a task's CPU. The
4158 * caller only guarantees p->pi_lock is held; no other assumptions,
4159 * including the state of rq->lock, should be made.
4160 */
4161 void set_task_rq_fair(struct sched_entity *se,
4162 struct cfs_rq *prev, struct cfs_rq *next)
4163 {
4164 u64 p_last_update_time;
4165 u64 n_last_update_time;
4166
4167 if (!sched_feat(ATTACH_AGE_LOAD))
4168 return;
4169
4170 /*
4171 	 * We are supposed to update the task to "current" time, so that it is
4172 	 * up to date and ready to go to the new CPU/cfs_rq. But we have
4173 	 * difficulty getting what the current time is, so simply throw away
4174 	 * the out-of-date time. This will result in the wakee task being less
4175 	 * decayed, but giving the wakee more load sounds not bad.
4176 */
4177 if (!(se->avg.last_update_time && prev))
4178 return;
4179
4180 p_last_update_time = cfs_rq_last_update_time(prev);
4181 n_last_update_time = cfs_rq_last_update_time(next);
4182
4183 __update_load_avg_blocked_se(p_last_update_time, se);
4184 se->avg.last_update_time = n_last_update_time;
4185 }
4186
4187 /*
4188 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
4189 * propagate its contribution. The key to this propagation is the invariant
4190 * that for each group:
4191 *
4192 * ge->avg == grq->avg (1)
4193 *
4194 * _IFF_ we look at the pure running and runnable sums. Because they
4195 * represent the very same entity, just at different points in the hierarchy.
4196 *
4197 * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
4198 * and simply copies the running/runnable sum over (but still wrong, because
4199 * the group entity and group rq do not have their PELT windows aligned).
4200 *
4201 * However, update_tg_cfs_load() is more complex. So we have:
4202 *
4203 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
4204 *
4205 * And since, like util, the runnable part should be directly transferable,
4206 * the following would _appear_ to be the straightforward approach:
4207 *
4208 * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
4209 *
4210 * And per (1) we have:
4211 *
4212 * ge->avg.runnable_avg == grq->avg.runnable_avg
4213 *
4214 * Which gives:
4215 *
4216 * ge->load.weight * grq->avg.load_avg
4217 * ge->avg.load_avg = ----------------------------------- (4)
4218 * grq->load.weight
4219 *
4220 * Except that is wrong!
4221 *
4222 * Because while for entities historical weight is not important and we
4223 * really only care about our future and therefore can consider a pure
4224 * runnable sum, runqueues can NOT do this.
4225 *
4226 * We specifically want runqueues to have a load_avg that includes
4227 * historical weights. Those represent the blocked load, the load we expect
4228 * to (shortly) return to us. This only works by keeping the weights as
4229 * integral part of the sum. We therefore cannot decompose as per (3).
4230 *
4231 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
4232 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
4233 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
4234 * runnable section of these tasks overlap (or not). If they were to perfectly
4235 * align the rq as a whole would be runnable 2/3 of the time. If however we
4236 * always have at least 1 runnable task, the rq as a whole is always runnable.
4237 *
4238 * So we'll have to approximate.. :/
4239 *
4240 * Given the constraint:
4241 *
4242 * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
4243 *
4244 * We can construct a rule that adds runnable to a rq by assuming minimal
4245 * overlap.
4246 *
4247 * On removal, we'll assume each task is equally runnable; which yields:
4248 *
4249 * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
4250 *
4251 * XXX: only do this for the part of runnable > running ?
4252 *
4253 */
4254 static inline void
4255 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4256 {
4257 long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
4258 u32 new_sum, divider;
4259
4260 /* Nothing to update */
4261 if (!delta_avg)
4262 return;
4263
4264 /*
4265 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4266 * See ___update_load_avg() for details.
4267 */
4268 divider = get_pelt_divider(&cfs_rq->avg);
4269
4270
4271 /* Set new sched_entity's utilization */
4272 se->avg.util_avg = gcfs_rq->avg.util_avg;
4273 new_sum = se->avg.util_avg * divider;
4274 delta_sum = (long)new_sum - (long)se->avg.util_sum;
4275 se->avg.util_sum = new_sum;
4276
4277 /* Update parent cfs_rq utilization */
4278 add_positive(&cfs_rq->avg.util_avg, delta_avg);
4279 add_positive(&cfs_rq->avg.util_sum, delta_sum);
4280
4281 /* See update_cfs_rq_load_avg() */
4282 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
4283 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
4284 }
4285
4286 static inline void
4287 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4288 {
4289 long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
4290 u32 new_sum, divider;
4291
4292 /* Nothing to update */
4293 if (!delta_avg)
4294 return;
4295
4296 /*
4297 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4298 * See ___update_load_avg() for details.
4299 */
4300 divider = get_pelt_divider(&cfs_rq->avg);
4301
4302 /* Set new sched_entity's runnable */
4303 se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
4304 new_sum = se->avg.runnable_avg * divider;
4305 delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
4306 se->avg.runnable_sum = new_sum;
4307
4308 /* Update parent cfs_rq runnable */
4309 add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
4310 add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
4311 /* See update_cfs_rq_load_avg() */
4312 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
4313 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
4314 }
4315
4316 static inline void
4317 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4318 {
4319 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
4320 unsigned long load_avg;
4321 u64 load_sum = 0;
4322 s64 delta_sum;
4323 u32 divider;
4324
4325 if (!runnable_sum)
4326 return;
4327
4328 gcfs_rq->prop_runnable_sum = 0;
4329
4330 /*
4331 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4332 * See ___update_load_avg() for details.
4333 */
4334 divider = get_pelt_divider(&cfs_rq->avg);
4335
4336 if (runnable_sum >= 0) {
4337 /*
4338 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
4339 * the CPU is saturated running == runnable.
4340 */
4341 runnable_sum += se->avg.load_sum;
4342 runnable_sum = min_t(long, runnable_sum, divider);
4343 } else {
4344 /*
4345 * Estimate the new unweighted runnable_sum of the gcfs_rq by
4346 * assuming all tasks are equally runnable.
4347 */
4348 if (scale_load_down(gcfs_rq->load.weight)) {
4349 load_sum = div_u64(gcfs_rq->avg.load_sum,
4350 scale_load_down(gcfs_rq->load.weight));
4351 }
4352
4353 /* But make sure to not inflate se's runnable */
4354 runnable_sum = min(se->avg.load_sum, load_sum);
4355 }
4356
4357 /*
4358 * runnable_sum can't be lower than running_sum
4359 * Rescale running sum to be in the same range as runnable sum
4360 * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
4361 * runnable_sum is in [0 : LOAD_AVG_MAX]
4362 */
4363 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
4364 runnable_sum = max(runnable_sum, running_sum);
4365
4366 load_sum = se_weight(se) * runnable_sum;
4367 load_avg = div_u64(load_sum, divider);
4368
4369 delta_avg = load_avg - se->avg.load_avg;
4370 if (!delta_avg)
4371 return;
4372
4373 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
4374
4375 se->avg.load_sum = runnable_sum;
4376 se->avg.load_avg = load_avg;
4377 add_positive(&cfs_rq->avg.load_avg, delta_avg);
4378 add_positive(&cfs_rq->avg.load_sum, delta_sum);
4379 /* See update_cfs_rq_load_avg() */
4380 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
4381 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
4382 }
4383
4384 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
4385 {
4386 cfs_rq->propagate = 1;
4387 cfs_rq->prop_runnable_sum += runnable_sum;
4388 }
4389
4390 /* Update task and its cfs_rq load average */
4391 static inline int propagate_entity_load_avg(struct sched_entity *se)
4392 {
4393 struct cfs_rq *cfs_rq, *gcfs_rq;
4394
4395 if (entity_is_task(se))
4396 return 0;
4397
4398 gcfs_rq = group_cfs_rq(se);
4399 if (!gcfs_rq->propagate)
4400 return 0;
4401
4402 gcfs_rq->propagate = 0;
4403
4404 cfs_rq = cfs_rq_of(se);
4405
4406 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
4407
4408 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
4409 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
4410 update_tg_cfs_load(cfs_rq, se, gcfs_rq);
4411
4412 trace_pelt_cfs_tp(cfs_rq);
4413 trace_pelt_se_tp(se);
4414
4415 return 1;
4416 }
4417
4418 /*
4419 * Check if we need to update the load and the utilization of a blocked
4420 * group_entity:
4421 */
4422 static inline bool skip_blocked_update(struct sched_entity *se)
4423 {
4424 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
4425
4426 /*
4427 	 * If the sched_entity still has non-zero load or utilization, we have to
4428 * decay it:
4429 */
4430 if (se->avg.load_avg || se->avg.util_avg)
4431 return false;
4432
4433 /*
4434 * If there is a pending propagation, we have to update the load and
4435 * the utilization of the sched_entity:
4436 */
4437 if (gcfs_rq->propagate)
4438 return false;
4439
4440 /*
4441 	 * Otherwise, the load and the utilization of the sched_entity are
4442 	 * already zero and there is no pending propagation, so it would be a
4443 * waste of time to try to decay it:
4444 */
4445 return true;
4446 }
4447
4448 #else /* !CONFIG_FAIR_GROUP_SCHED: */
4449
4450 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
4451
4452 static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
4453
4454 static inline int propagate_entity_load_avg(struct sched_entity *se)
4455 {
4456 return 0;
4457 }
4458
4459 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
4460
4461 #endif /* !CONFIG_FAIR_GROUP_SCHED */
4462
4463 #ifdef CONFIG_NO_HZ_COMMON
4464 static inline void migrate_se_pelt_lag(struct sched_entity *se)
4465 {
4466 u64 throttled = 0, now, lut;
4467 struct cfs_rq *cfs_rq;
4468 struct rq *rq;
4469 bool is_idle;
4470
4471 if (load_avg_is_decayed(&se->avg))
4472 return;
4473
4474 cfs_rq = cfs_rq_of(se);
4475 rq = rq_of(cfs_rq);
4476
4477 rcu_read_lock();
4478 is_idle = is_idle_task(rcu_dereference(rq->curr));
4479 rcu_read_unlock();
4480
4481 /*
4482 * The lag estimation comes with a cost we don't want to pay all the
4483 	 * time. Hence, limit it to the case where the source CPU is idle and
4484 	 * we know we are at the greatest risk of having an outdated clock.
4485 */
4486 if (!is_idle)
4487 return;
4488
4489 /*
4490 * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where:
4491 *
4492 * last_update_time (the cfs_rq's last_update_time)
4493 * = cfs_rq_clock_pelt()@cfs_rq_idle
4494 * = rq_clock_pelt()@cfs_rq_idle
4495 * - cfs->throttled_clock_pelt_time@cfs_rq_idle
4496 *
4497 * cfs_idle_lag (delta between rq's update and cfs_rq's update)
4498 * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle
4499 *
4500 * rq_idle_lag (delta between now and rq's update)
4501 * = sched_clock_cpu() - rq_clock()@rq_idle
4502 *
4503 * We can then write:
4504 *
4505 * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time +
4506 * sched_clock_cpu() - rq_clock()@rq_idle
4507 * Where:
4508 * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle
4509 * rq_clock()@rq_idle is rq->clock_idle
4510 * cfs->throttled_clock_pelt_time@cfs_rq_idle
4511 * is cfs_rq->throttled_pelt_idle
4512 */
4513
4514 #ifdef CONFIG_CFS_BANDWIDTH
4515 throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
4516 /* The clock has been stopped for throttling */
4517 if (throttled == U64_MAX)
4518 return;
4519 #endif
4520 now = u64_u32_load(rq->clock_pelt_idle);
4521 /*
4522 	 * Paired with _update_idle_rq_clock_pelt(). It ensures that, in the worst
4523 	 * case, we observe the old clock_pelt_idle value and the new clock_idle,
4524 	 * which leads to an underestimation. The opposite would lead to an
4525 * overestimation.
4526 */
4527 smp_rmb();
4528 lut = cfs_rq_last_update_time(cfs_rq);
4529
4530 now -= throttled;
4531 if (now < lut)
4532 /*
4533 * cfs_rq->avg.last_update_time is more recent than our
4534 * estimation, let's use it.
4535 */
4536 now = lut;
4537 else
4538 now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle);
4539
4540 __update_load_avg_blocked_se(now, se);
4541 }
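
/*
 * Illustrative, user-space sketch (not built into the kernel) of the "now"
 * estimation in migrate_se_pelt_lag() above: take the PELT clock sampled
 * when the source rq went idle, subtract throttled time, add the wall-clock
 * time that has passed since going idle, and never go behind the cfs_rq's
 * last_update_time. All values and names are made up for this sketch.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long long clock_pelt_idle = 1000000;	/* rq_clock_pelt() at idle */
	unsigned long long clock_idle = 1200000;	/* rq_clock() at idle */
	unsigned long long sched_clock_now = 1700000;	/* sched_clock_cpu() */
	unsigned long long throttled = 50000;		/* throttled PELT time */
	unsigned long long lut = 900000;		/* cfs_rq last_update_time */

	unsigned long long now = clock_pelt_idle - throttled;

	if (now < lut)
		now = lut;				/* estimate is stale, trust lut */
	else
		now += sched_clock_now - clock_idle;	/* add the idle wall time */

	printf("estimated PELT now: %llu\n", now);	/* 950000 + 500000 = 1450000 */
	return 0;
}
#endif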
4542 #else /* !CONFIG_NO_HZ_COMMON: */
4543 static void migrate_se_pelt_lag(struct sched_entity *se) {}
4544 #endif /* !CONFIG_NO_HZ_COMMON */
4545
4546 /**
4547 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
4548 * @now: current time, as per cfs_rq_clock_pelt()
4549 * @cfs_rq: cfs_rq to update
4550 *
4551 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
4552 * avg. The immediate corollary is that all (fair) tasks must be attached.
4553 *
4554 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
4555 *
4556 * Return: true if the load decayed or we removed load.
4557 *
4558 * Since both these conditions indicate a changed cfs_rq->avg.load we should
4559 * call update_tg_load_avg() when this function returns true.
4560 */
4561 static inline int
4562 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
4563 {
4564 unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
4565 struct sched_avg *sa = &cfs_rq->avg;
4566 int decayed = 0;
4567
4568 if (cfs_rq->removed.nr) {
4569 unsigned long r;
4570 u32 divider = get_pelt_divider(&cfs_rq->avg);
4571
4572 raw_spin_lock(&cfs_rq->removed.lock);
4573 swap(cfs_rq->removed.util_avg, removed_util);
4574 swap(cfs_rq->removed.load_avg, removed_load);
4575 swap(cfs_rq->removed.runnable_avg, removed_runnable);
4576 cfs_rq->removed.nr = 0;
4577 raw_spin_unlock(&cfs_rq->removed.lock);
4578
4579 r = removed_load;
4580 sub_positive(&sa->load_avg, r);
4581 sub_positive(&sa->load_sum, r * divider);
4582 /* See sa->util_sum below */
4583 sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
4584
4585 r = removed_util;
4586 sub_positive(&sa->util_avg, r);
4587 sub_positive(&sa->util_sum, r * divider);
4588 /*
4589 		 * Because of rounding, se->util_sum might end up being +1 more than
4590 		 * cfs->util_sum. Although this is not a problem by itself, detaching
4591 		 * a lot of tasks with this rounding problem between 2 updates of
4592 		 * util_avg (~1ms) can make cfs->util_sum become zero while
4593 		 * cfs->util_avg is not.
4594 * Check that util_sum is still above its lower bound for the new
4595 * util_avg. Given that period_contrib might have moved since the last
4596 * sync, we are only sure that util_sum must be above or equal to
4597 * util_avg * minimum possible divider
4598 */
4599 sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
4600
4601 r = removed_runnable;
4602 sub_positive(&sa->runnable_avg, r);
4603 sub_positive(&sa->runnable_sum, r * divider);
4604 /* See sa->util_sum above */
4605 sa->runnable_sum = max_t(u32, sa->runnable_sum,
4606 sa->runnable_avg * PELT_MIN_DIVIDER);
4607
4608 /*
4609 * removed_runnable is the unweighted version of removed_load so we
4610 * can use it to estimate removed_load_sum.
4611 */
4612 add_tg_cfs_propagate(cfs_rq,
4613 -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
4614
4615 decayed = 1;
4616 }
4617
4618 decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
4619 u64_u32_store_copy(sa->last_update_time,
4620 cfs_rq->last_update_time_copy,
4621 sa->last_update_time);
4622 return decayed;
4623 }
4624
4625 /**
4626 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
4627 * @cfs_rq: cfs_rq to attach to
4628 * @se: sched_entity to attach
4629 *
4630 * Must call update_cfs_rq_load_avg() before this, since we rely on
4631 * cfs_rq->avg.last_update_time being current.
4632 */
4633 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4634 {
4635 /*
4636 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4637 * See ___update_load_avg() for details.
4638 */
4639 u32 divider = get_pelt_divider(&cfs_rq->avg);
4640
4641 /*
4642 * When we attach the @se to the @cfs_rq, we must align the decay
4643 * window because without that, really weird and wonderful things can
4644 * happen.
4645 *
4646 * XXX illustrate
4647 */
4648 se->avg.last_update_time = cfs_rq->avg.last_update_time;
4649 se->avg.period_contrib = cfs_rq->avg.period_contrib;
4650
4651 /*
4652 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
4653 * period_contrib. This isn't strictly correct, but since we're
4654 * entirely outside of the PELT hierarchy, nobody cares if we truncate
4655 * _sum a little.
4656 */
4657 se->avg.util_sum = se->avg.util_avg * divider;
4658
4659 se->avg.runnable_sum = se->avg.runnable_avg * divider;
4660
4661 se->avg.load_sum = se->avg.load_avg * divider;
4662 if (se_weight(se) < se->avg.load_sum)
4663 se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
4664 else
4665 se->avg.load_sum = 1;
4666
4667 enqueue_load_avg(cfs_rq, se);
4668 cfs_rq->avg.util_avg += se->avg.util_avg;
4669 cfs_rq->avg.util_sum += se->avg.util_sum;
4670 cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
4671 cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
4672
4673 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
4674
4675 cfs_rq_util_change(cfs_rq, 0);
4676
4677 trace_pelt_cfs_tp(cfs_rq);
4678 }
4679
4680 /**
4681 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
4682 * @cfs_rq: cfs_rq to detach from
4683 * @se: sched_entity to detach
4684 *
4685 * Must call update_cfs_rq_load_avg() before this, since we rely on
4686 * cfs_rq->avg.last_update_time being current.
4687 */
4688 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4689 {
4690 dequeue_load_avg(cfs_rq, se);
4691 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
4692 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
4693 /* See update_cfs_rq_load_avg() */
4694 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
4695 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
4696
4697 sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
4698 sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
4699 /* See update_cfs_rq_load_avg() */
4700 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
4701 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
4702
4703 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
4704
4705 cfs_rq_util_change(cfs_rq, 0);
4706
4707 trace_pelt_cfs_tp(cfs_rq);
4708 }
4709
4710 /*
4711 * Optional action to be done while updating the load average
4712 */
4713 #define UPDATE_TG 0x1
4714 #define SKIP_AGE_LOAD 0x2
4715 #define DO_ATTACH 0x4
4716 #define DO_DETACH 0x8
4717
4718 /* Update task and its cfs_rq load average */
4719 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4720 {
4721 u64 now = cfs_rq_clock_pelt(cfs_rq);
4722 int decayed;
4723
4724 /*
4725 * Track task load average for carrying it to new CPU after migrated, and
4726 * track group sched_entity load average for task_h_load calculation in migration
4727 */
4728 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
4729 __update_load_avg_se(now, cfs_rq, se);
4730
4731 decayed = update_cfs_rq_load_avg(now, cfs_rq);
4732 decayed |= propagate_entity_load_avg(se);
4733
4734 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
4735
4736 /*
4737 * DO_ATTACH means we're here from enqueue_entity().
4738 * !last_update_time means we've passed through
4739 * migrate_task_rq_fair() indicating we migrated.
4740 *
4741 * IOW we're enqueueing a task on a new CPU.
4742 */
4743 attach_entity_load_avg(cfs_rq, se);
4744 update_tg_load_avg(cfs_rq);
4745
4746 } else if (flags & DO_DETACH) {
4747 /*
4748 * DO_DETACH means we're here from dequeue_entity()
4749 * and we are migrating task out of the CPU.
4750 */
4751 detach_entity_load_avg(cfs_rq, se);
4752 update_tg_load_avg(cfs_rq);
4753 } else if (decayed) {
4754 cfs_rq_util_change(cfs_rq, 0);
4755
4756 if (flags & UPDATE_TG)
4757 update_tg_load_avg(cfs_rq);
4758 }
4759 }
4760
4761 /*
4762 * Synchronize entity load avg of dequeued entity without locking
4763 * the previous rq.
4764 */
4765 static void sync_entity_load_avg(struct sched_entity *se)
4766 {
4767 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4768 u64 last_update_time;
4769
4770 last_update_time = cfs_rq_last_update_time(cfs_rq);
4771 __update_load_avg_blocked_se(last_update_time, se);
4772 }
4773
4774 /*
4775 * Task first catches up with cfs_rq, and then subtract
4776 * itself from the cfs_rq (task must be off the queue now).
4777 */
4778 static void remove_entity_load_avg(struct sched_entity *se)
4779 {
4780 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4781 unsigned long flags;
4782
4783 /*
4784 * tasks cannot exit without having gone through wake_up_new_task() ->
4785 * enqueue_task_fair() which will have added things to the cfs_rq,
4786 * so we can remove unconditionally.
4787 */
4788
4789 sync_entity_load_avg(se);
4790
4791 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
4792 ++cfs_rq->removed.nr;
4793 cfs_rq->removed.util_avg += se->avg.util_avg;
4794 cfs_rq->removed.load_avg += se->avg.load_avg;
4795 cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
4796 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
4797 }
4798
4799 static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
4800 {
4801 return cfs_rq->avg.runnable_avg;
4802 }
4803
4804 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
4805 {
4806 return cfs_rq->avg.load_avg;
4807 }
4808
4809 static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf);
4810
4811 static inline unsigned long task_util(struct task_struct *p)
4812 {
4813 return READ_ONCE(p->se.avg.util_avg);
4814 }
4815
4816 static inline unsigned long task_runnable(struct task_struct *p)
4817 {
4818 return READ_ONCE(p->se.avg.runnable_avg);
4819 }
4820
4821 static inline unsigned long _task_util_est(struct task_struct *p)
4822 {
4823 return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
4824 }
4825
4826 static inline unsigned long task_util_est(struct task_struct *p)
4827 {
4828 return max(task_util(p), _task_util_est(p));
4829 }
4830
4831 static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
4832 struct task_struct *p)
4833 {
4834 unsigned int enqueued;
4835
4836 if (!sched_feat(UTIL_EST))
4837 return;
4838
4839 /* Update root cfs_rq's estimated utilization */
4840 enqueued = cfs_rq->avg.util_est;
4841 enqueued += _task_util_est(p);
4842 WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
4843
4844 trace_sched_util_est_cfs_tp(cfs_rq);
4845 }
4846
4847 static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
4848 struct task_struct *p)
4849 {
4850 unsigned int enqueued;
4851
4852 if (!sched_feat(UTIL_EST))
4853 return;
4854
4855 /* Update root cfs_rq's estimated utilization */
4856 enqueued = cfs_rq->avg.util_est;
4857 enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
4858 WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
4859
4860 trace_sched_util_est_cfs_tp(cfs_rq);
4861 }
4862
4863 #define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
4864
4865 static inline void util_est_update(struct cfs_rq *cfs_rq,
4866 struct task_struct *p,
4867 bool task_sleep)
4868 {
4869 unsigned int ewma, dequeued, last_ewma_diff;
4870
4871 if (!sched_feat(UTIL_EST))
4872 return;
4873
4874 /*
4875 * Skip update of task's estimated utilization when the task has not
4876 * yet completed an activation, e.g. being migrated.
4877 */
4878 if (!task_sleep)
4879 return;
4880
4881 /* Get current estimate of utilization */
4882 ewma = READ_ONCE(p->se.avg.util_est);
4883
4884 /*
4885 * If the PELT values haven't changed since enqueue time,
4886 * skip the util_est update.
4887 */
4888 if (ewma & UTIL_AVG_UNCHANGED)
4889 return;
4890
4891 /* Get utilization at dequeue */
4892 dequeued = task_util(p);
4893
4894 /*
4895 * Reset EWMA on utilization increases, the moving average is used only
4896 * to smooth utilization decreases.
4897 */
4898 if (ewma <= dequeued) {
4899 ewma = dequeued;
4900 goto done;
4901 }
4902
4903 /*
4904 	 * Skip the update of the task's estimated utilization when it is
4905 	 * already within ~1% of its last activation value.
4906 */
4907 last_ewma_diff = ewma - dequeued;
4908 if (last_ewma_diff < UTIL_EST_MARGIN)
4909 goto done;
4910
4911 /*
4912 	 * To avoid underestimating task utilization, skip EWMA updates if we
4913 	 * cannot guarantee that the thread got all the CPU time it wanted.
4914 */
4915 if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
4916 goto done;
4917
4918
4919 /*
4920 * Update Task's estimated utilization
4921 *
4922 * When *p completes an activation we can consolidate another sample
4923 * of the task size. This is done by using this value to update the
4924 * Exponential Weighted Moving Average (EWMA):
4925 *
4926 * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
4927 * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
4928 * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
4929 * = w * ( -last_ewma_diff ) + ewma(t-1)
4930 * = w * (-last_ewma_diff + ewma(t-1) / w)
4931 *
4932 * Where 'w' is the weight of new samples, which is configured to be
4933 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
4934 */
4935 ewma <<= UTIL_EST_WEIGHT_SHIFT;
4936 ewma -= last_ewma_diff;
4937 ewma >>= UTIL_EST_WEIGHT_SHIFT;
4938 done:
4939 ewma |= UTIL_AVG_UNCHANGED;
4940 WRITE_ONCE(p->se.avg.util_est, ewma);
4941
4942 trace_sched_util_est_se_tp(&p->se);
4943 }
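
/*
 * Illustrative, user-space sketch (not built into the kernel) of the EWMA
 * update performed by util_est_update() above with w = 1/4:
 * ewma(t) = ewma(t-1) - (ewma(t-1) - dequeued) / 4, implemented with the
 * same shift trick. The demo_* names are made up for this sketch.
 */
#if 0
#include <stdio.h>

#define DEMO_EST_WEIGHT_SHIFT	2	/* w = 1/4 */

static unsigned int demo_ewma_update(unsigned int ewma, unsigned int dequeued)
{
	unsigned int last_ewma_diff;

	if (ewma <= dequeued)
		return dequeued;	/* utilization increases reset the EWMA */

	last_ewma_diff = ewma - dequeued;
	ewma <<= DEMO_EST_WEIGHT_SHIFT;
	ewma -= last_ewma_diff;
	ewma >>= DEMO_EST_WEIGHT_SHIFT;

	return ewma;
}

int main(void)
{
	printf("%u\n", demo_ewma_update(400, 200));	/* 400 - 200/4 = 350 */
	printf("%u\n", demo_ewma_update(300, 500));	/* increase: reset to 500 */
	return 0;
}
#endif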
4944
4945 static inline unsigned long get_actual_cpu_capacity(int cpu)
4946 {
4947 unsigned long capacity = arch_scale_cpu_capacity(cpu);
4948
4949 capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
4950
4951 return capacity;
4952 }
4953
4954 static inline int util_fits_cpu(unsigned long util,
4955 unsigned long uclamp_min,
4956 unsigned long uclamp_max,
4957 int cpu)
4958 {
4959 unsigned long capacity = capacity_of(cpu);
4960 unsigned long capacity_orig;
4961 bool fits, uclamp_max_fits;
4962
4963 /*
4964 * Check if the real util fits without any uclamp boost/cap applied.
4965 */
4966 fits = fits_capacity(util, capacity);
4967
4968 if (!uclamp_is_used())
4969 return fits;
4970
4971 /*
4972 * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
4973 * uclamp_max. We only care about capacity pressure (by using
4974 * capacity_of()) for comparing against the real util.
4975 *
4976 * If a task is boosted to 1024 for example, we don't want a tiny
4977 * pressure to skew the check whether it fits a CPU or not.
4978 *
4979 * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
4980 * should fit a little cpu even if there's some pressure.
4981 *
4982 * Only exception is for HW or cpufreq pressure since it has a direct impact
4983 * on available OPP of the system.
4984 *
4985 * We honour it for uclamp_min only as a drop in performance level
4986 * could result in not getting the requested minimum performance level.
4987 *
4988 * For uclamp_max, we can tolerate a drop in performance level as the
4989 * goal is to cap the task. So it's okay if it's getting less.
4990 */
4991 capacity_orig = arch_scale_cpu_capacity(cpu);
4992
4993 /*
4994 * We want to force a task to fit a cpu as implied by uclamp_max.
4995 * But we do have some corner cases to cater for..
4996 *
4997 *
4998 * C=z
4999 * | ___
5000 * | C=y | |
5001 * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
5002 * | C=x | | | |
5003 * | ___ | | | |
5004 * | | | | | | | (util somewhere in this region)
5005 * | | | | | | |
5006 * | | | | | | |
5007 * +----------------------------------------
5008 * CPU0 CPU1 CPU2
5009 *
5010 * In the above example if a task is capped to a specific performance
5011 * point, y, then when:
5012 *
5013 * * util = 80% of x then it does not fit on CPU0 and should migrate
5014 * to CPU1
5015 * * util = 80% of y then it is forced to fit on CPU1 to honour
5016 * uclamp_max request.
5017 *
5018 * which is what we're enforcing here. A task always fits if
5019 * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
5020 * the normal upmigration rules still apply.
5021 *
5022 * Only exception is when we are on max capacity, then we need to be
5023 * careful not to block overutilized state. This is so because:
5024 *
5025 * 1. There's no concept of capping at max_capacity! We can't go
5026 * beyond this performance level anyway.
5027 * 2. The system is being saturated when we're operating near
5028 * max capacity, so it doesn't make sense to block overutilized.
5029 */
5030 uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
5031 uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
5032 fits = fits || uclamp_max_fits;
5033
5034 /*
5035 *
5036 * C=z
5037 * | ___ (region a, capped, util >= uclamp_max)
5038 * | C=y | |
5039 * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
5040 * | C=x | | | |
5041 * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
5042 * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
5043 * | | | | | | |
5044 * | | | | | | | (region c, boosted, util < uclamp_min)
5045 * +----------------------------------------
5046 * CPU0 CPU1 CPU2
5047 *
5048 * a) If util > uclamp_max, then we're capped, we don't care about
5049 * actual fitness value here. We only care if uclamp_max fits
5050 * capacity without taking margin/pressure into account.
5051 * See comment above.
5052 *
5053 * b) If uclamp_min <= util <= uclamp_max, then the normal
5054 * fits_capacity() rules apply. Except we need to ensure that we
5055 * enforce we remain within uclamp_max, see comment above.
5056 *
5057 * c) If util < uclamp_min, then we are boosted. Same as (b) but we
5058 * need to take into account the boosted value fits the CPU without
5059 * taking margin/pressure into account.
5060 *
5061 * Cases (a) and (b) are handled in the 'fits' variable already. We
5062 * just need to consider an extra check for case (c) after ensuring we
5063 * handle the case uclamp_min > uclamp_max.
5064 */
5065 uclamp_min = min(uclamp_min, uclamp_max);
5066 if (fits && (util < uclamp_min) &&
5067 (uclamp_min > get_actual_cpu_capacity(cpu)))
5068 return -1;
5069
5070 return fits;
5071 }
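
/*
 * Illustrative, user-space sketch (not built into the kernel) of the
 * fits_capacity() margin used by util_fits_cpu() above: utilization fits a
 * CPU only while it stays below ~80% of the capacity, leaving ~20% of
 * headroom. The demo_* names and capacity values are made up for this
 * sketch.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

static bool demo_fits_capacity(unsigned long util, unsigned long capacity)
{
	return util * 1280 < capacity * 1024;	/* util < capacity / 1.25 */
}

int main(void)
{
	unsigned long little_cap = 446, big_cap = 1024;

	printf("%d\n", demo_fits_capacity(300, little_cap));	/* 1: 300 < ~356 */
	printf("%d\n", demo_fits_capacity(400, little_cap));	/* 0: must upmigrate */
	printf("%d\n", demo_fits_capacity(400, big_cap));	/* 1: fits a big CPU */
	return 0;
}
#endif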
5072
5073 static inline int task_fits_cpu(struct task_struct *p, int cpu)
5074 {
5075 unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
5076 unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
5077 unsigned long util = task_util_est(p);
5078 /*
5079 * Return true only if the cpu fully fits the task requirements, which
5080 * include the utilization but also the performance hints.
5081 */
5082 return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
5083 }
5084
5085 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
5086 {
5087 int cpu = cpu_of(rq);
5088
5089 if (!sched_asym_cpucap_active())
5090 return;
5091
5092 /*
5093 	 * Does affinity allow us to go somewhere higher? Or are we on the biggest
5094 	 * available CPU already? Or do we fit into this CPU?
5095 */
5096 if (!p || (p->nr_cpus_allowed == 1) ||
5097 (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
5098 task_fits_cpu(p, cpu)) {
5099
5100 rq->misfit_task_load = 0;
5101 return;
5102 }
5103
5104 /*
5105 * Make sure that misfit_task_load will not be null even if
5106 * task_h_load() returns 0.
5107 */
5108 rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
5109 }
5110
5111 void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
5112 {
5113 struct sched_entity *se = &p->se;
5114
5115 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
5116 if (attr->sched_runtime) {
5117 se->custom_slice = 1;
5118 se->slice = clamp_t(u64, attr->sched_runtime,
5119 NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
5120 NSEC_PER_MSEC*100); /* HZ=100 / 10 */
5121 } else {
5122 se->custom_slice = 0;
5123 se->slice = sysctl_sched_base_slice;
5124 }
5125 }
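
/*
 * Illustrative, user-space sketch (not built into the kernel) of the
 * custom-slice clamping in __setparam_fair() above: a requested
 * sched_runtime is clamped to the [0.1ms, 100ms] range before it is used
 * as the entity's slice. The demo_* names are made up for this sketch.
 */
#if 0
#include <stdio.h>

#define DEMO_NSEC_PER_MSEC	1000000ULL

static unsigned long long demo_clamp_slice(unsigned long long runtime_ns)
{
	unsigned long long lo = DEMO_NSEC_PER_MSEC / 10;	/* 0.1 ms */
	unsigned long long hi = DEMO_NSEC_PER_MSEC * 100;	/* 100 ms */

	if (runtime_ns < lo)
		return lo;
	if (runtime_ns > hi)
		return hi;
	return runtime_ns;
}

int main(void)
{
	printf("%llu\n", demo_clamp_slice(50000));	/* too short: -> 100000 */
	printf("%llu\n", demo_clamp_slice(3000000));	/* in range: unchanged */
	return 0;
}
#endif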
5126
5127 static void
5128 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5129 {
5130 u64 vslice, vruntime = avg_vruntime(cfs_rq);
5131 s64 lag = 0;
5132
5133 if (!se->custom_slice)
5134 se->slice = sysctl_sched_base_slice;
5135 vslice = calc_delta_fair(se->slice, se);
5136
5137 /*
5138 * Due to how V is constructed as the weighted average of entities,
5139 * adding tasks with positive lag, or removing tasks with negative lag
5140 * will move 'time' backwards, this can screw around with the lag of
5141 * other tasks.
5142 *
5143 * EEVDF: placement strategy #1 / #2
5144 */
5145 if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
5146 struct sched_entity *curr = cfs_rq->curr;
5147 unsigned long load;
5148
5149 lag = se->vlag;
5150
5151 /*
5152 * If we want to place a task and preserve lag, we have to
5153 * consider the effect of the new entity on the weighted
5154 * average and compensate for this, otherwise lag can quickly
5155 * evaporate.
5156 *
5157 * Lag is defined as:
5158 *
5159 * lag_i = S - s_i = w_i * (V - v_i)
5160 *
5161 * To avoid the 'w_i' term all over the place, we only track
5162 * the virtual lag:
5163 *
5164 * vl_i = V - v_i <=> v_i = V - vl_i
5165 *
5166 * And we take V to be the weighted average of all v:
5167 *
5168 * V = (\Sum w_j*v_j) / W
5169 *
5170 * Where W is: \Sum w_j
5171 *
5172 * Then, the weighted average after adding an entity with lag
5173 * vl_i is given by:
5174 *
5175 * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
5176 * = (W*V + w_i*(V - vl_i)) / (W + w_i)
5177 * = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
5178 * = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
5179 * = V - w_i*vl_i / (W + w_i)
5180 *
5181 * And the actual lag after adding an entity with vl_i is:
5182 *
5183 * vl'_i = V' - v_i
5184 * = V - w_i*vl_i / (W + w_i) - (V - vl_i)
5185 * = vl_i - w_i*vl_i / (W + w_i)
5186 *
5187 * Which is strictly less than vl_i. So in order to preserve lag
5188 * we should inflate the lag before placement such that the
5189 * effective lag after placement comes out right.
5190 *
5191 * As such, invert the above relation for vl'_i to get the vl_i
5192 * we need to use such that the lag after placement is the lag
5193 * we computed before dequeue.
5194 *
5195 * vl'_i = vl_i - w_i*vl_i / (W + w_i)
5196 * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
5197 *
5198 * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
5199 * = W*vl_i
5200 *
5201 * vl_i = (W + w_i)*vl'_i / W
5202 */
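/*
 * Worked example (illustrative numbers only): with W = 3072 and
 * w_i = 1024, preserving a post-placement lag vl'_i of 1ms requires
 * inflating it to vl_i = (3072 + 1024) * 1ms / 3072 ~= 1.33ms;
 * averaging the new entity into V then deflates it back to ~1ms,
 * which is exactly what the multiply/divide below implements.
 */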
5203 load = cfs_rq->avg_load;
5204 if (curr && curr->on_rq)
5205 load += scale_load_down(curr->load.weight);
5206
5207 lag *= load + scale_load_down(se->load.weight);
5208 if (WARN_ON_ONCE(!load))
5209 load = 1;
5210 lag = div_s64(lag, load);
5211 }
5212
5213 se->vruntime = vruntime - lag;
5214
5215 if (se->rel_deadline) {
5216 se->deadline += se->vruntime;
5217 se->rel_deadline = 0;
5218 return;
5219 }
5220
5221 /*
5222 * When joining the competition, the existing tasks will be,
5223 * on average, halfway through their slice, as such start tasks
5224 * off with half a slice to ease into the competition.
5225 */
5226 if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
5227 vslice /= 2;
5228
5229 /*
5230 * EEVDF: vd_i = ve_i + r_i/w_i
5231 */
5232 se->deadline = se->vruntime + vslice;
5233 }
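/*
 * Rough numeric example of the placement above (assuming default
 * tunables and a nice-0 task, so vslice == se->slice): with a ~0.7ms
 * base slice, an ENQUEUE_INITIAL task is placed at vruntime = V - lag
 * with deadline = vruntime + ~0.35ms (half a slice), while any other
 * enqueue gets the full ~0.7ms, per vd_i = ve_i + r_i/w_i.
 */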
5234
5235 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
5236 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
5237
5238 static void
5239 requeue_delayed_entity(struct sched_entity *se);
5240
5241 static void
5242 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5243 {
5244 bool curr = cfs_rq->curr == se;
5245
5246 /*
5247 * If we're the current task, we must renormalise before calling
5248 * update_curr().
5249 */
5250 if (curr)
5251 place_entity(cfs_rq, se, flags);
5252
5253 update_curr(cfs_rq);
5254
5255 /*
5256 * When enqueuing a sched_entity, we must:
5257 * - Update loads to have both entity and cfs_rq synced with now.
5258 * - For group_entity, update its runnable_weight to reflect the new
5259 * h_nr_runnable of its group cfs_rq.
5260 * - For group_entity, update its weight to reflect the new share of
5261 * its group cfs_rq
5262 * - Add its new weight to cfs_rq->load.weight
5263 */
5264 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
5265 se_update_runnable(se);
5266 /*
5267 * XXX update_load_avg() above will have attached us to the pelt sum;
5268 * but update_cfs_group() here will re-adjust the weight and have to
5269 * undo/redo all that. Seems wasteful.
5270 */
5271 update_cfs_group(se);
5272
5273 /*
5274 * XXX now that the entity has been re-weighted, and its lag adjusted,
5275 * we can place the entity.
5276 */
5277 if (!curr)
5278 place_entity(cfs_rq, se, flags);
5279
5280 account_entity_enqueue(cfs_rq, se);
5281
5282 /* Entity has migrated, no longer consider this task hot */
5283 if (flags & ENQUEUE_MIGRATED)
5284 se->exec_start = 0;
5285
5286 check_schedstat_required();
5287 update_stats_enqueue_fair(cfs_rq, se, flags);
5288 if (!curr)
5289 __enqueue_entity(cfs_rq, se);
5290 se->on_rq = 1;
5291
5292 if (cfs_rq->nr_queued == 1) {
5293 check_enqueue_throttle(cfs_rq);
5294 if (!throttled_hierarchy(cfs_rq)) {
5295 list_add_leaf_cfs_rq(cfs_rq);
5296 } else {
5297 #ifdef CONFIG_CFS_BANDWIDTH
5298 struct rq *rq = rq_of(cfs_rq);
5299
5300 if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
5301 cfs_rq->throttled_clock = rq_clock(rq);
5302 if (!cfs_rq->throttled_clock_self)
5303 cfs_rq->throttled_clock_self = rq_clock(rq);
5304 #endif
5305 }
5306 }
5307 }
5308
5309 static void __clear_buddies_next(struct sched_entity *se)
5310 {
5311 for_each_sched_entity(se) {
5312 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5313 if (cfs_rq->next != se)
5314 break;
5315
5316 cfs_rq->next = NULL;
5317 }
5318 }
5319
5320 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
5321 {
5322 if (cfs_rq->next == se)
5323 __clear_buddies_next(se);
5324 }
5325
5326 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
5327
5328 static void set_delayed(struct sched_entity *se)
5329 {
5330 se->sched_delayed = 1;
5331
5332 /*
5333 * Delayed se of cfs_rq have no tasks queued on them.
5334 * Do not adjust h_nr_runnable since dequeue_entities()
5335 * will account it for blocked tasks.
5336 */
5337 if (!entity_is_task(se))
5338 return;
5339
5340 for_each_sched_entity(se) {
5341 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5342
5343 cfs_rq->h_nr_runnable--;
5344 if (cfs_rq_throttled(cfs_rq))
5345 break;
5346 }
5347 }
5348
5349 static void clear_delayed(struct sched_entity *se)
5350 {
5351 se->sched_delayed = 0;
5352
5353 /*
5354 * Delayed se of cfs_rq have no tasks queued on them.
5355 * Do not adjust h_nr_runnable since a dequeue has
5356 * already accounted for it or an enqueue of a task
5357 * below it will account for it in enqueue_task_fair().
5358 */
5359 if (!entity_is_task(se))
5360 return;
5361
5362 for_each_sched_entity(se) {
5363 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5364
5365 cfs_rq->h_nr_runnable++;
5366 if (cfs_rq_throttled(cfs_rq))
5367 break;
5368 }
5369 }
5370
5371 static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
5372 {
5373 clear_delayed(se);
5374 if (sched_feat(DELAY_ZERO) && se->vlag > 0)
5375 se->vlag = 0;
5376 }
5377
5378 static bool
5379 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5380 {
5381 bool sleep = flags & DEQUEUE_SLEEP;
5382 int action = UPDATE_TG;
5383
5384 update_curr(cfs_rq);
5385 clear_buddies(cfs_rq, se);
5386
5387 if (flags & DEQUEUE_DELAYED) {
5388 WARN_ON_ONCE(!se->sched_delayed);
5389 } else {
5390 bool delay = sleep;
5391 /*
5392 * DELAY_DEQUEUE relies on spurious wakeups, special task
5393 * states must not suffer spurious wakeups, exempt them.
5394 */
5395 if (flags & DEQUEUE_SPECIAL)
5396 delay = false;
5397
5398 WARN_ON_ONCE(delay && se->sched_delayed);
5399
5400 if (sched_feat(DELAY_DEQUEUE) && delay &&
5401 !entity_eligible(cfs_rq, se)) {
5402 update_load_avg(cfs_rq, se, 0);
5403 set_delayed(se);
5404 return false;
5405 }
5406 }
5407
5408 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
5409 action |= DO_DETACH;
5410
5411 /*
5412 * When dequeuing a sched_entity, we must:
5413 * - Update loads to have both entity and cfs_rq synced with now.
5414 * - For group_entity, update its runnable_weight to reflect the new
5415 * h_nr_runnable of its group cfs_rq.
5416 * - Subtract its previous weight from cfs_rq->load.weight.
5417 * - For group entity, update its weight to reflect the new share
5418 * of its group cfs_rq.
5419 */
5420 update_load_avg(cfs_rq, se, action);
5421 se_update_runnable(se);
5422
5423 update_stats_dequeue_fair(cfs_rq, se, flags);
5424
5425 update_entity_lag(cfs_rq, se);
5426 if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
5427 se->deadline -= se->vruntime;
5428 se->rel_deadline = 1;
5429 }
5430
5431 if (se != cfs_rq->curr)
5432 __dequeue_entity(cfs_rq, se);
5433 se->on_rq = 0;
5434 account_entity_dequeue(cfs_rq, se);
5435
5436 /* return excess runtime on last dequeue */
5437 return_cfs_rq_runtime(cfs_rq);
5438
5439 update_cfs_group(se);
5440
5441 /*
5442 * Now advance min_vruntime if @se was the entity holding it back,
5443 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
5444 * put back on, and if we advance min_vruntime, we'll be placed back
5445 * further than we started -- i.e. we'll be penalized.
5446 */
5447 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
5448 update_min_vruntime(cfs_rq);
5449
5450 if (flags & DEQUEUE_DELAYED)
5451 finish_delayed_dequeue_entity(se);
5452
5453 if (cfs_rq->nr_queued == 0)
5454 update_idle_cfs_rq_clock_pelt(cfs_rq);
5455
5456 return true;
5457 }
5458
5459 static void
5460 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
5461 {
5462 clear_buddies(cfs_rq, se);
5463
5464 /* 'current' is not kept within the tree. */
5465 if (se->on_rq) {
5466 /*
5467 * Any task has to be enqueued before it gets to execute on
5468 * a CPU. So account for the time it spent waiting on the
5469 * runqueue.
5470 */
5471 update_stats_wait_end_fair(cfs_rq, se);
5472 __dequeue_entity(cfs_rq, se);
5473 update_load_avg(cfs_rq, se, UPDATE_TG);
5474
5475 set_protect_slice(cfs_rq, se);
5476 }
5477
5478 update_stats_curr_start(cfs_rq, se);
5479 WARN_ON_ONCE(cfs_rq->curr);
5480 cfs_rq->curr = se;
5481
5482 /*
5483 * Track our maximum slice length, if the CPU's load is at
5484 * least twice that of our own weight (i.e. don't track it
5485 * when there are only lesser-weight tasks around):
5486 */
5487 if (schedstat_enabled() &&
5488 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
5489 struct sched_statistics *stats;
5490
5491 stats = __schedstats_from_se(se);
5492 __schedstat_set(stats->slice_max,
5493 max((u64)stats->slice_max,
5494 se->sum_exec_runtime - se->prev_sum_exec_runtime));
5495 }
5496
5497 se->prev_sum_exec_runtime = se->sum_exec_runtime;
5498 }
5499
5500 static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
5501
5502 /*
5503 * Pick the next process, keeping these things in mind, in this order:
5504 * 1) keep things fair between processes/task groups
5505 * 2) pick the "next" process, since someone really wants that to run
5506 * 3) pick the "last" process, for cache locality
5507 * 4) do not run the "skip" process, if something else is available
5508 */
5509 static struct sched_entity *
5510 pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
5511 {
5512 struct sched_entity *se;
5513
5514 /*
5515 * Picking the ->next buddy will affect latency but not fairness.
5516 */
5517 if (sched_feat(PICK_BUDDY) &&
5518 cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
5519 /* ->next will never be delayed */
5520 WARN_ON_ONCE(cfs_rq->next->sched_delayed);
5521 return cfs_rq->next;
5522 }
5523
5524 se = pick_eevdf(cfs_rq);
5525 if (se->sched_delayed) {
5526 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
5527 /*
5528 * Must not reference @se again, see __block_task().
5529 */
5530 return NULL;
5531 }
5532 return se;
5533 }
5534
5535 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
5536
5537 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
5538 {
5539 /*
5540 * If still on the runqueue then deactivate_task()
5541 * was not called and update_curr() has to be done:
5542 */
5543 if (prev->on_rq)
5544 update_curr(cfs_rq);
5545
5546 /* throttle cfs_rqs exceeding runtime */
5547 check_cfs_rq_runtime(cfs_rq);
5548
5549 if (prev->on_rq) {
5550 update_stats_wait_start_fair(cfs_rq, prev);
5551 /* Put 'current' back into the tree. */
5552 __enqueue_entity(cfs_rq, prev);
5553 /* in !on_rq case, update occurred at dequeue */
5554 update_load_avg(cfs_rq, prev, 0);
5555 }
5556 WARN_ON_ONCE(cfs_rq->curr != prev);
5557 cfs_rq->curr = NULL;
5558 }
5559
5560 static void
5561 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
5562 {
5563 /*
5564 * Update run-time statistics of the 'current'.
5565 */
5566 update_curr(cfs_rq);
5567
5568 /*
5569 * Ensure that runnable average is periodically updated.
5570 */
5571 update_load_avg(cfs_rq, curr, UPDATE_TG);
5572 update_cfs_group(curr);
5573
5574 #ifdef CONFIG_SCHED_HRTICK
5575 /*
5576 * queued ticks are scheduled to match the slice, so don't bother
5577 * validating it and just reschedule.
5578 */
5579 if (queued) {
5580 resched_curr_lazy(rq_of(cfs_rq));
5581 return;
5582 }
5583 #endif
5584 }
5585
5586
5587 /**************************************************
5588 * CFS bandwidth control machinery
5589 */
5590
5591 #ifdef CONFIG_CFS_BANDWIDTH
5592
5593 #ifdef CONFIG_JUMP_LABEL
5594 static struct static_key __cfs_bandwidth_used;
5595
5596 static inline bool cfs_bandwidth_used(void)
5597 {
5598 return static_key_false(&__cfs_bandwidth_used);
5599 }
5600
5601 void cfs_bandwidth_usage_inc(void)
5602 {
5603 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
5604 }
5605
5606 void cfs_bandwidth_usage_dec(void)
5607 {
5608 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
5609 }
5610 #else /* !CONFIG_JUMP_LABEL: */
5611 static bool cfs_bandwidth_used(void)
5612 {
5613 return true;
5614 }
5615
5616 void cfs_bandwidth_usage_inc(void) {}
5617 void cfs_bandwidth_usage_dec(void) {}
5618 #endif /* !CONFIG_JUMP_LABEL */
5619
5620 static inline u64 sched_cfs_bandwidth_slice(void)
5621 {
5622 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
5623 }
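/*
 * Example (assuming the default sysctl_sched_cfs_bandwidth_slice of
 * 5000us): each per-CPU cfs_rq pulls runtime from the global pool in
 * 5ms chunks, so a group's quota is handed out incrementally rather
 * than grabbed by a single CPU in one go.
 */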
5624
5625 /*
5626 * Replenish runtime according to assigned quota. We use sched_clock_cpu
5627 * directly instead of rq->clock to avoid adding additional synchronization
5628 * around rq->lock.
5629 *
5630 * requires cfs_b->lock
5631 */
5632 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
5633 {
5634 s64 runtime;
5635
5636 if (unlikely(cfs_b->quota == RUNTIME_INF))
5637 return;
5638
5639 cfs_b->runtime += cfs_b->quota;
5640 runtime = cfs_b->runtime_snap - cfs_b->runtime;
5641 if (runtime > 0) {
5642 cfs_b->burst_time += runtime;
5643 cfs_b->nr_burst++;
5644 }
5645
5646 cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
5647 cfs_b->runtime_snap = cfs_b->runtime;
5648 }
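/*
 * Worked example (illustrative numbers): with quota = 10ms and
 * burst = 5ms, consuming 13ms during the previous period leaves
 * runtime_snap - runtime at +3ms after the refill above, so burst_time
 * grows by 3ms and nr_burst is bumped; the pool itself is then capped
 * at quota + burst = 15ms.
 */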
5649
5650 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5651 {
5652 return &tg->cfs_bandwidth;
5653 }
5654
5655 /* returns 0 on failure to allocate runtime */
5656 static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
5657 struct cfs_rq *cfs_rq, u64 target_runtime)
5658 {
5659 u64 min_amount, amount = 0;
5660
5661 lockdep_assert_held(&cfs_b->lock);
5662
5663 /* note: this is a positive sum as runtime_remaining <= 0 */
5664 min_amount = target_runtime - cfs_rq->runtime_remaining;
5665
5666 if (cfs_b->quota == RUNTIME_INF)
5667 amount = min_amount;
5668 else {
5669 start_cfs_bandwidth(cfs_b);
5670
5671 if (cfs_b->runtime > 0) {
5672 amount = min(cfs_b->runtime, min_amount);
5673 cfs_b->runtime -= amount;
5674 cfs_b->idle = 0;
5675 }
5676 }
5677
5678 cfs_rq->runtime_remaining += amount;
5679
5680 return cfs_rq->runtime_remaining > 0;
5681 }
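/*
 * Example of the transfer above (illustrative numbers): a cfs_rq that
 * overran its previous allocation by 0.2ms (runtime_remaining == -0.2ms)
 * asking for a 5ms target needs min_amount = 5.2ms; it is granted
 * min(5.2ms, cfs_b->runtime) and the call succeeds only if that leaves
 * runtime_remaining positive.
 */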
5682
5683 /* returns 0 on failure to allocate runtime */
5684 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5685 {
5686 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5687 int ret;
5688
5689 raw_spin_lock(&cfs_b->lock);
5690 ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
5691 raw_spin_unlock(&cfs_b->lock);
5692
5693 return ret;
5694 }
5695
5696 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5697 {
5698 /* dock delta_exec before expiring quota (as it could span periods) */
5699 cfs_rq->runtime_remaining -= delta_exec;
5700
5701 if (likely(cfs_rq->runtime_remaining > 0))
5702 return;
5703
5704 if (cfs_rq->throttled)
5705 return;
5706 /*
5707 * if we're unable to extend our runtime we resched so that the active
5708 * hierarchy can be throttled
5709 */
5710 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
5711 resched_curr(rq_of(cfs_rq));
5712 }
5713
5714 static __always_inline
5715 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5716 {
5717 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
5718 return;
5719
5720 __account_cfs_rq_runtime(cfs_rq, delta_exec);
5721 }
5722
5723 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5724 {
5725 return cfs_bandwidth_used() && cfs_rq->throttled;
5726 }
5727
5728 /* check whether cfs_rq, or any parent, is throttled */
5729 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5730 {
5731 return cfs_bandwidth_used() && cfs_rq->throttle_count;
5732 }
5733
5734 /*
5735 * Ensure that neither of the group entities corresponding to src_cpu or
5736 * dest_cpu are members of a throttled hierarchy when performing group
5737 * load-balance operations.
5738 */
5739 static inline int throttled_lb_pair(struct task_group *tg,
5740 int src_cpu, int dest_cpu)
5741 {
5742 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
5743
5744 src_cfs_rq = tg->cfs_rq[src_cpu];
5745 dest_cfs_rq = tg->cfs_rq[dest_cpu];
5746
5747 return throttled_hierarchy(src_cfs_rq) ||
5748 throttled_hierarchy(dest_cfs_rq);
5749 }
5750
5751 static int tg_unthrottle_up(struct task_group *tg, void *data)
5752 {
5753 struct rq *rq = data;
5754 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5755
5756 cfs_rq->throttle_count--;
5757 if (!cfs_rq->throttle_count) {
5758 cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
5759 cfs_rq->throttled_clock_pelt;
5760
5761 /* Add cfs_rq with load or one or more already running entities to the list */
5762 if (!cfs_rq_is_decayed(cfs_rq))
5763 list_add_leaf_cfs_rq(cfs_rq);
5764
5765 if (cfs_rq->throttled_clock_self) {
5766 u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
5767
5768 cfs_rq->throttled_clock_self = 0;
5769
5770 if (WARN_ON_ONCE((s64)delta < 0))
5771 delta = 0;
5772
5773 cfs_rq->throttled_clock_self_time += delta;
5774 }
5775 }
5776
5777 return 0;
5778 }
5779
5780 static int tg_throttle_down(struct task_group *tg, void *data)
5781 {
5782 struct rq *rq = data;
5783 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5784
5785 /* group is entering throttled state, stop time */
5786 if (!cfs_rq->throttle_count) {
5787 cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
5788 list_del_leaf_cfs_rq(cfs_rq);
5789
5790 WARN_ON_ONCE(cfs_rq->throttled_clock_self);
5791 if (cfs_rq->nr_queued)
5792 cfs_rq->throttled_clock_self = rq_clock(rq);
5793 }
5794 cfs_rq->throttle_count++;
5795
5796 return 0;
5797 }
5798
5799 static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
5800 {
5801 struct rq *rq = rq_of(cfs_rq);
5802 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5803 struct sched_entity *se;
5804 long queued_delta, runnable_delta, idle_delta, dequeue = 1;
5805
5806 raw_spin_lock(&cfs_b->lock);
5807 /* This will start the period timer if necessary */
5808 if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
5809 /*
5810 * We have raced with bandwidth becoming available, and if we
5811 * actually throttled the timer might not unthrottle us for an
5812 * entire period. We additionally needed to make sure that any
5813 * subsequent check_cfs_rq_runtime calls agree not to throttle
5814 * us, as we may commit to do cfs put_prev+pick_next, so we ask
5815 * for 1ns of runtime rather than just check cfs_b.
5816 */
5817 dequeue = 0;
5818 } else {
5819 list_add_tail_rcu(&cfs_rq->throttled_list,
5820 &cfs_b->throttled_cfs_rq);
5821 }
5822 raw_spin_unlock(&cfs_b->lock);
5823
5824 if (!dequeue)
5825 return false; /* Throttle no longer required. */
5826
5827 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
5828
5829 /* freeze hierarchy runnable averages while throttled */
5830 rcu_read_lock();
5831 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
5832 rcu_read_unlock();
5833
5834 queued_delta = cfs_rq->h_nr_queued;
5835 runnable_delta = cfs_rq->h_nr_runnable;
5836 idle_delta = cfs_rq->h_nr_idle;
5837 for_each_sched_entity(se) {
5838 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5839 int flags;
5840
5841 /* throttled entity or throttle-on-deactivate */
5842 if (!se->on_rq)
5843 goto done;
5844
5845 /*
5846 * Abuse SPECIAL to avoid delayed dequeue in this instance.
5847 * This avoids teaching dequeue_entities() about throttled
5848 * entities and keeps things relatively simple.
5849 */
5850 flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
5851 if (se->sched_delayed)
5852 flags |= DEQUEUE_DELAYED;
5853 dequeue_entity(qcfs_rq, se, flags);
5854
5855 if (cfs_rq_is_idle(group_cfs_rq(se)))
5856 idle_delta = cfs_rq->h_nr_queued;
5857
5858 qcfs_rq->h_nr_queued -= queued_delta;
5859 qcfs_rq->h_nr_runnable -= runnable_delta;
5860 qcfs_rq->h_nr_idle -= idle_delta;
5861
5862 if (qcfs_rq->load.weight) {
5863 /* Avoid re-evaluating load for this entity: */
5864 se = parent_entity(se);
5865 break;
5866 }
5867 }
5868
5869 for_each_sched_entity(se) {
5870 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5871 /* throttled entity or throttle-on-deactivate */
5872 if (!se->on_rq)
5873 goto done;
5874
5875 update_load_avg(qcfs_rq, se, 0);
5876 se_update_runnable(se);
5877
5878 if (cfs_rq_is_idle(group_cfs_rq(se)))
5879 idle_delta = cfs_rq->h_nr_queued;
5880
5881 qcfs_rq->h_nr_queued -= queued_delta;
5882 qcfs_rq->h_nr_runnable -= runnable_delta;
5883 qcfs_rq->h_nr_idle -= idle_delta;
5884 }
5885
5886 /* At this point se is NULL and we are at root level */
5887 sub_nr_running(rq, queued_delta);
5888 done:
5889 /*
5890 * Note: distribution will already see us throttled via the
5891 * throttled-list. rq->lock protects completion.
5892 */
5893 cfs_rq->throttled = 1;
5894 WARN_ON_ONCE(cfs_rq->throttled_clock);
5895 if (cfs_rq->nr_queued)
5896 cfs_rq->throttled_clock = rq_clock(rq);
5897 return true;
5898 }
5899
5900 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
5901 {
5902 struct rq *rq = rq_of(cfs_rq);
5903 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5904 struct sched_entity *se;
5905 long queued_delta, runnable_delta, idle_delta;
5906 long rq_h_nr_queued = rq->cfs.h_nr_queued;
5907
5908 se = cfs_rq->tg->se[cpu_of(rq)];
5909
5910 cfs_rq->throttled = 0;
5911
5912 update_rq_clock(rq);
5913
5914 raw_spin_lock(&cfs_b->lock);
5915 if (cfs_rq->throttled_clock) {
5916 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
5917 cfs_rq->throttled_clock = 0;
5918 }
5919 list_del_rcu(&cfs_rq->throttled_list);
5920 raw_spin_unlock(&cfs_b->lock);
5921
5922 /* update hierarchical throttle state */
5923 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
5924
5925 if (!cfs_rq->load.weight) {
5926 if (!cfs_rq->on_list)
5927 return;
5928 /*
5929 * Nothing to run but something to decay (on_list)?
5930 * Complete the branch.
5931 */
5932 for_each_sched_entity(se) {
5933 if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
5934 break;
5935 }
5936 goto unthrottle_throttle;
5937 }
5938
5939 queued_delta = cfs_rq->h_nr_queued;
5940 runnable_delta = cfs_rq->h_nr_runnable;
5941 idle_delta = cfs_rq->h_nr_idle;
5942 for_each_sched_entity(se) {
5943 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5944
5945 /* Handle any unfinished DELAY_DEQUEUE business first. */
5946 if (se->sched_delayed) {
5947 int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
5948
5949 dequeue_entity(qcfs_rq, se, flags);
5950 } else if (se->on_rq)
5951 break;
5952 enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
5953
5954 if (cfs_rq_is_idle(group_cfs_rq(se)))
5955 idle_delta = cfs_rq->h_nr_queued;
5956
5957 qcfs_rq->h_nr_queued += queued_delta;
5958 qcfs_rq->h_nr_runnable += runnable_delta;
5959 qcfs_rq->h_nr_idle += idle_delta;
5960
5961 /* end evaluation on encountering a throttled cfs_rq */
5962 if (cfs_rq_throttled(qcfs_rq))
5963 goto unthrottle_throttle;
5964 }
5965
5966 for_each_sched_entity(se) {
5967 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5968
5969 update_load_avg(qcfs_rq, se, UPDATE_TG);
5970 se_update_runnable(se);
5971
5972 if (cfs_rq_is_idle(group_cfs_rq(se)))
5973 idle_delta = cfs_rq->h_nr_queued;
5974
5975 qcfs_rq->h_nr_queued += queued_delta;
5976 qcfs_rq->h_nr_runnable += runnable_delta;
5977 qcfs_rq->h_nr_idle += idle_delta;
5978
5979 /* end evaluation on encountering a throttled cfs_rq */
5980 if (cfs_rq_throttled(qcfs_rq))
5981 goto unthrottle_throttle;
5982 }
5983
5984 /* Start the fair server if un-throttling resulted in new runnable tasks */
5985 if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
5986 dl_server_start(&rq->fair_server);
5987
5988 /* At this point se is NULL and we are at root level */
5989 add_nr_running(rq, queued_delta);
5990
5991 unthrottle_throttle:
5992 assert_list_leaf_cfs_rq(rq);
5993
5994 /* Determine whether we need to wake up potentially idle CPU: */
5995 if (rq->curr == rq->idle && rq->cfs.nr_queued)
5996 resched_curr(rq);
5997 }
5998
5999 static void __cfsb_csd_unthrottle(void *arg)
6000 {
6001 struct cfs_rq *cursor, *tmp;
6002 struct rq *rq = arg;
6003 struct rq_flags rf;
6004
6005 rq_lock(rq, &rf);
6006
6007 /*
6008 * Iterating over the list can trigger several calls to
6009 * update_rq_clock() in unthrottle_cfs_rq().
6010 * Do it once and skip the potential next ones.
6011 */
6012 update_rq_clock(rq);
6013 rq_clock_start_loop_update(rq);
6014
6015 /*
6016 * Since we hold rq lock we're safe from concurrent manipulation of
6017 * the CSD list. However, this RCU critical section annotates the
6018 * fact that we pair with sched_free_group_rcu(), so that we cannot
6019 * race with group being freed in the window between removing it
6020 * from the list and advancing to the next entry in the list.
6021 */
6022 rcu_read_lock();
6023
6024 list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
6025 throttled_csd_list) {
6026 list_del_init(&cursor->throttled_csd_list);
6027
6028 if (cfs_rq_throttled(cursor))
6029 unthrottle_cfs_rq(cursor);
6030 }
6031
6032 rcu_read_unlock();
6033
6034 rq_clock_stop_loop_update(rq);
6035 rq_unlock(rq, &rf);
6036 }
6037
6038 static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
6039 {
6040 struct rq *rq = rq_of(cfs_rq);
6041 bool first;
6042
6043 if (rq == this_rq()) {
6044 unthrottle_cfs_rq(cfs_rq);
6045 return;
6046 }
6047
6048 /* Already enqueued */
6049 if (WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_csd_list)))
6050 return;
6051
6052 first = list_empty(&rq->cfsb_csd_list);
6053 list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
6054 if (first)
6055 smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
6056 }
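/*
 * Note on the batching above: only the enqueue that finds the CSD list
 * empty sends the IPI; cfs_rq's queued after that but before the remote
 * CPU runs __cfsb_csd_unthrottle() simply ride along and are all
 * unthrottled in that single callback under the remote rq lock.
 */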
6057
6058 static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
6059 {
6060 lockdep_assert_rq_held(rq_of(cfs_rq));
6061
6062 if (WARN_ON_ONCE(!cfs_rq_throttled(cfs_rq) ||
6063 cfs_rq->runtime_remaining <= 0))
6064 return;
6065
6066 __unthrottle_cfs_rq_async(cfs_rq);
6067 }
6068
6069 static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
6070 {
6071 int this_cpu = smp_processor_id();
6072 u64 runtime, remaining = 1;
6073 bool throttled = false;
6074 struct cfs_rq *cfs_rq, *tmp;
6075 struct rq_flags rf;
6076 struct rq *rq;
6077 LIST_HEAD(local_unthrottle);
6078
6079 rcu_read_lock();
6080 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
6081 throttled_list) {
6082 rq = rq_of(cfs_rq);
6083
6084 if (!remaining) {
6085 throttled = true;
6086 break;
6087 }
6088
6089 rq_lock_irqsave(rq, &rf);
6090 if (!cfs_rq_throttled(cfs_rq))
6091 goto next;
6092
6093 /* Already queued for async unthrottle */
6094 if (!list_empty(&cfs_rq->throttled_csd_list))
6095 goto next;
6096
6097 /* By the above checks, this should never be true */
6098 WARN_ON_ONCE(cfs_rq->runtime_remaining > 0);
6099
6100 raw_spin_lock(&cfs_b->lock);
6101 runtime = -cfs_rq->runtime_remaining + 1;
6102 if (runtime > cfs_b->runtime)
6103 runtime = cfs_b->runtime;
6104 cfs_b->runtime -= runtime;
6105 remaining = cfs_b->runtime;
6106 raw_spin_unlock(&cfs_b->lock);
6107
6108 cfs_rq->runtime_remaining += runtime;
6109
6110 /* we check whether we're throttled above */
6111 if (cfs_rq->runtime_remaining > 0) {
6112 if (cpu_of(rq) != this_cpu) {
6113 unthrottle_cfs_rq_async(cfs_rq);
6114 } else {
6115 /*
6116 * We currently only expect to be unthrottling
6117 * a single cfs_rq locally.
6118 */
6119 WARN_ON_ONCE(!list_empty(&local_unthrottle));
6120 list_add_tail(&cfs_rq->throttled_csd_list,
6121 &local_unthrottle);
6122 }
6123 } else {
6124 throttled = true;
6125 }
6126
6127 next:
6128 rq_unlock_irqrestore(rq, &rf);
6129 }
6130
6131 list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
6132 throttled_csd_list) {
6133 struct rq *rq = rq_of(cfs_rq);
6134
6135 rq_lock_irqsave(rq, &rf);
6136
6137 list_del_init(&cfs_rq->throttled_csd_list);
6138
6139 if (cfs_rq_throttled(cfs_rq))
6140 unthrottle_cfs_rq(cfs_rq);
6141
6142 rq_unlock_irqrestore(rq, &rf);
6143 }
6144 WARN_ON_ONCE(!list_empty(&local_unthrottle));
6145
6146 rcu_read_unlock();
6147
6148 return throttled;
6149 }
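/*
 * Worked example of the "+ 1" above (illustrative numbers): a throttled
 * cfs_rq with runtime_remaining == -300us is handed 300us + 1ns from
 * the pool, so it leaves distribution with exactly 1ns of positive
 * runtime, just enough to unthrottle, and will request a full slice
 * again once it next runs.
 */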
6150
6151 /*
6152 * Responsible for refilling a task_group's bandwidth and unthrottling its
6153 * cfs_rqs as appropriate. If there has been no activity within the last
6154 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
6155 * used to track this state.
6156 */
6157 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
6158 {
6159 int throttled;
6160
6161 /* no need to continue the timer with no bandwidth constraint */
6162 if (cfs_b->quota == RUNTIME_INF)
6163 goto out_deactivate;
6164
6165 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
6166 cfs_b->nr_periods += overrun;
6167
6168 /* Refill extra burst quota even if cfs_b->idle */
6169 __refill_cfs_bandwidth_runtime(cfs_b);
6170
6171 /*
6172 * idle depends on !throttled (for the case of a large deficit), and if
6173 * we're going inactive then everything else can be deferred
6174 */
6175 if (cfs_b->idle && !throttled)
6176 goto out_deactivate;
6177
6178 if (!throttled) {
6179 /* mark as potentially idle for the upcoming period */
6180 cfs_b->idle = 1;
6181 return 0;
6182 }
6183
6184 /* account preceding periods in which throttling occurred */
6185 cfs_b->nr_throttled += overrun;
6186
6187 /*
6188 * This check is repeated as we release cfs_b->lock while we unthrottle.
6189 */
6190 while (throttled && cfs_b->runtime > 0) {
6191 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6192 /* we can't nest cfs_b->lock while distributing bandwidth */
6193 throttled = distribute_cfs_runtime(cfs_b);
6194 raw_spin_lock_irqsave(&cfs_b->lock, flags);
6195 }
6196
6197 /*
6198 * While we are ensured activity in the period following an
6199 * unthrottle, this also covers the case in which the new bandwidth is
6200 * insufficient to cover the existing bandwidth deficit. (Forcing the
6201 * timer to remain active while there are any throttled entities.)
6202 */
6203 cfs_b->idle = 0;
6204
6205 return 0;
6206
6207 out_deactivate:
6208 return 1;
6209 }
6210
6211 /* a cfs_rq won't donate quota below this amount */
6212 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
6213 /* minimum remaining period time to redistribute slack quota */
6214 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
6215 /* how long we wait to gather additional slack before distributing */
6216 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
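/*
 * How these interact (rough sketch): a cfs_rq keeps min_cfs_rq_runtime
 * (1ms) for itself and only donates the excess back; such a donation
 * arms the 5ms slack timer, but only if the next period refresh is more
 * than cfs_bandwidth_slack_period + min_bandwidth_expiration (7ms)
 * away, so slack distribution never races with a refresh that is about
 * to happen anyway.
 */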
6217
6218 /*
6219 * Are we near the end of the current quota period?
6220 *
6221 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
6222 * hrtimer base being cleared by hrtimer_start. In the case of
6223 * migrate_hrtimers, base is never cleared, so we are fine.
6224 */
6225 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
6226 {
6227 struct hrtimer *refresh_timer = &cfs_b->period_timer;
6228 s64 remaining;
6229
6230 /* if the call-back is running a quota refresh is already occurring */
6231 if (hrtimer_callback_running(refresh_timer))
6232 return 1;
6233
6234 /* is a quota refresh about to occur? */
6235 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
6236 if (remaining < (s64)min_expire)
6237 return 1;
6238
6239 return 0;
6240 }
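/*
 * Example (illustrative): called with min_expire = 7ms while the period
 * timer is due to fire in 3ms, this returns 1 and the caller skips
 * arming the slack timer, since the imminent refresh will redistribute
 * runtime anyway.
 */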
6241
6242 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
6243 {
6244 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
6245
6246 /* if there's a quota refresh soon don't bother with slack */
6247 if (runtime_refresh_within(cfs_b, min_left))
6248 return;
6249
6250 /* don't push forwards an existing deferred unthrottle */
6251 if (cfs_b->slack_started)
6252 return;
6253 cfs_b->slack_started = true;
6254
6255 hrtimer_start(&cfs_b->slack_timer,
6256 ns_to_ktime(cfs_bandwidth_slack_period),
6257 HRTIMER_MODE_REL);
6258 }
6259
6260 /* we know any runtime found here is valid as update_curr() precedes return */
6261 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6262 {
6263 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6264 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
6265
6266 if (slack_runtime <= 0)
6267 return;
6268
6269 raw_spin_lock(&cfs_b->lock);
6270 if (cfs_b->quota != RUNTIME_INF) {
6271 cfs_b->runtime += slack_runtime;
6272
6273 /* we are under rq->lock, defer unthrottling using a timer */
6274 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
6275 !list_empty(&cfs_b->throttled_cfs_rq))
6276 start_cfs_slack_bandwidth(cfs_b);
6277 }
6278 raw_spin_unlock(&cfs_b->lock);
6279
6280 /* even if it's not valid for return we don't want to try again */
6281 cfs_rq->runtime_remaining -= slack_runtime;
6282 }
6283
6284 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6285 {
6286 if (!cfs_bandwidth_used())
6287 return;
6288
6289 if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued)
6290 return;
6291
6292 __return_cfs_rq_runtime(cfs_rq);
6293 }
6294
6295 /*
6296 * This is done with a timer (instead of inline with bandwidth return) since
6297 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
6298 */
6299 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
6300 {
6301 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
6302 unsigned long flags;
6303
6304 /* confirm we're still not at a refresh boundary */
6305 raw_spin_lock_irqsave(&cfs_b->lock, flags);
6306 cfs_b->slack_started = false;
6307
6308 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
6309 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6310 return;
6311 }
6312
6313 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
6314 runtime = cfs_b->runtime;
6315
6316 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6317
6318 if (!runtime)
6319 return;
6320
6321 distribute_cfs_runtime(cfs_b);
6322 }
6323
6324 /*
6325 * When a group wakes up we want to make sure that its quota is not already
6326 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
6327 * runtime as update_curr() throttling cannot trigger until it is on-rq.
6328 */
6329 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
6330 {
6331 if (!cfs_bandwidth_used())
6332 return;
6333
6334 /* an active group must be handled by the update_curr()->put() path */
6335 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
6336 return;
6337
6338 /* ensure the group is not already throttled */
6339 if (cfs_rq_throttled(cfs_rq))
6340 return;
6341
6342 /* update runtime allocation */
6343 account_cfs_rq_runtime(cfs_rq, 0);
6344 if (cfs_rq->runtime_remaining <= 0)
6345 throttle_cfs_rq(cfs_rq);
6346 }
6347
6348 static void sync_throttle(struct task_group *tg, int cpu)
6349 {
6350 struct cfs_rq *pcfs_rq, *cfs_rq;
6351
6352 if (!cfs_bandwidth_used())
6353 return;
6354
6355 if (!tg->parent)
6356 return;
6357
6358 cfs_rq = tg->cfs_rq[cpu];
6359 pcfs_rq = tg->parent->cfs_rq[cpu];
6360
6361 cfs_rq->throttle_count = pcfs_rq->throttle_count;
6362 cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
6363 }
6364
6365 /* conditionally throttle active cfs_rq's from put_prev_entity() */
6366 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6367 {
6368 if (!cfs_bandwidth_used())
6369 return false;
6370
6371 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
6372 return false;
6373
6374 /*
6375 * it's possible for a throttled entity to be forced into a running
6376 * state (e.g. set_curr_task); in this case we're finished.
6377 */
6378 if (cfs_rq_throttled(cfs_rq))
6379 return true;
6380
6381 return throttle_cfs_rq(cfs_rq);
6382 }
6383
6384 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
6385 {
6386 struct cfs_bandwidth *cfs_b =
6387 container_of(timer, struct cfs_bandwidth, slack_timer);
6388
6389 do_sched_cfs_slack_timer(cfs_b);
6390
6391 return HRTIMER_NORESTART;
6392 }
6393
6394 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
6395 {
6396 struct cfs_bandwidth *cfs_b =
6397 container_of(timer, struct cfs_bandwidth, period_timer);
6398 unsigned long flags;
6399 int overrun;
6400 int idle = 0;
6401 int count = 0;
6402
6403 raw_spin_lock_irqsave(&cfs_b->lock, flags);
6404 for (;;) {
6405 overrun = hrtimer_forward_now(timer, cfs_b->period);
6406 if (!overrun)
6407 break;
6408
6409 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
6410
6411 if (++count > 3) {
6412 u64 new, old = ktime_to_ns(cfs_b->period);
6413
6414 /*
6415 * Grow period by a factor of 2 to avoid losing precision.
6416 * Precision loss in the quota/period ratio can cause __cfs_schedulable
6417 * to fail.
6418 */
6419 new = old * 2;
6420 if (new < max_bw_quota_period_us * NSEC_PER_USEC) {
6421 cfs_b->period = ns_to_ktime(new);
6422 cfs_b->quota *= 2;
6423 cfs_b->burst *= 2;
6424
6425 pr_warn_ratelimited(
6426 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
6427 smp_processor_id(),
6428 div_u64(new, NSEC_PER_USEC),
6429 div_u64(cfs_b->quota, NSEC_PER_USEC));
6430 } else {
6431 pr_warn_ratelimited(
6432 "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
6433 smp_processor_id(),
6434 div_u64(old, NSEC_PER_USEC),
6435 div_u64(cfs_b->quota, NSEC_PER_USEC));
6436 }
6437
6438 /* reset count so we don't come right back in here */
6439 count = 0;
6440 }
6441 }
6442 if (idle)
6443 cfs_b->period_active = 0;
6444 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6445
6446 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
6447 }
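/*
 * Rough example of the scaling above (illustrative values): a group
 * configured with a 100us period that keeps this timer spinning has
 * period and quota doubled on each trip through the "count > 3" path
 * (100us -> 200us -> 400us ...), preserving the quota/period ratio,
 * until the period would exceed max_bw_quota_period_us.
 */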
6448
6449 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
6450 {
6451 raw_spin_lock_init(&cfs_b->lock);
6452 cfs_b->runtime = 0;
6453 cfs_b->quota = RUNTIME_INF;
6454 cfs_b->period = us_to_ktime(default_bw_period_us());
6455 cfs_b->burst = 0;
6456 cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
6457
6458 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
6459 hrtimer_setup(&cfs_b->period_timer, sched_cfs_period_timer, CLOCK_MONOTONIC,
6460 HRTIMER_MODE_ABS_PINNED);
6461
6462 /* Add a random offset so that timers interleave */
6463 hrtimer_set_expires(&cfs_b->period_timer,
6464 get_random_u32_below(cfs_b->period));
6465 hrtimer_setup(&cfs_b->slack_timer, sched_cfs_slack_timer, CLOCK_MONOTONIC,
6466 HRTIMER_MODE_REL);
6467 cfs_b->slack_started = false;
6468 }
6469
6470 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6471 {
6472 cfs_rq->runtime_enabled = 0;
6473 INIT_LIST_HEAD(&cfs_rq->throttled_list);
6474 INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
6475 }
6476
6477 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
6478 {
6479 lockdep_assert_held(&cfs_b->lock);
6480
6481 if (cfs_b->period_active)
6482 return;
6483
6484 cfs_b->period_active = 1;
6485 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
6486 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
6487 }
6488
6489 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
6490 {
6491 int __maybe_unused i;
6492
6493 /* init_cfs_bandwidth() was not called */
6494 if (!cfs_b->throttled_cfs_rq.next)
6495 return;
6496
6497 hrtimer_cancel(&cfs_b->period_timer);
6498 hrtimer_cancel(&cfs_b->slack_timer);
6499
6500 /*
6501 * It is possible that we still have some cfs_rq's pending on a CSD
6502 * list, though this race is very rare. In order for this to occur, we
6503 * must have raced with the last task leaving the group while there
6504 * exist throttled cfs_rq(s), and the period_timer must have queued the
6505 * CSD item but the remote cpu has not yet processed it. To handle this,
6506 * we can simply flush all pending CSD work inline here. We're
6507 * guaranteed at this point that no additional cfs_rq of this group can
6508 * join a CSD list.
6509 */
6510 for_each_possible_cpu(i) {
6511 struct rq *rq = cpu_rq(i);
6512 unsigned long flags;
6513
6514 if (list_empty(&rq->cfsb_csd_list))
6515 continue;
6516
6517 local_irq_save(flags);
6518 __cfsb_csd_unthrottle(rq);
6519 local_irq_restore(flags);
6520 }
6521 }
6522
6523 /*
6524 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
6525 *
6526 * The race is harmless, since modifying bandwidth settings of unhooked group
6527 * bits doesn't do much.
6528 */
6529
6530 /* cpu online callback */
6531 static void __maybe_unused update_runtime_enabled(struct rq *rq)
6532 {
6533 struct task_group *tg;
6534
6535 lockdep_assert_rq_held(rq);
6536
6537 rcu_read_lock();
6538 list_for_each_entry_rcu(tg, &task_groups, list) {
6539 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6540 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
6541
6542 raw_spin_lock(&cfs_b->lock);
6543 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
6544 raw_spin_unlock(&cfs_b->lock);
6545 }
6546 rcu_read_unlock();
6547 }
6548
6549 /* cpu offline callback */
6550 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
6551 {
6552 struct task_group *tg;
6553
6554 lockdep_assert_rq_held(rq);
6555
6556 // Do not unthrottle for an active CPU
6557 if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask))
6558 return;
6559
6560 /*
6561 * The rq clock has already been updated in the
6562 * set_rq_offline(), so we should skip updating
6563 * the rq clock again in unthrottle_cfs_rq().
6564 */
6565 rq_clock_start_loop_update(rq);
6566
6567 rcu_read_lock();
6568 list_for_each_entry_rcu(tg, &task_groups, list) {
6569 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
6570
6571 if (!cfs_rq->runtime_enabled)
6572 continue;
6573
6574 /*
6575 * Offline rq is schedulable till CPU is completely disabled
6576 * in take_cpu_down(), so we prevent new cfs throttling here.
6577 */
6578 cfs_rq->runtime_enabled = 0;
6579
6580 if (!cfs_rq_throttled(cfs_rq))
6581 continue;
6582
6583 /*
6584 * clock_task is not advancing so we just need to make sure
6585 * there's some valid quota amount
6586 */
6587 cfs_rq->runtime_remaining = 1;
6588 unthrottle_cfs_rq(cfs_rq);
6589 }
6590 rcu_read_unlock();
6591
6592 rq_clock_stop_loop_update(rq);
6593 }
6594
6595 bool cfs_task_bw_constrained(struct task_struct *p)
6596 {
6597 struct cfs_rq *cfs_rq = task_cfs_rq(p);
6598
6599 if (!cfs_bandwidth_used())
6600 return false;
6601
6602 if (cfs_rq->runtime_enabled ||
6603 tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
6604 return true;
6605
6606 return false;
6607 }
6608
6609 #ifdef CONFIG_NO_HZ_FULL
6610 /* called from pick_next_task_fair() */
6611 static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
6612 {
6613 int cpu = cpu_of(rq);
6614
6615 if (!cfs_bandwidth_used())
6616 return;
6617
6618 if (!tick_nohz_full_cpu(cpu))
6619 return;
6620
6621 if (rq->nr_running != 1)
6622 return;
6623
6624 /*
6625 * We know there is only one task runnable and we've just picked it. The
6626 * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will
6627 * be otherwise able to stop the tick. Just need to check if we are using
6628 * bandwidth control.
6629 */
6630 if (cfs_task_bw_constrained(p))
6631 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
6632 }
6633 #endif /* CONFIG_NO_HZ_FULL */
6634
6635 #else /* !CONFIG_CFS_BANDWIDTH: */
6636
6637 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
6638 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
6639 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
6640 static inline void sync_throttle(struct task_group *tg, int cpu) {}
6641 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
6642
6643 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
6644 {
6645 return 0;
6646 }
6647
6648 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
6649 {
6650 return 0;
6651 }
6652
6653 static inline int throttled_lb_pair(struct task_group *tg,
6654 int src_cpu, int dest_cpu)
6655 {
6656 return 0;
6657 }
6658
6659 #ifdef CONFIG_FAIR_GROUP_SCHED
6660 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
6661 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
6662 #endif
6663
6664 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
6665 {
6666 return NULL;
6667 }
6668 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
6669 static inline void update_runtime_enabled(struct rq *rq) {}
6670 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6671 #ifdef CONFIG_CGROUP_SCHED
6672 bool cfs_task_bw_constrained(struct task_struct *p)
6673 {
6674 return false;
6675 }
6676 #endif
6677 #endif /* !CONFIG_CFS_BANDWIDTH */
6678
6679 #if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
6680 static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
6681 #endif
6682
6683 /**************************************************
6684 * CFS operations on tasks:
6685 */
6686
6687 #ifdef CONFIG_SCHED_HRTICK
6688 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
6689 {
6690 struct sched_entity *se = &p->se;
6691
6692 WARN_ON_ONCE(task_rq(p) != rq);
6693
6694 if (rq->cfs.h_nr_queued > 1) {
6695 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
6696 u64 slice = se->slice;
6697 s64 delta = slice - ran;
6698
6699 if (delta < 0) {
6700 if (task_current_donor(rq, p))
6701 resched_curr(rq);
6702 return;
6703 }
6704 hrtick_start(rq, delta);
6705 }
6706 }
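/*
 * Example (assuming a 0.7ms slice): if the current fair task has
 * already run 0.5ms of it, the hrtimer above is programmed 0.2ms out so
 * the tick lands right at slice expiry; if the slice has already been
 * overrun, we simply reschedule instead of arming a timer in the past.
 */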
6707
6708 /*
6709 * called from enqueue/dequeue and updates the hrtick when the
6710 * current task is from our class and nr_running is low enough
6711 * to matter.
6712 */
6713 static void hrtick_update(struct rq *rq)
6714 {
6715 struct task_struct *donor = rq->donor;
6716
6717 if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class)
6718 return;
6719
6720 hrtick_start_fair(rq, donor);
6721 }
6722 #else /* !CONFIG_SCHED_HRTICK: */
6723 static inline void
6724 hrtick_start_fair(struct rq *rq, struct task_struct *p)
6725 {
6726 }
6727
6728 static inline void hrtick_update(struct rq *rq)
6729 {
6730 }
6731 #endif /* !CONFIG_SCHED_HRTICK */
6732
6733 static inline bool cpu_overutilized(int cpu)
6734 {
6735 unsigned long rq_util_min, rq_util_max;
6736
6737 if (!sched_energy_enabled())
6738 return false;
6739
6740 rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
6741 rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
6742
6743 /* Return true only if the utilization doesn't fit CPU's capacity */
6744 return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
6745 }
6746
6747 /*
6748 * The overutilized value makes sense only if EAS is enabled.
6749 */
6750 static inline bool is_rd_overutilized(struct root_domain *rd)
6751 {
6752 return !sched_energy_enabled() || READ_ONCE(rd->overutilized);
6753 }
6754
6755 static inline void set_rd_overutilized(struct root_domain *rd, bool flag)
6756 {
6757 if (!sched_energy_enabled())
6758 return;
6759
6760 WRITE_ONCE(rd->overutilized, flag);
6761 trace_sched_overutilized_tp(rd, flag);
6762 }
6763
6764 static inline void check_update_overutilized_status(struct rq *rq)
6765 {
6766 /*
6767 * The overutilized field is used for load-balancing decisions only
6768 * if the energy-aware scheduler is in use.
6769 */
6770
6771 if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
6772 set_rd_overutilized(rq->rd, 1);
6773 }
6774
6775 /* Runqueue only has SCHED_IDLE tasks enqueued */
6776 static int sched_idle_rq(struct rq *rq)
6777 {
6778 return unlikely(rq->nr_running == rq->cfs.h_nr_idle &&
6779 rq->nr_running);
6780 }
6781
6782 static int sched_idle_cpu(int cpu)
6783 {
6784 return sched_idle_rq(cpu_rq(cpu));
6785 }
6786
6787 static void
6788 requeue_delayed_entity(struct sched_entity *se)
6789 {
6790 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6791
6792 /*
6793 * se->sched_delayed should imply: se->on_rq == 1.
6794 * Because a delayed entity is one that is still on
6795 * the runqueue competing until eligibility.
6796 */
6797 WARN_ON_ONCE(!se->sched_delayed);
6798 WARN_ON_ONCE(!se->on_rq);
6799
6800 if (sched_feat(DELAY_ZERO)) {
6801 update_entity_lag(cfs_rq, se);
6802 if (se->vlag > 0) {
6803 cfs_rq->nr_queued--;
6804 if (se != cfs_rq->curr)
6805 __dequeue_entity(cfs_rq, se);
6806 se->vlag = 0;
6807 place_entity(cfs_rq, se, 0);
6808 if (se != cfs_rq->curr)
6809 __enqueue_entity(cfs_rq, se);
6810 cfs_rq->nr_queued++;
6811 }
6812 }
6813
6814 update_load_avg(cfs_rq, se, 0);
6815 clear_delayed(se);
6816 }
6817
6818 /*
6819 * The enqueue_task method is called before nr_running is
6820 * increased. Here we update the fair scheduling stats and
6821 * then put the task into the rbtree:
6822 */
6823 static void
6824 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
6825 {
6826 struct cfs_rq *cfs_rq;
6827 struct sched_entity *se = &p->se;
6828 int h_nr_idle = task_has_idle_policy(p);
6829 int h_nr_runnable = 1;
6830 int task_new = !(flags & ENQUEUE_WAKEUP);
6831 int rq_h_nr_queued = rq->cfs.h_nr_queued;
6832 u64 slice = 0;
6833
6834 /*
6835 * The code below (indirectly) updates schedutil which looks at
6836 * the cfs_rq utilization to select a frequency.
6837 * Let's add the task's estimated utilization to the cfs_rq's
6838 * estimated utilization, before we update schedutil.
6839 */
6840 if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED))
6841 util_est_enqueue(&rq->cfs, p);
6842
6843 if (flags & ENQUEUE_DELAYED) {
6844 requeue_delayed_entity(se);
6845 return;
6846 }
6847
6848 /*
6849 * If in_iowait is set, the code below may not trigger any cpufreq
6850 * utilization updates, so do it here explicitly with the IOWAIT flag
6851 * passed.
6852 */
6853 if (p->in_iowait)
6854 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
6855
6856 if (task_new && se->sched_delayed)
6857 h_nr_runnable = 0;
6858
6859 for_each_sched_entity(se) {
6860 if (se->on_rq) {
6861 if (se->sched_delayed)
6862 requeue_delayed_entity(se);
6863 break;
6864 }
6865 cfs_rq = cfs_rq_of(se);
6866
6867 /*
6868 * Basically set the slice of group entries to the min_slice of
6869 * their respective cfs_rq. This ensures the group can service
6870 * its entities in the desired time-frame.
6871 */
6872 if (slice) {
6873 se->slice = slice;
6874 se->custom_slice = 1;
6875 }
6876 enqueue_entity(cfs_rq, se, flags);
6877 slice = cfs_rq_min_slice(cfs_rq);
6878
6879 cfs_rq->h_nr_runnable += h_nr_runnable;
6880 cfs_rq->h_nr_queued++;
6881 cfs_rq->h_nr_idle += h_nr_idle;
6882
6883 if (cfs_rq_is_idle(cfs_rq))
6884 h_nr_idle = 1;
6885
6886 /* end evaluation on encountering a throttled cfs_rq */
6887 if (cfs_rq_throttled(cfs_rq))
6888 goto enqueue_throttle;
6889
6890 flags = ENQUEUE_WAKEUP;
6891 }
6892
6893 for_each_sched_entity(se) {
6894 cfs_rq = cfs_rq_of(se);
6895
6896 update_load_avg(cfs_rq, se, UPDATE_TG);
6897 se_update_runnable(se);
6898 update_cfs_group(se);
6899
6900 se->slice = slice;
6901 if (se != cfs_rq->curr)
6902 min_vruntime_cb_propagate(&se->run_node, NULL);
6903 slice = cfs_rq_min_slice(cfs_rq);
6904
6905 cfs_rq->h_nr_runnable += h_nr_runnable;
6906 cfs_rq->h_nr_queued++;
6907 cfs_rq->h_nr_idle += h_nr_idle;
6908
6909 if (cfs_rq_is_idle(cfs_rq))
6910 h_nr_idle = 1;
6911
6912 /* end evaluation on encountering a throttled cfs_rq */
6913 if (cfs_rq_throttled(cfs_rq))
6914 goto enqueue_throttle;
6915 }
6916
6917 if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
6918 /* Account for idle runtime */
6919 if (!rq->nr_running)
6920 dl_server_update_idle_time(rq, rq->curr);
6921 dl_server_start(&rq->fair_server);
6922 }
6923
6924 /* At this point se is NULL and we are at root level */
6925 add_nr_running(rq, 1);
6926
6927 /*
6928 * Since new tasks are assigned an initial util_avg equal to
6929 * half of the spare capacity of their CPU, tiny tasks have the
6930 * ability to cross the overutilized threshold, which will
6931 * result in the load balancer ruining all the task placement
6932 * done by EAS. As a way to mitigate that effect, do not account
6933 * for the first enqueue operation of new tasks during the
6934 * overutilized flag detection.
6935 *
6936 * A better way of solving this problem would be to wait for
6937 * the PELT signals of tasks to converge before taking them
6938 * into account, but that is not straightforward to implement,
6939 * and the following generally works well enough in practice.
6940 */
6941 if (!task_new)
6942 check_update_overutilized_status(rq);
6943
6944 enqueue_throttle:
6945 assert_list_leaf_cfs_rq(rq);
6946
6947 hrtick_update(rq);
6948 }
6949
6950 static void set_next_buddy(struct sched_entity *se);
6951
6952 /*
6953 * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
6954 * failing half-way through and resume the dequeue later.
6955 *
6956 * Returns:
6957 * -1 - dequeue delayed
6958 * 0 - dequeue throttled
6959 * 1 - dequeue complete
6960 */
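/*
 * As used by dequeue_task_fair() below, -1 means the task stays enqueued as a
 * delayed dequeue and the dequeue is reported as unsuccessful, while 0
 * (stopped at a throttled cfs_rq) and 1 (fully dequeued) are both treated as
 * a completed dequeue.
 */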
6961 static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
6962 {
6963 bool was_sched_idle = sched_idle_rq(rq);
6964 bool task_sleep = flags & DEQUEUE_SLEEP;
6965 bool task_delayed = flags & DEQUEUE_DELAYED;
6966 struct task_struct *p = NULL;
6967 int h_nr_idle = 0;
6968 int h_nr_queued = 0;
6969 int h_nr_runnable = 0;
6970 struct cfs_rq *cfs_rq;
6971 u64 slice = 0;
6972
6973 if (entity_is_task(se)) {
6974 p = task_of(se);
6975 h_nr_queued = 1;
6976 h_nr_idle = task_has_idle_policy(p);
6977 if (task_sleep || task_delayed || !se->sched_delayed)
6978 h_nr_runnable = 1;
6979 }
6980
6981 for_each_sched_entity(se) {
6982 cfs_rq = cfs_rq_of(se);
6983
6984 if (!dequeue_entity(cfs_rq, se, flags)) {
6985 if (p && &p->se == se)
6986 return -1;
6987
6988 slice = cfs_rq_min_slice(cfs_rq);
6989 break;
6990 }
6991
6992 cfs_rq->h_nr_runnable -= h_nr_runnable;
6993 cfs_rq->h_nr_queued -= h_nr_queued;
6994 cfs_rq->h_nr_idle -= h_nr_idle;
6995
6996 if (cfs_rq_is_idle(cfs_rq))
6997 h_nr_idle = h_nr_queued;
6998
6999 /* end evaluation on encountering a throttled cfs_rq */
7000 if (cfs_rq_throttled(cfs_rq))
7001 return 0;
7002
7003 /* Don't dequeue parent if it has other entities besides us */
7004 if (cfs_rq->load.weight) {
7005 slice = cfs_rq_min_slice(cfs_rq);
7006
7007 /* Avoid re-evaluating load for this entity: */
7008 se = parent_entity(se);
7009 /*
7010 * Bias pick_next to pick a task from this cfs_rq, as
7011 * p is sleeping when it is within its sched_slice.
7012 */
7013 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
7014 set_next_buddy(se);
7015 break;
7016 }
7017 flags |= DEQUEUE_SLEEP;
7018 flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
7019 }
7020
7021 for_each_sched_entity(se) {
7022 cfs_rq = cfs_rq_of(se);
7023
7024 update_load_avg(cfs_rq, se, UPDATE_TG);
7025 se_update_runnable(se);
7026 update_cfs_group(se);
7027
7028 se->slice = slice;
7029 if (se != cfs_rq->curr)
7030 min_vruntime_cb_propagate(&se->run_node, NULL);
7031 slice = cfs_rq_min_slice(cfs_rq);
7032
7033 cfs_rq->h_nr_runnable -= h_nr_runnable;
7034 cfs_rq->h_nr_queued -= h_nr_queued;
7035 cfs_rq->h_nr_idle -= h_nr_idle;
7036
7037 if (cfs_rq_is_idle(cfs_rq))
7038 h_nr_idle = h_nr_queued;
7039
7040 /* end evaluation on encountering a throttled cfs_rq */
7041 if (cfs_rq_throttled(cfs_rq))
7042 return 0;
7043 }
7044
7045 sub_nr_running(rq, h_nr_queued);
7046
7047 /* balance early to pull high priority tasks */
7048 if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
7049 rq->next_balance = jiffies;
7050
7051 if (p && task_delayed) {
7052 WARN_ON_ONCE(!task_sleep);
7053 WARN_ON_ONCE(p->on_rq != 1);
7054
7055 /* Fix-up what dequeue_task_fair() skipped */
7056 hrtick_update(rq);
7057
7058 /*
7059 * Fix-up what block_task() skipped.
7060 *
7061 * Must be last, @p might not be valid after this.
7062 */
7063 __block_task(rq, p);
7064 }
7065
7066 return 1;
7067 }
7068
7069 /*
7070 * The dequeue_task method is called before nr_running is
7071 * decreased. We remove the task from the rbtree and
7072 * update the fair scheduling stats:
7073 */
7074 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
7075 {
7076 if (!p->se.sched_delayed)
7077 util_est_dequeue(&rq->cfs, p);
7078
7079 util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
7080 if (dequeue_entities(rq, &p->se, flags) < 0)
7081 return false;
7082
7083 /*
7084 * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
7085 */
7086
7087 hrtick_update(rq);
7088 return true;
7089 }
7090
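/*
 * Number of queued CFS tasks on @rq that are in the "delayed dequeue" state,
 * i.e. still accounted in h_nr_queued but no longer in h_nr_runnable.
 */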
7091 static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
7092 {
7093 return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable);
7094 }
7095
7096 /* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
7097 static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
7098 static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
7099 static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
7100
7101 #ifdef CONFIG_NO_HZ_COMMON
7102
7103 static struct {
7104 cpumask_var_t idle_cpus_mask;
7105 atomic_t nr_cpus;
7106 	int has_blocked;		/* Idle CPUs have blocked load */
7107 int needs_update; /* Newly idle CPUs need their next_balance collated */
7108 unsigned long next_balance; /* in jiffy units */
7109 unsigned long next_blocked; /* Next update of blocked load in jiffies */
7110 } nohz ____cacheline_aligned;
7111
7112 #endif /* CONFIG_NO_HZ_COMMON */
7113
7114 static unsigned long cpu_load(struct rq *rq)
7115 {
7116 return cfs_rq_load_avg(&rq->cfs);
7117 }
7118
7119 /*
7120 * cpu_load_without - compute CPU load without any contributions from *p
7121  * @cpu: the CPU whose load is requested
7122  * @p: the task whose load should be discounted
7123 *
7124 * The load of a CPU is defined by the load of tasks currently enqueued on that
7125 * CPU as well as tasks which are currently sleeping after an execution on that
7126 * CPU.
7127 *
7128 * This method returns the load of the specified CPU by discounting the load of
7129 * the specified task, whenever the task is currently contributing to the CPU
7130 * load.
7131 */
7132 static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
7133 {
7134 struct cfs_rq *cfs_rq;
7135 unsigned int load;
7136
7137 /* Task has no contribution or is new */
7138 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
7139 return cpu_load(rq);
7140
7141 cfs_rq = &rq->cfs;
7142 load = READ_ONCE(cfs_rq->avg.load_avg);
7143
7144 	/* Discount task's load from CPU's load */
7145 lsub_positive(&load, task_h_load(p));
7146
7147 return load;
7148 }
7149
7150 static unsigned long cpu_runnable(struct rq *rq)
7151 {
7152 return cfs_rq_runnable_avg(&rq->cfs);
7153 }
7154
7155 static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
7156 {
7157 struct cfs_rq *cfs_rq;
7158 unsigned int runnable;
7159
7160 /* Task has no contribution or is new */
7161 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
7162 return cpu_runnable(rq);
7163
7164 cfs_rq = &rq->cfs;
7165 runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
7166
7167 /* Discount task's runnable from CPU's runnable */
7168 lsub_positive(&runnable, p->se.avg.runnable_avg);
7169
7170 return runnable;
7171 }
7172
7173 static unsigned long capacity_of(int cpu)
7174 {
7175 return cpu_rq(cpu)->cpu_capacity;
7176 }
7177
7178 static void record_wakee(struct task_struct *p)
7179 {
7180 /*
7181 	 * Only decay a single time; tasks that have fewer than one wakeup per
7182 * jiffy will not have built up many flips.
7183 */
7184 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
7185 current->wakee_flips >>= 1;
7186 current->wakee_flip_decay_ts = jiffies;
7187 }
7188
7189 if (current->last_wakee != p) {
7190 current->last_wakee = p;
7191 current->wakee_flips++;
7192 }
7193 }
7194
7195 /*
7196 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
7197 *
7198 * A waker of many should wake a different task than the one last awakened
7199 * at a frequency roughly N times higher than one of its wakees.
7200 *
7201  * In order to determine whether we should let the load spread vs consolidate
7202  * onto shared cache, we look for a minimum 'flip' frequency of llc_size in one
7203  * partner, and a factor of llc_size higher frequency in the other.
7204 *
7205 * With both conditions met, we can be relatively sure that the relationship is
7206 * non-monogamous, with partner count exceeding socket size.
7207 *
7208  * Whether waker/wakee are client/server, worker/dispatcher, interrupt source or
7209  * whatever is irrelevant; the spread criterion is simply that the apparent
7210  * partner count exceeds the socket size.
7211 */
7212 static int wake_wide(struct task_struct *p)
7213 {
7214 unsigned int master = current->wakee_flips;
7215 unsigned int slave = p->wakee_flips;
7216 int factor = __this_cpu_read(sd_llc_size);
7217
7218 if (master < slave)
7219 swap(master, slave);
7220 if (slave < factor || master < slave * factor)
7221 return 0;
7222 return 1;
7223 }
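
/*
 * Illustrative example (hypothetical numbers): with sd_llc_size = 8, a waker
 * with wakee_flips = 80 waking a task with wakee_flips = 9 gives master = 80,
 * slave = 9. Since slave >= factor (9 >= 8) and master >= slave * factor
 * (80 >= 72), wake_wide() returns 1 and the wakeup is allowed to spread
 * rather than being consolidated onto the waker's LLC.
 */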
7224
7225 /*
7226 * The purpose of wake_affine() is to quickly determine on which CPU we can run
7227 * soonest. For the purpose of speed we only consider the waking and previous
7228 * CPU.
7229 *
7230  * wake_affine_idle() - only considers 'now'; it checks whether the waking CPU is
7231 * cache-affine and is (or will be) idle.
7232 *
7233 * wake_affine_weight() - considers the weight to reflect the average
7234 * scheduling latency of the CPUs. This seems to work
7235 * for the overloaded case.
7236 */
7237 static int
7238 wake_affine_idle(int this_cpu, int prev_cpu, int sync)
7239 {
7240 /*
7241 * If this_cpu is idle, it implies the wakeup is from interrupt
7242 * context. Only allow the move if cache is shared. Otherwise an
7243 * interrupt intensive workload could force all tasks onto one
7244 * node depending on the IO topology or IRQ affinity settings.
7245 *
7246 * If the prev_cpu is idle and cache affine then avoid a migration.
7247 * There is no guarantee that the cache hot data from an interrupt
7248 * is more important than cache hot data on the prev_cpu and from
7249 * a cpufreq perspective, it's better to have higher utilisation
7250 * on one CPU.
7251 */
7252 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
7253 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
7254
7255 if (sync) {
7256 struct rq *rq = cpu_rq(this_cpu);
7257
7258 if ((rq->nr_running - cfs_h_nr_delayed(rq)) == 1)
7259 return this_cpu;
7260 }
7261
7262 if (available_idle_cpu(prev_cpu))
7263 return prev_cpu;
7264
7265 return nr_cpumask_bits;
7266 }
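
/*
 * nr_cpumask_bits acts as a "no decision" sentinel here; wake_affine() falls
 * back to wake_affine_weight() when it sees that value.
 */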
7267
7268 static int
7269 wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
7270 int this_cpu, int prev_cpu, int sync)
7271 {
7272 s64 this_eff_load, prev_eff_load;
7273 unsigned long task_load;
7274
7275 this_eff_load = cpu_load(cpu_rq(this_cpu));
7276
7277 if (sync) {
7278 unsigned long current_load = task_h_load(current);
7279
7280 if (current_load > this_eff_load)
7281 return this_cpu;
7282
7283 this_eff_load -= current_load;
7284 }
7285
7286 task_load = task_h_load(p);
7287
7288 this_eff_load += task_load;
7289 if (sched_feat(WA_BIAS))
7290 this_eff_load *= 100;
7291 this_eff_load *= capacity_of(prev_cpu);
7292
7293 prev_eff_load = cpu_load(cpu_rq(prev_cpu));
7294 prev_eff_load -= task_load;
7295 if (sched_feat(WA_BIAS))
7296 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
7297 prev_eff_load *= capacity_of(this_cpu);
7298
7299 /*
7300 * If sync, adjust the weight of prev_eff_load such that if
7301 * prev_eff == this_eff that select_idle_sibling() will consider
7302 * stacking the wakee on top of the waker if no other CPU is
7303 * idle.
7304 */
7305 if (sync)
7306 prev_eff_load += 1;
7307
7308 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
7309 }
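
/*
 * Illustrative restatement of the comparison above (capacities are
 * cross-multiplied so no division is needed):
 *
 *   this_eff_load = load(this_cpu) - h_load(current) (if sync) + h_load(p)
 *   prev_eff_load = load(prev_cpu) - h_load(p)
 *
 *   affine if:  this_eff_load * 100 * capacity(prev_cpu) <
 *               prev_eff_load * (100 + (imbalance_pct - 100) / 2) * capacity(this_cpu)
 *
 * The "* 100" scaling only applies with the WA_BIAS feature enabled, and a
 * sync wakeup adds 1 to prev_eff_load as a tie-breaker.
 */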
7310
7311 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
7312 int this_cpu, int prev_cpu, int sync)
7313 {
7314 int target = nr_cpumask_bits;
7315
7316 if (sched_feat(WA_IDLE))
7317 target = wake_affine_idle(this_cpu, prev_cpu, sync);
7318
7319 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
7320 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
7321
7322 schedstat_inc(p->stats.nr_wakeups_affine_attempts);
7323 if (target != this_cpu)
7324 return prev_cpu;
7325
7326 schedstat_inc(sd->ttwu_move_affine);
7327 schedstat_inc(p->stats.nr_wakeups_affine);
7328 return target;
7329 }
7330
7331 static struct sched_group *
7332 sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
7333
7334 /*
7335 * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group.
7336 */
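/*
 * Preference order, as implemented below: a SCHED_IDLE CPU is returned
 * immediately; otherwise prefer the idle CPU with the shallowest idle state
 * (smallest exit latency), breaking ties in favor of the most recently idled
 * CPU (likely warmer cache); if no CPU is idle, fall back to the least loaded
 * one.
 */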
7337 static int
7338 sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
7339 {
7340 unsigned long load, min_load = ULONG_MAX;
7341 unsigned int min_exit_latency = UINT_MAX;
7342 u64 latest_idle_timestamp = 0;
7343 int least_loaded_cpu = this_cpu;
7344 int shallowest_idle_cpu = -1;
7345 int i;
7346
7347 /* Check if we have any choice: */
7348 if (group->group_weight == 1)
7349 return cpumask_first(sched_group_span(group));
7350
7351 /* Traverse only the allowed CPUs */
7352 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
7353 struct rq *rq = cpu_rq(i);
7354
7355 if (!sched_core_cookie_match(rq, p))
7356 continue;
7357
7358 if (sched_idle_cpu(i))
7359 return i;
7360
7361 if (available_idle_cpu(i)) {
7362 struct cpuidle_state *idle = idle_get_state(rq);
7363 if (idle && idle->exit_latency < min_exit_latency) {
7364 /*
7365 * We give priority to a CPU whose idle state
7366 * has the smallest exit latency irrespective
7367 * of any idle timestamp.
7368 */
7369 min_exit_latency = idle->exit_latency;
7370 latest_idle_timestamp = rq->idle_stamp;
7371 shallowest_idle_cpu = i;
7372 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
7373 rq->idle_stamp > latest_idle_timestamp) {
7374 /*
7375 * If equal or no active idle state, then
7376 * the most recently idled CPU might have
7377 * a warmer cache.
7378 */
7379 latest_idle_timestamp = rq->idle_stamp;
7380 shallowest_idle_cpu = i;
7381 }
7382 } else if (shallowest_idle_cpu == -1) {
7383 load = cpu_load(cpu_rq(i));
7384 if (load < min_load) {
7385 min_load = load;
7386 least_loaded_cpu = i;
7387 }
7388 }
7389 }
7390
7391 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
7392 }
7393
7394 static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p,
7395 int cpu, int prev_cpu, int sd_flag)
7396 {
7397 int new_cpu = cpu;
7398
7399 if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
7400 return prev_cpu;
7401
7402 /*
7403 * We need task's util for cpu_util_without, sync it up to
7404 * prev_cpu's last_update_time.
7405 */
7406 if (!(sd_flag & SD_BALANCE_FORK))
7407 sync_entity_load_avg(&p->se);
7408
7409 while (sd) {
7410 struct sched_group *group;
7411 struct sched_domain *tmp;
7412 int weight;
7413
7414 if (!(sd->flags & sd_flag)) {
7415 sd = sd->child;
7416 continue;
7417 }
7418
7419 group = sched_balance_find_dst_group(sd, p, cpu);
7420 if (!group) {
7421 sd = sd->child;
7422 continue;
7423 }
7424
7425 new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu);
7426 if (new_cpu == cpu) {
7427 /* Now try balancing at a lower domain level of 'cpu': */
7428 sd = sd->child;
7429 continue;
7430 }
7431
7432 /* Now try balancing at a lower domain level of 'new_cpu': */
7433 cpu = new_cpu;
7434 weight = sd->span_weight;
7435 sd = NULL;
7436 for_each_domain(cpu, tmp) {
7437 if (weight <= tmp->span_weight)
7438 break;
7439 if (tmp->flags & sd_flag)
7440 sd = tmp;
7441 }
7442 }
7443
7444 return new_cpu;
7445 }
7446
7447 static inline int __select_idle_cpu(int cpu, struct task_struct *p)
7448 {
7449 if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
7450 sched_cpu_cookie_match(cpu_rq(cpu), p))
7451 return cpu;
7452
7453 return -1;
7454 }
7455
7456 #ifdef CONFIG_SCHED_SMT
7457 DEFINE_STATIC_KEY_FALSE(sched_smt_present);
7458 EXPORT_SYMBOL_GPL(sched_smt_present);
7459
7460 static inline void set_idle_cores(int cpu, int val)
7461 {
7462 struct sched_domain_shared *sds;
7463
7464 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
7465 if (sds)
7466 WRITE_ONCE(sds->has_idle_cores, val);
7467 }
7468
7469 static inline bool test_idle_cores(int cpu)
7470 {
7471 struct sched_domain_shared *sds;
7472
7473 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
7474 if (sds)
7475 return READ_ONCE(sds->has_idle_cores);
7476
7477 return false;
7478 }
7479
7480 /*
7481 * Scans the local SMT mask to see if the entire core is idle, and records this
7482 * information in sd_llc_shared->has_idle_cores.
7483 *
7484 * Since SMT siblings share all cache levels, inspecting this limited remote
7485 * state should be fairly cheap.
7486 */
7487 void __update_idle_core(struct rq *rq)
7488 {
7489 int core = cpu_of(rq);
7490 int cpu;
7491
7492 rcu_read_lock();
7493 if (test_idle_cores(core))
7494 goto unlock;
7495
7496 for_each_cpu(cpu, cpu_smt_mask(core)) {
7497 if (cpu == core)
7498 continue;
7499
7500 if (!available_idle_cpu(cpu))
7501 goto unlock;
7502 }
7503
7504 set_idle_cores(core, 1);
7505 unlock:
7506 rcu_read_unlock();
7507 }
7508
7509 /*
7510 * Scan the entire LLC domain for idle cores; this dynamically switches off if
7511 * there are no idle cores left in the system; tracked through
7512 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
7513 */
7514 static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
7515 {
7516 bool idle = true;
7517 int cpu;
7518
7519 for_each_cpu(cpu, cpu_smt_mask(core)) {
7520 if (!available_idle_cpu(cpu)) {
7521 idle = false;
7522 if (*idle_cpu == -1) {
7523 if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
7524 *idle_cpu = cpu;
7525 break;
7526 }
7527 continue;
7528 }
7529 break;
7530 }
7531 if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus))
7532 *idle_cpu = cpu;
7533 }
7534
7535 if (idle)
7536 return core;
7537
7538 cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
7539 return -1;
7540 }
7541
7542 /*
7543 * Scan the local SMT mask for idle CPUs.
7544 */
7545 static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
7546 {
7547 int cpu;
7548
7549 for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
7550 if (cpu == target)
7551 continue;
7552 /*
7553 * Check if the CPU is in the LLC scheduling domain of @target.
7554 * Due to isolcpus, there is no guarantee that all the siblings are in the domain.
7555 */
7556 if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
7557 continue;
7558 if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
7559 return cpu;
7560 }
7561
7562 return -1;
7563 }
7564
7565 #else /* !CONFIG_SCHED_SMT: */
7566
7567 static inline void set_idle_cores(int cpu, int val)
7568 {
7569 }
7570
7571 static inline bool test_idle_cores(int cpu)
7572 {
7573 return false;
7574 }
7575
7576 static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
7577 {
7578 return __select_idle_cpu(core, p);
7579 }
7580
7581 static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
7582 {
7583 return -1;
7584 }
7585
7586 #endif /* !CONFIG_SCHED_SMT */
7587
7588 /*
7589 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
7590 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
7591 * average idle time for this rq (as found in rq->avg_idle).
7592 */
7593 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
7594 {
7595 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7596 int i, cpu, idle_cpu = -1, nr = INT_MAX;
7597 struct sched_domain_shared *sd_share;
7598
7599 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
7600
7601 if (sched_feat(SIS_UTIL)) {
7602 sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
7603 if (sd_share) {
7604 /* because !--nr is the condition to stop scan */
7605 nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
7606 /* overloaded LLC is unlikely to have idle cpu/core */
7607 if (nr == 1)
7608 return -1;
7609 }
7610 }
7611
7612 if (static_branch_unlikely(&sched_cluster_active)) {
7613 struct sched_group *sg = sd->groups;
7614
7615 if (sg->flags & SD_CLUSTER) {
7616 for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
7617 if (!cpumask_test_cpu(cpu, cpus))
7618 continue;
7619
7620 if (has_idle_core) {
7621 i = select_idle_core(p, cpu, cpus, &idle_cpu);
7622 if ((unsigned int)i < nr_cpumask_bits)
7623 return i;
7624 } else {
7625 if (--nr <= 0)
7626 return -1;
7627 idle_cpu = __select_idle_cpu(cpu, p);
7628 if ((unsigned int)idle_cpu < nr_cpumask_bits)
7629 return idle_cpu;
7630 }
7631 }
7632 cpumask_andnot(cpus, cpus, sched_group_span(sg));
7633 }
7634 }
7635
7636 for_each_cpu_wrap(cpu, cpus, target + 1) {
7637 if (has_idle_core) {
7638 i = select_idle_core(p, cpu, cpus, &idle_cpu);
7639 if ((unsigned int)i < nr_cpumask_bits)
7640 return i;
7641
7642 } else {
7643 if (--nr <= 0)
7644 return -1;
7645 idle_cpu = __select_idle_cpu(cpu, p);
7646 if ((unsigned int)idle_cpu < nr_cpumask_bits)
7647 break;
7648 }
7649 }
7650
7651 if (has_idle_core)
7652 set_idle_cores(target, false);
7653
7654 return idle_cpu;
7655 }
7656
7657 /*
7658 * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
7659 * the task fits. If no CPU is big enough, but there are idle ones, try to
7660 * maximize capacity.
7661 */
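/*
 * The 'fits' values below follow util_fits_cpu(): > 0 means the CPU satisfies
 * both the utilization and the uclamp hints, 0 means it does not fit, and < 0
 * means only the minimum performance hint (uclamp_min) cannot be met.
 */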
7662 static int
7663 select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
7664 {
7665 unsigned long task_util, util_min, util_max, best_cap = 0;
7666 int fits, best_fits = 0;
7667 int cpu, best_cpu = -1;
7668 struct cpumask *cpus;
7669
7670 cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7671 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
7672
7673 task_util = task_util_est(p);
7674 util_min = uclamp_eff_value(p, UCLAMP_MIN);
7675 util_max = uclamp_eff_value(p, UCLAMP_MAX);
7676
7677 for_each_cpu_wrap(cpu, cpus, target) {
7678 unsigned long cpu_cap = capacity_of(cpu);
7679
7680 if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
7681 continue;
7682
7683 fits = util_fits_cpu(task_util, util_min, util_max, cpu);
7684
7685 /* This CPU fits with all requirements */
7686 if (fits > 0)
7687 return cpu;
7688 /*
7689 * Only the min performance hint (i.e. uclamp_min) doesn't fit.
7690 * Look for the CPU with best capacity.
7691 */
7692 else if (fits < 0)
7693 cpu_cap = get_actual_cpu_capacity(cpu);
7694
7695 /*
7696 * First, select CPU which fits better (-1 being better than 0).
7697 * Then, select the one with best capacity at same level.
7698 */
7699 if ((fits < best_fits) ||
7700 ((fits == best_fits) && (cpu_cap > best_cap))) {
7701 best_cap = cpu_cap;
7702 best_cpu = cpu;
7703 best_fits = fits;
7704 }
7705 }
7706
7707 return best_cpu;
7708 }
7709
7710 static inline bool asym_fits_cpu(unsigned long util,
7711 unsigned long util_min,
7712 unsigned long util_max,
7713 int cpu)
7714 {
7715 if (sched_asym_cpucap_active())
7716 /*
7717 * Return true only if the cpu fully fits the task requirements
7718 * which include the utilization and the performance hints.
7719 */
7720 return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
7721
7722 return true;
7723 }
7724
7725 /*
7726 * Try and locate an idle core/thread in the LLC cache domain.
7727 */
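/*
 * Rough order of preference, as implemented below: the wakeup target itself, a
 * cache-affine idle @prev, stacking on a per-CPU kworker, a recently used CPU,
 * the asymmetric-capacity path via select_idle_capacity(), and finally a scan
 * of the LLC for an idle core/SMT sibling/CPU before falling back to @target.
 */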
7728 static int select_idle_sibling(struct task_struct *p, int prev, int target)
7729 {
7730 bool has_idle_core = false;
7731 struct sched_domain *sd;
7732 unsigned long task_util, util_min, util_max;
7733 int i, recent_used_cpu, prev_aff = -1;
7734
7735 /*
7736 * On asymmetric system, update task utilization because we will check
7737 * that the task fits with CPU's capacity.
7738 */
7739 if (sched_asym_cpucap_active()) {
7740 sync_entity_load_avg(&p->se);
7741 task_util = task_util_est(p);
7742 util_min = uclamp_eff_value(p, UCLAMP_MIN);
7743 util_max = uclamp_eff_value(p, UCLAMP_MAX);
7744 }
7745
7746 /*
7747 * per-cpu select_rq_mask usage
7748 */
7749 lockdep_assert_irqs_disabled();
7750
7751 if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
7752 asym_fits_cpu(task_util, util_min, util_max, target))
7753 return target;
7754
7755 /*
7756 * If the previous CPU is cache affine and idle, don't be stupid:
7757 */
7758 if (prev != target && cpus_share_cache(prev, target) &&
7759 (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
7760 asym_fits_cpu(task_util, util_min, util_max, prev)) {
7761
7762 if (!static_branch_unlikely(&sched_cluster_active) ||
7763 cpus_share_resources(prev, target))
7764 return prev;
7765
7766 prev_aff = prev;
7767 }
7768
7769 /*
7770 * Allow a per-cpu kthread to stack with the wakee if the
7771 	 * kworker thread and the task's previous CPU are the same.
7772 * The assumption is that the wakee queued work for the
7773 * per-cpu kthread that is now complete and the wakeup is
7774 * essentially a sync wakeup. An obvious example of this
7775 * pattern is IO completions.
7776 */
7777 if (is_per_cpu_kthread(current) &&
7778 in_task() &&
7779 prev == smp_processor_id() &&
7780 this_rq()->nr_running <= 1 &&
7781 asym_fits_cpu(task_util, util_min, util_max, prev)) {
7782 return prev;
7783 }
7784
7785 /* Check a recently used CPU as a potential idle candidate: */
7786 recent_used_cpu = p->recent_used_cpu;
7787 p->recent_used_cpu = prev;
7788 if (recent_used_cpu != prev &&
7789 recent_used_cpu != target &&
7790 cpus_share_cache(recent_used_cpu, target) &&
7791 (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
7792 cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
7793 asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
7794
7795 if (!static_branch_unlikely(&sched_cluster_active) ||
7796 cpus_share_resources(recent_used_cpu, target))
7797 return recent_used_cpu;
7798
7799 } else {
7800 recent_used_cpu = -1;
7801 }
7802
7803 /*
7804 * For asymmetric CPU capacity systems, our domain of interest is
7805 * sd_asym_cpucapacity rather than sd_llc.
7806 */
7807 if (sched_asym_cpucap_active()) {
7808 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
7809 /*
7810 * On an asymmetric CPU capacity system where an exclusive
7811 * cpuset defines a symmetric island (i.e. one unique
7812 * capacity_orig value through the cpuset), the key will be set
7813 * but the CPUs within that cpuset will not have a domain with
7814 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
7815 * capacity path.
7816 */
7817 if (sd) {
7818 i = select_idle_capacity(p, sd, target);
7819 return ((unsigned)i < nr_cpumask_bits) ? i : target;
7820 }
7821 }
7822
7823 sd = rcu_dereference(per_cpu(sd_llc, target));
7824 if (!sd)
7825 return target;
7826
7827 if (sched_smt_active()) {
7828 has_idle_core = test_idle_cores(target);
7829
7830 if (!has_idle_core && cpus_share_cache(prev, target)) {
7831 i = select_idle_smt(p, sd, prev);
7832 if ((unsigned int)i < nr_cpumask_bits)
7833 return i;
7834 }
7835 }
7836
7837 i = select_idle_cpu(p, sd, has_idle_core, target);
7838 if ((unsigned)i < nr_cpumask_bits)
7839 return i;
7840
7841 /*
7842 	 * For cluster machines which have a lower-level shared cache such as L2 or
7843 	 * the LLC tag, we tend to look for an idle CPU in the target's cluster
7844 	 * first. But prev_cpu or recent_used_cpu may also be good candidates;
7845 	 * use them if possible when no idle CPU is found in select_idle_cpu().
7846 */
7847 if ((unsigned int)prev_aff < nr_cpumask_bits)
7848 return prev_aff;
7849 if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
7850 return recent_used_cpu;
7851
7852 return target;
7853 }
7854
7855 /**
7856 * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
7857 * @cpu: the CPU to get the utilization for
7858 * @p: task for which the CPU utilization should be predicted or NULL
7859 * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
7860 * @boost: 1 to enable boosting, otherwise 0
7861 *
7862 * The unit of the return value must be the same as the one of CPU capacity
7863 * so that CPU utilization can be compared with CPU capacity.
7864 *
7865 * CPU utilization is the sum of running time of runnable tasks plus the
7866 * recent utilization of currently non-runnable tasks on that CPU.
7867 * It represents the amount of CPU capacity currently used by CFS tasks in
7868 * the range [0..max CPU capacity] with max CPU capacity being the CPU
7869 * capacity at f_max.
7870 *
7871 * The estimated CPU utilization is defined as the maximum between CPU
7872 * utilization and sum of the estimated utilization of the currently
7873 * runnable tasks on that CPU. It preserves a utilization "snapshot" of
7874 * previously-executed tasks, which helps better deduce how busy a CPU will
7875 * be when a long-sleeping task wakes up. The contribution to CPU utilization
7876 * of such a task would be significantly decayed at this point of time.
7877 *
7878 * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
7879 * CPU contention for CFS tasks can be detected by CPU runnable > CPU
7880 * utilization. Boosting is implemented in cpu_util() so that internal
7881 * users (e.g. EAS) can use it next to external users (e.g. schedutil),
7882 * latter via cpu_util_cfs_boost().
7883 *
7884 * CPU utilization can be higher than the current CPU capacity
7885 * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
7886 * of rounding errors as well as task migrations or wakeups of new tasks.
7887 * CPU utilization has to be capped to fit into the [0..max CPU capacity]
7888 * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
7889 * could be seen as over-utilized even though CPU1 has 20% of spare CPU
7890 * capacity. CPU utilization is allowed to overshoot current CPU capacity
7891 * though since this is useful for predicting the CPU capacity required
7892 * after task migrations (scheduler-driven DVFS).
7893 *
7894 * Return: (Boosted) (estimated) utilization for the specified CPU.
7895 */
7896 static unsigned long
7897 cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
7898 {
7899 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
7900 unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
7901 unsigned long runnable;
7902
7903 if (boost) {
7904 runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
7905 util = max(util, runnable);
7906 }
7907
7908 /*
7909 * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
7910 * contribution. If @p migrates from another CPU to @cpu add its
7911 * contribution. In all the other cases @cpu is not impacted by the
7912 * migration so its util_avg is already correct.
7913 */
7914 if (p && task_cpu(p) == cpu && dst_cpu != cpu)
7915 lsub_positive(&util, task_util(p));
7916 else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
7917 util += task_util(p);
7918
7919 if (sched_feat(UTIL_EST)) {
7920 unsigned long util_est;
7921
7922 util_est = READ_ONCE(cfs_rq->avg.util_est);
7923
7924 /*
7925 * During wake-up @p isn't enqueued yet and doesn't contribute
7926 * to any cpu_rq(cpu)->cfs.avg.util_est.
7927 * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
7928 * has been enqueued.
7929 *
7930 * During exec (@dst_cpu = -1) @p is enqueued and does
7931 * contribute to cpu_rq(cpu)->cfs.util_est.
7932 * Remove it to "simulate" cpu_util without @p's contribution.
7933 *
7934 * Despite the task_on_rq_queued(@p) check there is still a
7935 * small window for a possible race when an exec
7936 * select_task_rq_fair() races with LB's detach_task().
7937 *
7938 * detach_task()
7939 * deactivate_task()
7940 * p->on_rq = TASK_ON_RQ_MIGRATING;
7941 * -------------------------------- A
7942 * dequeue_task() \
7943 * dequeue_task_fair() + Race Time
7944 * util_est_dequeue() /
7945 * -------------------------------- B
7946 *
7947 * The additional check "current == p" is required to further
7948 * reduce the race window.
7949 */
7950 if (dst_cpu == cpu)
7951 util_est += _task_util_est(p);
7952 else if (p && unlikely(task_on_rq_queued(p) || current == p))
7953 lsub_positive(&util_est, _task_util_est(p));
7954
7955 util = max(util, util_est);
7956 }
7957
7958 return min(util, arch_scale_cpu_capacity(cpu));
7959 }
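
/*
 * Worked example (illustrative numbers): with util_avg = 300,
 * runnable_avg = 450, util_est = 400 and arch_scale_cpu_capacity() = 1024,
 * a boosted call with @p == NULL returns min(max(max(300, 450), 400), 1024) = 450.
 */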
7960
7961 unsigned long cpu_util_cfs(int cpu)
7962 {
7963 return cpu_util(cpu, NULL, -1, 0);
7964 }
7965
7966 unsigned long cpu_util_cfs_boost(int cpu)
7967 {
7968 return cpu_util(cpu, NULL, -1, 1);
7969 }
7970
7971 /*
7972 * cpu_util_without: compute cpu utilization without any contributions from *p
7973  * @cpu: the CPU whose utilization is requested
7974  * @p: the task whose utilization should be discounted
7975 *
7976 * The utilization of a CPU is defined by the utilization of tasks currently
7977 * enqueued on that CPU as well as tasks which are currently sleeping after an
7978 * execution on that CPU.
7979 *
7980 * This method returns the utilization of the specified CPU by discounting the
7981 * utilization of the specified task, whenever the task is currently
7982 * contributing to the CPU utilization.
7983 */
7984 static unsigned long cpu_util_without(int cpu, struct task_struct *p)
7985 {
7986 /* Task has no contribution or is new */
7987 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
7988 p = NULL;
7989
7990 return cpu_util(cpu, p, -1, 0);
7991 }
7992
7993 /*
7994 * This function computes an effective utilization for the given CPU, to be
7995 * used for frequency selection given the linear relation: f = u * f_max.
7996 *
7997 * The scheduler tracks the following metrics:
7998 *
7999 * cpu_util_{cfs,rt,dl,irq}()
8000 * cpu_bw_dl()
8001 *
8002 * Where the cfs,rt and dl util numbers are tracked with the same metric and
8003 * synchronized windows and are thus directly comparable.
8004 *
8005 * The cfs,rt,dl utilization are the running times measured with rq->clock_task
8006 * which excludes things like IRQ and steal-time. These latter are then accrued
8007 * in the IRQ utilization.
8008 *
8009 * The DL bandwidth number OTOH is not a measured metric but a value computed
8010 * based on the task model parameters and gives the minimal utilization
8011 * required to meet deadlines.
8012 */
8013 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
8014 unsigned long *min,
8015 unsigned long *max)
8016 {
8017 unsigned long util, irq, scale;
8018 struct rq *rq = cpu_rq(cpu);
8019
8020 scale = arch_scale_cpu_capacity(cpu);
8021
8022 /*
8023 	 * Early check to see if IRQ/steal time saturates the CPU; this can happen
8024 	 * because of inaccuracies in how we track these -- see
8025 * update_irq_load_avg().
8026 */
8027 irq = cpu_util_irq(rq);
8028 if (unlikely(irq >= scale)) {
8029 if (min)
8030 *min = scale;
8031 if (max)
8032 *max = scale;
8033 return scale;
8034 }
8035
8036 if (min) {
8037 /*
8038 * The minimum utilization returns the highest level between:
8039 * - the computed DL bandwidth needed with the IRQ pressure which
8040 		 *   steals time from the deadline task.
8041 * - The minimum performance requirement for CFS and/or RT.
8042 */
8043 *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
8044
8045 /*
8046 * When an RT task is runnable and uclamp is not used, we must
8047 * ensure that the task will run at maximum compute capacity.
8048 */
8049 if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
8050 *min = max(*min, scale);
8051 }
8052
8053 /*
8054 	 * Because the time spent on RT/DL tasks is visible as 'lost' time to
8055 * CFS tasks and we use the same metric to track the effective
8056 * utilization (PELT windows are synchronized) we can directly add them
8057 * to obtain the CPU's actual utilization.
8058 */
8059 util = util_cfs + cpu_util_rt(rq);
8060 util += cpu_util_dl(rq);
8061
8062 /*
8063 * The maximum hint is a soft bandwidth requirement, which can be lower
8064 * than the actual utilization because of uclamp_max requirements.
8065 */
8066 if (max)
8067 *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
8068
8069 if (util >= scale)
8070 return scale;
8071
8072 /*
8073 * There is still idle time; further improve the number by using the
8074 * IRQ metric. Because IRQ/steal time is hidden from the task clock we
8075 * need to scale the task numbers:
8076 *
8077 * max - irq
8078 * U' = irq + --------- * U
8079 * max
8080 */
8081 util = scale_irq_capacity(util, irq, scale);
8082 util += irq;
8083
8084 return min(scale, util);
8085 }
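
/*
 * Worked example of the IRQ scaling above (illustrative numbers): with
 * scale = 1024, irq = 128 and util (CFS + RT + DL) = 512, the result is
 * 128 + 512 * (1024 - 128) / 1024 = 128 + 448 = 576.
 */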
8086
8087 unsigned long sched_cpu_util(int cpu)
8088 {
8089 return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
8090 }
8091
8092 /*
8093 * energy_env - Utilization landscape for energy estimation.
8094 * @task_busy_time: Utilization contribution by the task for which we test the
8095 * placement. Given by eenv_task_busy_time().
8096 * @pd_busy_time: Utilization of the whole perf domain without the task
8097 * contribution. Given by eenv_pd_busy_time().
8098 * @cpu_cap: Maximum CPU capacity for the perf domain.
8099 * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap).
8100 */
8101 struct energy_env {
8102 unsigned long task_busy_time;
8103 unsigned long pd_busy_time;
8104 unsigned long cpu_cap;
8105 unsigned long pd_cap;
8106 };
8107
8108 /*
8109 * Compute the task busy time for compute_energy(). This time cannot be
8110 * injected directly into effective_cpu_util() because of the IRQ scaling.
8111 * The latter only makes sense with the most recent CPUs where the task has
8112 * run.
8113 */
8114 static inline void eenv_task_busy_time(struct energy_env *eenv,
8115 struct task_struct *p, int prev_cpu)
8116 {
8117 unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu);
8118 unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
8119
8120 if (unlikely(irq >= max_cap))
8121 busy_time = max_cap;
8122 else
8123 busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap);
8124
8125 eenv->task_busy_time = busy_time;
8126 }
8127
8128 /*
8129 * Compute the perf_domain (PD) busy time for compute_energy(). Based on the
8130  * utilization for each @pd_cpus; it does not, however, take clamping into
8131  * account since the ratio (utilization / cpu_capacity) is already enough to
8132  * scale the EM reported power consumption at the (possibly clamped)
8133 * cpu_capacity.
8134 *
8135 * The contribution of the task @p for which we want to estimate the
8136 * energy cost is removed (by cpu_util()) and must be calculated
8137 * separately (see eenv_task_busy_time). This ensures:
8138 *
8139 * - A stable PD utilization, no matter which CPU of that PD we want to place
8140 * the task on.
8141 *
8142 * - A fair comparison between CPUs as the task contribution (task_util())
8143 * will always be the same no matter which CPU utilization we rely on
8144 * (util_avg or util_est).
8145 *
8146 * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't
8147 * exceed @eenv->pd_cap.
8148 */
8149 static inline void eenv_pd_busy_time(struct energy_env *eenv,
8150 struct cpumask *pd_cpus,
8151 struct task_struct *p)
8152 {
8153 unsigned long busy_time = 0;
8154 int cpu;
8155
8156 for_each_cpu(cpu, pd_cpus) {
8157 unsigned long util = cpu_util(cpu, p, -1, 0);
8158
8159 busy_time += effective_cpu_util(cpu, util, NULL, NULL);
8160 }
8161
8162 eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
8163 }
8164
8165 /*
8166 * Compute the maximum utilization for compute_energy() when the task @p
8167 * is placed on the cpu @dst_cpu.
8168 *
8169 * Returns the maximum utilization among @eenv->cpus. This utilization can't
8170 * exceed @eenv->cpu_cap.
8171 */
8172 static inline unsigned long
8173 eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
8174 struct task_struct *p, int dst_cpu)
8175 {
8176 unsigned long max_util = 0;
8177 int cpu;
8178
8179 for_each_cpu(cpu, pd_cpus) {
8180 struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
8181 unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
8182 unsigned long eff_util, min, max;
8183
8184 /*
8185 * Performance domain frequency: utilization clamping
8186 * must be considered since it affects the selection
8187 * of the performance domain frequency.
8188 * NOTE: in case RT tasks are running, by default the min
8189 * utilization can be max OPP.
8190 */
8191 eff_util = effective_cpu_util(cpu, util, &min, &max);
8192
8193 /* Task's uclamp can modify min and max value */
8194 if (tsk && uclamp_is_used()) {
8195 min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
8196
8197 /*
8198 * If there is no active max uclamp constraint,
8199 * directly use task's one, otherwise keep max.
8200 */
8201 if (uclamp_rq_is_idle(cpu_rq(cpu)))
8202 max = uclamp_eff_value(p, UCLAMP_MAX);
8203 else
8204 max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
8205 }
8206
8207 eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
8208 max_util = max(max_util, eff_util);
8209 }
8210
8211 return min(max_util, eenv->cpu_cap);
8212 }
8213
8214 /*
8215 * compute_energy(): Use the Energy Model to estimate the energy that @pd would
8216 * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
8217 * contribution is ignored.
8218 */
8219 static inline unsigned long
8220 compute_energy(struct energy_env *eenv, struct perf_domain *pd,
8221 struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
8222 {
8223 unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
8224 unsigned long busy_time = eenv->pd_busy_time;
8225 unsigned long energy;
8226
8227 if (dst_cpu >= 0)
8228 busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
8229
8230 energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
8231
8232 trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
8233
8234 return energy;
8235 }
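
/*
 * Typical usage, as in find_energy_efficient_cpu() below: the cost of a
 * candidate placement is evaluated as a delta against the PD's base energy:
 *
 *   base  = compute_energy(&eenv, pd, cpus, p, -1);
 *   delta = compute_energy(&eenv, pd, cpus, p, dst_cpu) - base;
 */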
8236
8237 /*
8238 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
8239 * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
8240 * spare capacity in each performance domain and uses it as a potential
8241 * candidate to execute the task. Then, it uses the Energy Model to figure
8242 * out which of the CPU candidates is the most energy-efficient.
8243 *
8244 * The rationale for this heuristic is as follows. In a performance domain,
8245 * all the most energy efficient CPU candidates (according to the Energy
8246 * Model) are those for which we'll request a low frequency. When there are
8247 * several CPUs for which the frequency request will be the same, we don't
8248 * have enough data to break the tie between them, because the Energy Model
8249 * only includes active power costs. With this model, if we assume that
8250 * frequency requests follow utilization (e.g. using schedutil), the CPU with
8251 * the maximum spare capacity in a performance domain is guaranteed to be among
8252 * the best candidates of the performance domain.
8253 *
8254 * In practice, it could be preferable from an energy standpoint to pack
8255 * small tasks on a CPU in order to let other CPUs go in deeper idle states,
8256 * but that could also hurt our chances to go cluster idle, and we have no
8257 * ways to tell with the current Energy Model if this is actually a good
8258 * idea or not. So, find_energy_efficient_cpu() basically favors
8259 * cluster-packing, and spreading inside a cluster. That should at least be
8260 * a good thing for latency, and this is consistent with the idea that most
8261 * of the energy savings of EAS come from the asymmetry of the system, and
8262 * not so much from breaking the tie between identical CPUs. That's also the
8263 * reason why EAS is enabled in the topology code only for systems where
8264 * SD_ASYM_CPUCAPACITY is set.
8265 *
8266 * NOTE: Forkees are not accepted in the energy-aware wake-up path because
8267 * they don't have any useful utilization data yet and it's not possible to
8268 * forecast their impact on energy consumption. Consequently, they will be
8269 * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out
8270 * to be energy-inefficient in some use-cases. The alternative would be to
8271 * bias new tasks towards specific types of CPUs first, or to try to infer
8272 * their util_avg from the parent task, but those heuristics could hurt
8273 * other use-cases too. So, until someone finds a better way to solve this,
8274 * let's keep things simple by re-using the existing slow path.
8275 */
8276 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
8277 {
8278 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
8279 unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
8280 unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
8281 unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
8282 struct root_domain *rd = this_rq()->rd;
8283 int cpu, best_energy_cpu, target = -1;
8284 int prev_fits = -1, best_fits = -1;
8285 unsigned long best_actual_cap = 0;
8286 unsigned long prev_actual_cap = 0;
8287 struct sched_domain *sd;
8288 struct perf_domain *pd;
8289 struct energy_env eenv;
8290
8291 rcu_read_lock();
8292 pd = rcu_dereference(rd->pd);
8293 if (!pd)
8294 goto unlock;
8295
8296 /*
8297 * Energy-aware wake-up happens on the lowest sched_domain starting
8298 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
8299 */
8300 sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
8301 while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
8302 sd = sd->parent;
8303 if (!sd)
8304 goto unlock;
8305
8306 target = prev_cpu;
8307
8308 sync_entity_load_avg(&p->se);
8309 if (!task_util_est(p) && p_util_min == 0)
8310 goto unlock;
8311
8312 eenv_task_busy_time(&eenv, p, prev_cpu);
8313
8314 for (; pd; pd = pd->next) {
8315 unsigned long util_min = p_util_min, util_max = p_util_max;
8316 unsigned long cpu_cap, cpu_actual_cap, util;
8317 long prev_spare_cap = -1, max_spare_cap = -1;
8318 unsigned long rq_util_min, rq_util_max;
8319 unsigned long cur_delta, base_energy;
8320 int max_spare_cap_cpu = -1;
8321 int fits, max_fits = -1;
8322
8323 cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
8324
8325 if (cpumask_empty(cpus))
8326 continue;
8327
8328 /* Account external pressure for the energy estimation */
8329 cpu = cpumask_first(cpus);
8330 cpu_actual_cap = get_actual_cpu_capacity(cpu);
8331
8332 eenv.cpu_cap = cpu_actual_cap;
8333 eenv.pd_cap = 0;
8334
8335 for_each_cpu(cpu, cpus) {
8336 struct rq *rq = cpu_rq(cpu);
8337
8338 eenv.pd_cap += cpu_actual_cap;
8339
8340 if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
8341 continue;
8342
8343 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
8344 continue;
8345
8346 util = cpu_util(cpu, p, cpu, 0);
8347 cpu_cap = capacity_of(cpu);
8348
8349 /*
8350 * Skip CPUs that cannot satisfy the capacity request.
8351 * IOW, placing the task there would make the CPU
8352 * overutilized. Take uclamp into account to see how
8353 * much capacity we can get out of the CPU; this is
8354 * aligned with sched_cpu_util().
8355 */
8356 if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
8357 /*
8358 * Open code uclamp_rq_util_with() except for
8359 * the clamp() part. I.e.: apply max aggregation
8360 * only. util_fits_cpu() logic requires to
8361 * operate on non clamped util but must use the
8362 * max-aggregated uclamp_{min, max}.
8363 */
8364 rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
8365 rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
8366
8367 util_min = max(rq_util_min, p_util_min);
8368 util_max = max(rq_util_max, p_util_max);
8369 }
8370
8371 fits = util_fits_cpu(util, util_min, util_max, cpu);
8372 if (!fits)
8373 continue;
8374
8375 lsub_positive(&cpu_cap, util);
8376
8377 if (cpu == prev_cpu) {
8378 /* Always use prev_cpu as a candidate. */
8379 prev_spare_cap = cpu_cap;
8380 prev_fits = fits;
8381 } else if ((fits > max_fits) ||
8382 ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
8383 /*
8384 * Find the CPU with the maximum spare capacity
8385 * among the remaining CPUs in the performance
8386 * domain.
8387 */
8388 max_spare_cap = cpu_cap;
8389 max_spare_cap_cpu = cpu;
8390 max_fits = fits;
8391 }
8392 }
8393
8394 if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
8395 continue;
8396
8397 eenv_pd_busy_time(&eenv, cpus, p);
8398 /* Compute the 'base' energy of the pd, without @p */
8399 base_energy = compute_energy(&eenv, pd, cpus, p, -1);
8400
8401 /* Evaluate the energy impact of using prev_cpu. */
8402 if (prev_spare_cap > -1) {
8403 prev_delta = compute_energy(&eenv, pd, cpus, p,
8404 prev_cpu);
8405 /* CPU utilization has changed */
8406 if (prev_delta < base_energy)
8407 goto unlock;
8408 prev_delta -= base_energy;
8409 prev_actual_cap = cpu_actual_cap;
8410 best_delta = min(best_delta, prev_delta);
8411 }
8412
8413 /* Evaluate the energy impact of using max_spare_cap_cpu. */
8414 if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
8415 /* Current best energy cpu fits better */
8416 if (max_fits < best_fits)
8417 continue;
8418
8419 /*
8420 			 * Neither fits the performance hint (i.e. uclamp_min),
8421 			 * but the best-energy CPU has better capacity.
8422 */
8423 if ((max_fits < 0) &&
8424 (cpu_actual_cap <= best_actual_cap))
8425 continue;
8426
8427 cur_delta = compute_energy(&eenv, pd, cpus, p,
8428 max_spare_cap_cpu);
8429 /* CPU utilization has changed */
8430 if (cur_delta < base_energy)
8431 goto unlock;
8432 cur_delta -= base_energy;
8433
8434 /*
8435 			 * Both fit the task, but the best-energy CPU has a lower
8436 			 * energy impact.
8437 */
8438 if ((max_fits > 0) && (best_fits > 0) &&
8439 (cur_delta >= best_delta))
8440 continue;
8441
8442 best_delta = cur_delta;
8443 best_energy_cpu = max_spare_cap_cpu;
8444 best_fits = max_fits;
8445 best_actual_cap = cpu_actual_cap;
8446 }
8447 }
8448 rcu_read_unlock();
8449
8450 if ((best_fits > prev_fits) ||
8451 ((best_fits > 0) && (best_delta < prev_delta)) ||
8452 ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
8453 target = best_energy_cpu;
8454
8455 return target;
8456
8457 unlock:
8458 rcu_read_unlock();
8459
8460 return target;
8461 }
8462
8463 /*
8464 * select_task_rq_fair: Select target runqueue for the waking task in domains
8465 * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
8466 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
8467 *
8468 * Balances load by selecting the idlest CPU in the idlest group, or under
8469 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
8470 *
8471 * Returns the target CPU number.
8472 */
8473 static int
8474 select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
8475 {
8476 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
8477 struct sched_domain *tmp, *sd = NULL;
8478 int cpu = smp_processor_id();
8479 int new_cpu = prev_cpu;
8480 int want_affine = 0;
8481 /* SD_flags and WF_flags share the first nibble */
8482 int sd_flag = wake_flags & 0xF;
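	/*
	 * e.g. WF_EXEC maps to SD_BALANCE_EXEC, WF_FORK to SD_BALANCE_FORK and
	 * WF_TTWU to SD_BALANCE_WAKE.
	 */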
8483
8484 /*
8485 * required for stable ->cpus_allowed
8486 */
8487 lockdep_assert_held(&p->pi_lock);
8488 if (wake_flags & WF_TTWU) {
8489 record_wakee(p);
8490
8491 if ((wake_flags & WF_CURRENT_CPU) &&
8492 cpumask_test_cpu(cpu, p->cpus_ptr))
8493 return cpu;
8494
8495 if (!is_rd_overutilized(this_rq()->rd)) {
8496 new_cpu = find_energy_efficient_cpu(p, prev_cpu);
8497 if (new_cpu >= 0)
8498 return new_cpu;
8499 new_cpu = prev_cpu;
8500 }
8501
8502 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
8503 }
8504
8505 rcu_read_lock();
8506 for_each_domain(cpu, tmp) {
8507 /*
8508 * If both 'cpu' and 'prev_cpu' are part of this domain,
8509 * cpu is a valid SD_WAKE_AFFINE target.
8510 */
8511 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
8512 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
8513 if (cpu != prev_cpu)
8514 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
8515
8516 sd = NULL; /* Prefer wake_affine over balance flags */
8517 break;
8518 }
8519
8520 /*
8521 * Usually only true for WF_EXEC and WF_FORK, as sched_domains
8522 * usually do not have SD_BALANCE_WAKE set. That means wakeup
8523 * will usually go to the fast path.
8524 */
8525 if (tmp->flags & sd_flag)
8526 sd = tmp;
8527 else if (!want_affine)
8528 break;
8529 }
8530
8531 if (unlikely(sd)) {
8532 /* Slow path */
8533 new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
8534 } else if (wake_flags & WF_TTWU) { /* XXX always ? */
8535 /* Fast path */
8536 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
8537 }
8538 rcu_read_unlock();
8539
8540 return new_cpu;
8541 }
8542
8543 /*
8544 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
8545 * cfs_rq_of(p) references at time of call are still valid and identify the
8546 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
8547 */
8548 static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
8549 {
8550 struct sched_entity *se = &p->se;
8551
8552 if (!task_on_rq_migrating(p)) {
8553 remove_entity_load_avg(se);
8554
8555 /*
8556 * Here, the task's PELT values have been updated according to
8557 * the current rq's clock. But if that clock hasn't been
8558 * updated in a while, a substantial idle time will be missed,
8559 * leading to an inflation after wake-up on the new rq.
8560 *
8561 * Estimate the missing time from the cfs_rq last_update_time
8562 * and update sched_avg to improve the PELT continuity after
8563 * migration.
8564 */
8565 migrate_se_pelt_lag(se);
8566 }
8567
8568 /* Tell new CPU we are migrated */
8569 se->avg.last_update_time = 0;
8570
8571 update_scan_period(p, new_cpu);
8572 }
8573
8574 static void task_dead_fair(struct task_struct *p)
8575 {
8576 struct sched_entity *se = &p->se;
8577
8578 if (se->sched_delayed) {
8579 struct rq_flags rf;
8580 struct rq *rq;
8581
8582 rq = task_rq_lock(p, &rf);
8583 if (se->sched_delayed) {
8584 update_rq_clock(rq);
8585 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
8586 }
8587 task_rq_unlock(rq, p, &rf);
8588 }
8589
8590 remove_entity_load_avg(se);
8591 }
8592
8593 /*
8594 * Set the max capacity the task is allowed to run at for misfit detection.
8595 */
8596 static void set_task_max_allowed_capacity(struct task_struct *p)
8597 {
8598 struct asym_cap_data *entry;
8599
8600 if (!sched_asym_cpucap_active())
8601 return;
8602
8603 rcu_read_lock();
8604 list_for_each_entry_rcu(entry, &asym_cap_list, link) {
8605 cpumask_t *cpumask;
8606
8607 cpumask = cpu_capacity_span(entry);
8608 if (!cpumask_intersects(p->cpus_ptr, cpumask))
8609 continue;
8610
8611 p->max_allowed_capacity = entry->capacity;
8612 break;
8613 }
8614 rcu_read_unlock();
8615 }
8616
8617 static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
8618 {
8619 set_cpus_allowed_common(p, ctx);
8620 set_task_max_allowed_capacity(p);
8621 }
8622
8623 static int
8624 balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
8625 {
8626 if (sched_fair_runnable(rq))
8627 return 1;
8628
8629 return sched_balance_newidle(rq, rf) != 0;
8630 }
8631
8632 static void set_next_buddy(struct sched_entity *se)
8633 {
8634 for_each_sched_entity(se) {
8635 if (WARN_ON_ONCE(!se->on_rq))
8636 return;
8637 if (se_is_idle(se))
8638 return;
8639 cfs_rq_of(se)->next = se;
8640 }
8641 }
8642
8643 /*
8644 * Preempt the current task with a newly woken task if needed:
8645 */
8646 static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
8647 {
8648 struct task_struct *donor = rq->donor;
8649 struct sched_entity *se = &donor->se, *pse = &p->se;
8650 struct cfs_rq *cfs_rq = task_cfs_rq(donor);
8651 int cse_is_idle, pse_is_idle;
8652 bool do_preempt_short = false;
8653
8654 if (unlikely(se == pse))
8655 return;
8656
8657 /*
8658 * This is possible from callers such as attach_tasks(), in which we
8659 * unconditionally wakeup_preempt() after an enqueue (which may have
8660 * led to a throttle). This both saves work and prevents false
8661 * next-buddy nomination below.
8662 */
8663 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
8664 return;
8665
8666 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
8667 set_next_buddy(pse);
8668 }
8669
8670 /*
8671 * We can come here with TIF_NEED_RESCHED already set from new task
8672 * wake up path.
8673 *
8674 * Note: this also catches the edge-case of curr being in a throttled
8675 * group (e.g. via set_curr_task), since update_curr() (in the
8676 * enqueue of curr) will have resulted in resched being set. This
8677 * prevents us from potentially nominating it as a false LAST_BUDDY
8678 * below.
8679 */
8680 if (test_tsk_need_resched(rq->curr))
8681 return;
8682
8683 if (!sched_feat(WAKEUP_PREEMPTION))
8684 return;
8685
8686 find_matching_se(&se, &pse);
8687 WARN_ON_ONCE(!pse);
8688
8689 cse_is_idle = se_is_idle(se);
8690 pse_is_idle = se_is_idle(pse);
8691
8692 /*
8693 * Preempt an idle entity in favor of a non-idle entity (and don't preempt
8694 * in the inverse case).
8695 */
8696 if (cse_is_idle && !pse_is_idle) {
8697 /*
8698 * When a non-idle entity preempts an idle entity,
8699 * don't give the idle entity slice protection.
8700 */
8701 do_preempt_short = true;
8702 goto preempt;
8703 }
8704
8705 if (cse_is_idle != pse_is_idle)
8706 return;
8707
8708 /*
8709 * BATCH and IDLE tasks do not preempt others.
8710 */
8711 if (unlikely(!normal_policy(p->policy)))
8712 return;
8713
8714 cfs_rq = cfs_rq_of(se);
8715 update_curr(cfs_rq);
8716 /*
8717 * If @p has a shorter slice than current and @p is eligible, override
8718 * current's slice protection in order to allow preemption.
8719 */
8720 do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice);
8721
8722 /*
8723 * If @p has become the most eligible task, force preemption.
8724 */
8725 if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse)
8726 goto preempt;
8727
8728 if (sched_feat(RUN_TO_PARITY) && do_preempt_short)
8729 update_protect_slice(cfs_rq, se);
8730
8731 return;
8732
8733 preempt:
8734 if (do_preempt_short)
8735 cancel_protect_slice(se);
8736
8737 resched_curr_lazy(rq);
8738 }
8739
8740 static struct task_struct *pick_task_fair(struct rq *rq)
8741 {
8742 struct sched_entity *se;
8743 struct cfs_rq *cfs_rq;
8744
8745 again:
8746 cfs_rq = &rq->cfs;
8747 if (!cfs_rq->nr_queued)
8748 return NULL;
8749
8750 do {
8751 /* Might not have done put_prev_entity() */
8752 if (cfs_rq->curr && cfs_rq->curr->on_rq)
8753 update_curr(cfs_rq);
8754
8755 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
8756 goto again;
8757
8758 se = pick_next_entity(rq, cfs_rq);
8759 if (!se)
8760 goto again;
8761 cfs_rq = group_cfs_rq(se);
8762 } while (cfs_rq);
8763
8764 return task_of(se);
8765 }
8766
8767 static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
8768 static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
8769
8770 struct task_struct *
8771 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
8772 {
8773 struct sched_entity *se;
8774 struct task_struct *p;
8775 int new_tasks;
8776
8777 again:
8778 p = pick_task_fair(rq);
8779 if (!p)
8780 goto idle;
8781 se = &p->se;
8782
8783 #ifdef CONFIG_FAIR_GROUP_SCHED
8784 if (prev->sched_class != &fair_sched_class)
8785 goto simple;
8786
8787 __put_prev_set_next_dl_server(rq, prev, p);
8788
8789 /*
8790 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
8791 * likely that a next task is from the same cgroup as the current.
8792 *
8793 * Therefore attempt to avoid putting and setting the entire cgroup
8794 * hierarchy, only change the part that actually changes.
8795 *
8796 * Since we haven't yet done put_prev_entity() and the selected task
8797 * may be a different task than we started out with, try to touch the
8798 * smallest possible number of cfs_rqs.
8799 */
8800 if (prev != p) {
8801 struct sched_entity *pse = &prev->se;
8802 struct cfs_rq *cfs_rq;
8803
8804 while (!(cfs_rq = is_same_group(se, pse))) {
8805 int se_depth = se->depth;
8806 int pse_depth = pse->depth;
8807
8808 if (se_depth <= pse_depth) {
8809 put_prev_entity(cfs_rq_of(pse), pse);
8810 pse = parent_entity(pse);
8811 }
8812 if (se_depth >= pse_depth) {
8813 set_next_entity(cfs_rq_of(se), se);
8814 se = parent_entity(se);
8815 }
8816 }
8817
8818 put_prev_entity(cfs_rq, pse);
8819 set_next_entity(cfs_rq, se);
8820
8821 __set_next_task_fair(rq, p, true);
8822 }
8823
8824 return p;
8825
8826 simple:
8827 #endif /* CONFIG_FAIR_GROUP_SCHED */
8828 put_prev_set_next_task(rq, prev, p);
8829 return p;
8830
8831 idle:
8832 if (!rf)
8833 return NULL;
8834
8835 new_tasks = sched_balance_newidle(rq, rf);
8836
8837 /*
8838 * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
8839 * possible for any higher priority task to appear. In that case we
8840 * must re-start the pick_next_entity() loop.
8841 */
8842 if (new_tasks < 0)
8843 return RETRY_TASK;
8844
8845 if (new_tasks > 0)
8846 goto again;
8847
8848 /*
8849 * rq is about to be idle, check if we need to update the
8850 * lost_idle_time of clock_pelt
8851 */
8852 update_idle_rq_clock_pelt(rq);
8853
8854 return NULL;
8855 }
8856
8857 static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev)
8858 {
8859 return pick_next_task_fair(rq, prev, NULL);
8860 }
8861
8862 static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
8863 {
8864 return !!dl_se->rq->cfs.nr_queued;
8865 }
8866
8867 static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
8868 {
8869 return pick_task_fair(dl_se->rq);
8870 }
8871
8872 void fair_server_init(struct rq *rq)
8873 {
8874 struct sched_dl_entity *dl_se = &rq->fair_server;
8875
8876 init_dl_entity(dl_se);
8877
8878 dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
8879 }
8880
8881 /*
8882 * Account for a descheduled task:
8883 */
8884 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next)
8885 {
8886 struct sched_entity *se = &prev->se;
8887 struct cfs_rq *cfs_rq;
8888
8889 for_each_sched_entity(se) {
8890 cfs_rq = cfs_rq_of(se);
8891 put_prev_entity(cfs_rq, se);
8892 }
8893 }
8894
8895 /*
8896 * sched_yield() is very simple
8897 */
8898 static void yield_task_fair(struct rq *rq)
8899 {
8900 struct task_struct *curr = rq->curr;
8901 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
8902 struct sched_entity *se = &curr->se;
8903
8904 /*
8905 * Are we the only task in the tree?
8906 */
8907 if (unlikely(rq->nr_running == 1))
8908 return;
8909
8910 clear_buddies(cfs_rq, se);
8911
8912 update_rq_clock(rq);
8913 /*
8914 * Update run-time statistics of the 'current'.
8915 */
8916 update_curr(cfs_rq);
8917 /*
8918 * Tell update_rq_clock() that we've just updated,
8919 * so we don't do microscopic update in schedule()
8920 * and double the fastpath cost.
8921 */
8922 rq_clock_skip_update(rq);
8923
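/*
 * Yield by pushing our deadline one (weight-scaled) slice into the
 * future, so the EEVDF pick will prefer other runnable entities until
 * that new deadline is reached.
 */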
8924 se->deadline += calc_delta_fair(se->slice, se);
8925 }
8926
8927 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
8928 {
8929 struct sched_entity *se = &p->se;
8930
8931 /* throttled hierarchies are not runnable */
8932 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
8933 return false;
8934
8935 /* Tell the scheduler that we'd really like se to run next. */
8936 set_next_buddy(se);
8937
8938 yield_task_fair(rq);
8939
8940 return true;
8941 }
8942
8943 /**************************************************
8944 * Fair scheduling class load-balancing methods.
8945 *
8946 * BASICS
8947 *
8948 * The purpose of load-balancing is to achieve the same basic fairness the
8949 * per-CPU scheduler provides, namely provide a proportional amount of compute
8950 * time to each task. This is expressed in the following equation:
8951 *
8952 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
8953 *
8954 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
8955 * W_i,0 is defined as:
8956 *
8957 * W_i,0 = \Sum_j w_i,j (2)
8958 *
8959 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
8960 * is derived from the nice value as per sched_prio_to_weight[].
8961 *
8962 * The weight average is an exponential decay average of the instantaneous
8963 * weight:
8964 *
8965 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
8966 *
8967 * C_i is the compute capacity of CPU i; typically it is the
8968 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
8969 * can also include other factors [XXX].
8970 *
8971 * To achieve this balance we define a measure of imbalance which follows
8972 * directly from (1):
8973 *
8974 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
8975 *
8976 * We then move tasks around to minimize the imbalance. In the continuous
8977 * function space it is obvious this converges, in the discrete case we get
8978 * a few fun cases generally called infeasible weight scenarios.
8979 *
8980 * [XXX expand on:
8981 * - infeasible weights;
8982 * - local vs global optima in the discrete case. ]
8983 *
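 * A worked example of (1)-(4), with illustrative numbers: take two CPUs of
 * equal capacity C_1 = C_2 and let CPU 1 run three nice-0 tasks while CPU 2
 * runs one, so W_1 = 3 and W_2 = 1 in units of the nice-0 weight. Then
 * avg(W/C) = 2 and, per (4), imb_1,2 = max{2, 3} - min{2, 1} = 2; moving a
 * single task from CPU 1 to CPU 2 makes W_1 = W_2 = 2 and the imbalance 0.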
8984 *
8985 * SCHED DOMAINS
8986 *
8987 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
8988 * for all i,j solution, we create a tree of CPUs that follows the hardware
8989 * topology where each level pairs two lower groups (or better). This results
8990 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
8991 * tree to only the first of the previous level and we decrease the frequency
8992 * of load-balance at each level in inverse proportion to the number of CPUs in
8993 * the groups.
8994 *
8995 * This yields:
8996 *
8997 *   \Sum_{i = 0..log_2 n} { (1/2^i) * (n/2^i) * 2^i } = O(n)         (5)
8998 *
8999 *   where, at level i:
9000 *     1/2^i  - the frequency of load-balance at that level,
9001 *     n/2^i  - the number of CPUs doing load-balance at that level,
9002 *     2^i    - the size of each group,
9003 *   and the sum runs over all levels of the domain tree.
9004 *
9005 * Coupled with a limit on how many tasks we can migrate every balance pass,
9006 * this makes (5) the runtime complexity of the balancer.
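 *
 * For instance, with n = 8 the per-level terms of (5) are n/2^i = 8, 4, 2
 * and 1 for i = 0..3, summing to 15 < 2n, which is where the O(n) bound
 * comes from.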
9007 *
9008 * An important property here is that each CPU is still (indirectly) connected
9009 * to every other CPU in at most O(log n) steps:
9010 *
9011 * The adjacency matrix of the resulting graph is given by:
9012 *
9013 *
9014 *   A_i,j = \Union_{k = 0..log_2 n} (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
9015 *
9016 *
9017 * And you'll find that:
9018 *
9019 * A^(log_2 n)_i,j != 0 for all i,j (7)
9020 *
9021 * Showing there's indeed a path between every CPU in at most O(log n) steps.
9022 * The task movement gives a factor of O(m), giving a convergence complexity
9023 * of:
9024 *
9025 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
9026 *
9027 *
9028 * WORK CONSERVING
9029 *
9030 * In order to avoid CPUs going idle while there's still work to do, new idle
9031 * balancing is more aggressive and has the newly idle CPU iterate up the domain
9032 * tree itself instead of relying on other CPUs to bring it work.
9033 *
9034 * This adds some complexity to both (5) and (8) but it reduces the total idle
9035 * time.
9036 *
9037 * [XXX more?]
9038 *
9039 *
9040 * CGROUPS
9041 *
9042 * Cgroups make a horror show out of (2), instead of a simple sum we get:
9043 *
9044 *
9045 *   W_i,0 = \Sum_j \Prod_k w_k * s_k,i / S_k                          (9)
9046 *
9047 *
9048 * Where
9049 *
9050 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
9051 *
9052 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
9053 *
9054 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
9055 * property.
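 *
 * A worked example of (9)/(10), with illustrative numbers: a single cgroup
 * of weight 1024 has three runnable nice-0 tasks, two on CPU 0 and one on
 * CPU 1, so s_k,0 = 2048, s_k,1 = 1024 and S_k = 3072. The cgroup then
 * contributes 1024 * 2048/3072 ~= 683 to W_0,0 and 1024 * 1024/3072 ~= 341
 * to W_1,0; the group's weight is split across CPUs in proportion to where
 * its runnable tasks are.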
9056 *
9057 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
9058 * rewrite all of this once again.]
9059 */
9060
9061 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
9062
9063 enum fbq_type { regular, remote, all };
9064
9065 /*
9066 * 'group_type' describes the group of CPUs at the moment of load balancing.
9067 *
9068 * The enum is ordered by pulling priority, with the group with lowest priority
9069 * first so the group_type can simply be compared when selecting the busiest
9070 * group. See update_sd_pick_busiest().
9071 */
9072 enum group_type {
9073 /* The group has spare capacity that can be used to run more tasks. */
9074 group_has_spare = 0,
9075 /*
9076 * The group is fully used and the tasks don't compete for more CPU
9077 * cycles. Nevertheless, some tasks might wait before running.
9078 */
9079 group_fully_busy,
9080 /*
9081 * One task doesn't fit with CPU's capacity and must be migrated to a
9082 * more powerful CPU.
9083 */
9084 group_misfit_task,
9085 /*
9086 * Balance SMT group that's fully busy. Can benefit from migrating
9087 * a task on an SMT CPU with a busy sibling to another CPU on an idle core.
9088 */
9089 group_smt_balance,
9090 /*
9091 * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
9092 * and the task should be migrated to it instead of running on the
9093 * current CPU.
9094 */
9095 group_asym_packing,
9096 /*
9097 * The tasks' affinity constraints previously prevented the scheduler
9098 * from balancing the load across the system.
9099 */
9100 group_imbalanced,
9101 /*
9102 * The CPU is overloaded and can't provide expected CPU cycles to all
9103 * tasks.
9104 */
9105 group_overloaded
9106 };
9107
9108 enum migration_type {
9109 migrate_load = 0,
9110 migrate_util,
9111 migrate_task,
9112 migrate_misfit
9113 };
9114
9115 #define LBF_ALL_PINNED 0x01
9116 #define LBF_NEED_BREAK 0x02
9117 #define LBF_DST_PINNED 0x04
9118 #define LBF_SOME_PINNED 0x08
9119 #define LBF_ACTIVE_LB 0x10
9120
9121 struct lb_env {
9122 struct sched_domain *sd;
9123
9124 struct rq *src_rq;
9125 int src_cpu;
9126
9127 int dst_cpu;
9128 struct rq *dst_rq;
9129
9130 struct cpumask *dst_grpmask;
9131 int new_dst_cpu;
9132 enum cpu_idle_type idle;
9133 long imbalance;
9134 /* The set of CPUs under consideration for load-balancing */
9135 struct cpumask *cpus;
9136
9137 unsigned int flags;
9138
9139 unsigned int loop;
9140 unsigned int loop_break;
9141 unsigned int loop_max;
9142
9143 enum fbq_type fbq_type;
9144 enum migration_type migration_type;
9145 struct list_head tasks;
9146 };
9147
9148 /*
9149 * Is this task likely cache-hot:
9150 */
9151 static int task_hot(struct task_struct *p, struct lb_env *env)
9152 {
9153 s64 delta;
9154
9155 lockdep_assert_rq_held(env->src_rq);
9156
9157 if (p->sched_class != &fair_sched_class)
9158 return 0;
9159
9160 if (unlikely(task_has_idle_policy(p)))
9161 return 0;
9162
9163 /* SMT siblings share cache */
9164 if (env->sd->flags & SD_SHARE_CPUCAPACITY)
9165 return 0;
9166
9167 /*
9168 * Buddy candidates are cache hot:
9169 */
9170 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
9171 (&p->se == cfs_rq_of(&p->se)->next))
9172 return 1;
9173
9174 if (sysctl_sched_migration_cost == -1)
9175 return 1;
9176
9177 /*
9178 * Don't migrate task if the task's cookie does not match
9179 * with the destination CPU's core cookie.
9180 */
9181 if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
9182 return 1;
9183
9184 if (sysctl_sched_migration_cost == 0)
9185 return 0;
9186
9187 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
9188
9189 return delta < (s64)sysctl_sched_migration_cost;
9190 }
9191
9192 #ifdef CONFIG_NUMA_BALANCING
9193 /*
9194 * Returns a positive value if task migration degrades locality.
9195 * Returns 0 if task migration is not affected by locality.
9196 * Returns a negative value if task migration improves locality, i.e. migration is preferred.
9197 */
9198 static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
9199 {
9200 struct numa_group *numa_group = rcu_dereference(p->numa_group);
9201 unsigned long src_weight, dst_weight;
9202 int src_nid, dst_nid, dist;
9203
9204 if (!static_branch_likely(&sched_numa_balancing))
9205 return 0;
9206
9207 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
9208 return 0;
9209
9210 src_nid = cpu_to_node(env->src_cpu);
9211 dst_nid = cpu_to_node(env->dst_cpu);
9212
9213 if (src_nid == dst_nid)
9214 return 0;
9215
9216 /* Migrating away from the preferred node is always bad. */
9217 if (src_nid == p->numa_preferred_nid) {
9218 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
9219 return 1;
9220 else
9221 return 0;
9222 }
9223
9224 /* Encourage migration to the preferred node. */
9225 if (dst_nid == p->numa_preferred_nid)
9226 return -1;
9227
9228 /* Leaving a core idle is often worse than degrading locality. */
9229 if (env->idle == CPU_IDLE)
9230 return 0;
9231
9232 dist = node_distance(src_nid, dst_nid);
9233 if (numa_group) {
9234 src_weight = group_weight(p, src_nid, dist);
9235 dst_weight = group_weight(p, dst_nid, dist);
9236 } else {
9237 src_weight = task_weight(p, src_nid, dist);
9238 dst_weight = task_weight(p, dst_nid, dist);
9239 }
9240
9241 return src_weight - dst_weight;
9242 }
9243
9244 #else /* !CONFIG_NUMA_BALANCING: */
9245 static inline long migrate_degrades_locality(struct task_struct *p,
9246 struct lb_env *env)
9247 {
9248 return 0;
9249 }
9250 #endif /* !CONFIG_NUMA_BALANCING */
9251
9252 /*
9253 * Check whether the task is ineligible on the destination cpu
9254 *
9255 * When the PLACE_LAG scheduling feature is enabled and
9256 * dst_cfs_rq->nr_queued is greater than 1, if the task
9257 * is ineligible, it will also be ineligible when
9258 * it is migrated to the destination cpu.
9259 */
9260 static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_cpu)
9261 {
9262 struct cfs_rq *dst_cfs_rq;
9263
9264 #ifdef CONFIG_FAIR_GROUP_SCHED
9265 dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu];
9266 #else
9267 dst_cfs_rq = &cpu_rq(dest_cpu)->cfs;
9268 #endif
9269 if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued &&
9270 !entity_eligible(task_cfs_rq(p), &p->se))
9271 return 1;
9272
9273 return 0;
9274 }
9275
9276 /*
9277 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
9278 */
9279 static
9280 int can_migrate_task(struct task_struct *p, struct lb_env *env)
9281 {
9282 long degrades, hot;
9283
9284 lockdep_assert_rq_held(env->src_rq);
9285 if (p->sched_task_hot)
9286 p->sched_task_hot = 0;
9287
9288 /*
9289 * We do not migrate tasks that are:
9290 * 1) delayed dequeued unless we migrate load, or
9291 * 2) throttled_lb_pair, or
9292 * 3) cannot be migrated to this CPU due to cpus_ptr, or
9293 * 4) running (obviously), or
9294 * 5) are cache-hot on their current CPU, or
9295 * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
9296 */
9297 if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
9298 return 0;
9299
9300 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
9301 return 0;
9302
9303 /*
9304 * We want to prioritize the migration of eligible tasks.
9305 * For ineligible tasks we soft-limit them and only allow
9306 * them to migrate when nr_balance_failed is non-zero to
9307 * avoid load-balancing trying very hard to balance the load.
9308 */
9309 if (!env->sd->nr_balance_failed &&
9310 task_is_ineligible_on_dst_cpu(p, env->dst_cpu))
9311 return 0;
9312
9313 /* Disregard percpu kthreads; they are where they need to be. */
9314 if (kthread_is_per_cpu(p))
9315 return 0;
9316
9317 if (task_is_blocked(p))
9318 return 0;
9319
9320 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
9321 int cpu;
9322
9323 schedstat_inc(p->stats.nr_failed_migrations_affine);
9324
9325 env->flags |= LBF_SOME_PINNED;
9326
9327 /*
9328 * Remember if this task can be migrated to any other CPU in
9329 * our sched_group. We may want to revisit it if we couldn't
9330 * meet load balance goals by pulling other tasks on src_cpu.
9331 *
9332 * Avoid computing new_dst_cpu
9333 * - for NEWLY_IDLE
9334 * - if we have already computed one in current iteration
9335 * - if it's an active balance
9336 */
9337 if (env->idle == CPU_NEWLY_IDLE ||
9338 env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
9339 return 0;
9340
9341 /* Prevent re-selecting dst_cpu via env's CPUs: */
9342 cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr);
9343
9344 if (cpu < nr_cpu_ids) {
9345 env->flags |= LBF_DST_PINNED;
9346 env->new_dst_cpu = cpu;
9347 }
9348
9349 return 0;
9350 }
9351
9352 /* Record that we found at least one task that could run on dst_cpu */
9353 env->flags &= ~LBF_ALL_PINNED;
9354
9355 if (task_on_cpu(env->src_rq, p) ||
9356 task_current_donor(env->src_rq, p)) {
9357 schedstat_inc(p->stats.nr_failed_migrations_running);
9358 return 0;
9359 }
9360
9361 /*
9362 * Aggressive migration if:
9363 * 1) active balance
9364 * 2) destination numa is preferred
9365 * 3) task is cache cold, or
9366 * 4) too many balance attempts have failed.
9367 */
9368 if (env->flags & LBF_ACTIVE_LB)
9369 return 1;
9370
9371 degrades = migrate_degrades_locality(p, env);
9372 if (!degrades)
9373 hot = task_hot(p, env);
9374 else
9375 hot = degrades > 0;
9376
9377 if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
9378 if (hot)
9379 p->sched_task_hot = 1;
9380 return 1;
9381 }
9382
9383 schedstat_inc(p->stats.nr_failed_migrations_hot);
9384 return 0;
9385 }
9386
9387 /*
9388 * detach_task() -- detach the task for the migration specified in env
9389 */
9390 static void detach_task(struct task_struct *p, struct lb_env *env)
9391 {
9392 lockdep_assert_rq_held(env->src_rq);
9393
9394 if (p->sched_task_hot) {
9395 p->sched_task_hot = 0;
9396 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
9397 schedstat_inc(p->stats.nr_forced_migrations);
9398 }
9399
9400 WARN_ON(task_current(env->src_rq, p));
9401 WARN_ON(task_current_donor(env->src_rq, p));
9402
9403 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
9404 set_task_cpu(p, env->dst_cpu);
9405 }
9406
9407 /*
9408 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
9409 * part of active balancing operations within "domain".
9410 *
9411 * Returns a task if successful and NULL otherwise.
9412 */
9413 static struct task_struct *detach_one_task(struct lb_env *env)
9414 {
9415 struct task_struct *p;
9416
9417 lockdep_assert_rq_held(env->src_rq);
9418
9419 list_for_each_entry_reverse(p,
9420 &env->src_rq->cfs_tasks, se.group_node) {
9421 if (!can_migrate_task(p, env))
9422 continue;
9423
9424 detach_task(p, env);
9425
9426 /*
9427 * Right now, this is only the second place where
9428 * lb_gained[env->idle] is updated (other is detach_tasks)
9429 * so we can safely collect stats here rather than
9430 * inside detach_tasks().
9431 */
9432 schedstat_inc(env->sd->lb_gained[env->idle]);
9433 return p;
9434 }
9435 return NULL;
9436 }
9437
9438 /*
9439 * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
9440 * busiest_rq, as part of a balancing operation within domain "sd".
9441 *
9442 * Returns number of detached tasks if successful and 0 otherwise.
9443 */
9444 static int detach_tasks(struct lb_env *env)
9445 {
9446 struct list_head *tasks = &env->src_rq->cfs_tasks;
9447 unsigned long util, load;
9448 struct task_struct *p;
9449 int detached = 0;
9450
9451 lockdep_assert_rq_held(env->src_rq);
9452
9453 /*
9454 * Source run queue has been emptied by another CPU, clear
9455 * LBF_ALL_PINNED flag as we will not test any task.
9456 */
9457 if (env->src_rq->nr_running <= 1) {
9458 env->flags &= ~LBF_ALL_PINNED;
9459 return 0;
9460 }
9461
9462 if (env->imbalance <= 0)
9463 return 0;
9464
9465 while (!list_empty(tasks)) {
9466 /*
9467 * We don't want to steal all, otherwise we may be treated likewise,
9468 * which could at worst lead to a livelock crash.
9469 */
9470 if (env->idle && env->src_rq->nr_running <= 1)
9471 break;
9472
9473 env->loop++;
9474 /* We've more or less seen every task there is, call it quits */
9475 if (env->loop > env->loop_max)
9476 break;
9477
9478 /* take a breather every nr_migrate tasks */
9479 if (env->loop > env->loop_break) {
9480 env->loop_break += SCHED_NR_MIGRATE_BREAK;
9481 env->flags |= LBF_NEED_BREAK;
9482 break;
9483 }
9484
9485 p = list_last_entry(tasks, struct task_struct, se.group_node);
9486
9487 if (!can_migrate_task(p, env))
9488 goto next;
9489
9490 switch (env->migration_type) {
9491 case migrate_load:
9492 /*
9493 * Depending on the number of CPUs and tasks and the
9494 * cgroup hierarchy, task_h_load() can return a null
9495 * value. Make sure that env->imbalance decreases
9496 * otherwise detach_tasks() will stop only after
9497 * detaching up to loop_max tasks.
9498 */
9499 load = max_t(unsigned long, task_h_load(p), 1);
9500
9501 if (sched_feat(LB_MIN) &&
9502 load < 16 && !env->sd->nr_balance_failed)
9503 goto next;
9504
9505 /*
9506 * Make sure that we don't migrate too much load.
9507 * Nevertheless, let's relax the constraint if the
9508 * scheduler fails to find a good waiting task to
9509 * migrate.
9510 */
9511 if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
9512 goto next;
9513
9514 env->imbalance -= load;
9515 break;
9516
9517 case migrate_util:
9518 util = task_util_est(p);
9519
9520 if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance)
9521 goto next;
9522
9523 env->imbalance -= util;
9524 break;
9525
9526 case migrate_task:
9527 env->imbalance--;
9528 break;
9529
9530 case migrate_misfit:
9531 /* This is not a misfit task */
9532 if (task_fits_cpu(p, env->src_cpu))
9533 goto next;
9534
9535 env->imbalance = 0;
9536 break;
9537 }
9538
9539 detach_task(p, env);
9540 list_add(&p->se.group_node, &env->tasks);
9541
9542 detached++;
9543
9544 #ifdef CONFIG_PREEMPTION
9545 /*
9546 * NEWIDLE balancing is a source of latency, so preemptible
9547 * kernels will stop after the first task is detached to minimize
9548 * the critical section.
9549 */
9550 if (env->idle == CPU_NEWLY_IDLE)
9551 break;
9552 #endif
9553
9554 /*
9555 * We only want to steal up to the prescribed amount of
9556 * load/util/tasks.
9557 */
9558 if (env->imbalance <= 0)
9559 break;
9560
9561 continue;
9562 next:
9563 if (p->sched_task_hot)
9564 schedstat_inc(p->stats.nr_failed_migrations_hot);
9565
9566 list_move(&p->se.group_node, tasks);
9567 }
9568
9569 /*
9570 * Right now, this is one of only two places we collect this stat
9571 * so we can safely collect detach_one_task() stats here rather
9572 * than inside detach_one_task().
9573 */
9574 schedstat_add(env->sd->lb_gained[env->idle], detached);
9575
9576 return detached;
9577 }
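
/*
 * Worked example of the migrate_load case above (illustrative numbers):
 * with env->imbalance = 300 and a task whose task_h_load() is 400, the
 * task is skipped on the first attempt (400 > 300); after one failed
 * balance round, shr_bound(400, 1) = 200 <= 300, so the task is detached
 * and env->imbalance (reduced by the full 400) goes negative, ending the
 * pass.
 */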
9578
9579 /*
9580 * attach_task() -- attach the task detached by detach_task() to its new rq.
9581 */
9582 static void attach_task(struct rq *rq, struct task_struct *p)
9583 {
9584 lockdep_assert_rq_held(rq);
9585
9586 WARN_ON_ONCE(task_rq(p) != rq);
9587 activate_task(rq, p, ENQUEUE_NOCLOCK);
9588 wakeup_preempt(rq, p, 0);
9589 }
9590
9591 /*
9592 * attach_one_task() -- attaches the task returned from detach_one_task() to
9593 * its new rq.
9594 */
9595 static void attach_one_task(struct rq *rq, struct task_struct *p)
9596 {
9597 struct rq_flags rf;
9598
9599 rq_lock(rq, &rf);
9600 update_rq_clock(rq);
9601 attach_task(rq, p);
9602 rq_unlock(rq, &rf);
9603 }
9604
9605 /*
9606 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
9607 * new rq.
9608 */
9609 static void attach_tasks(struct lb_env *env)
9610 {
9611 struct list_head *tasks = &env->tasks;
9612 struct task_struct *p;
9613 struct rq_flags rf;
9614
9615 rq_lock(env->dst_rq, &rf);
9616 update_rq_clock(env->dst_rq);
9617
9618 while (!list_empty(tasks)) {
9619 p = list_first_entry(tasks, struct task_struct, se.group_node);
9620 list_del_init(&p->se.group_node);
9621
9622 attach_task(env->dst_rq, p);
9623 }
9624
9625 rq_unlock(env->dst_rq, &rf);
9626 }
9627
9628 #ifdef CONFIG_NO_HZ_COMMON
9629 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
9630 {
9631 if (cfs_rq->avg.load_avg)
9632 return true;
9633
9634 if (cfs_rq->avg.util_avg)
9635 return true;
9636
9637 return false;
9638 }
9639
9640 static inline bool others_have_blocked(struct rq *rq)
9641 {
9642 if (cpu_util_rt(rq))
9643 return true;
9644
9645 if (cpu_util_dl(rq))
9646 return true;
9647
9648 if (hw_load_avg(rq))
9649 return true;
9650
9651 if (cpu_util_irq(rq))
9652 return true;
9653
9654 return false;
9655 }
9656
9657 static inline void update_blocked_load_tick(struct rq *rq)
9658 {
9659 WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
9660 }
9661
9662 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
9663 {
9664 if (!has_blocked)
9665 rq->has_blocked_load = 0;
9666 }
9667 #else /* !CONFIG_NO_HZ_COMMON: */
9668 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
9669 static inline bool others_have_blocked(struct rq *rq) { return false; }
9670 static inline void update_blocked_load_tick(struct rq *rq) {}
9671 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
9672 #endif /* !CONFIG_NO_HZ_COMMON */
9673
9674 static bool __update_blocked_others(struct rq *rq, bool *done)
9675 {
9676 bool updated;
9677
9678 /*
9679 * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
9680 * DL and IRQ signals have been updated before updating CFS.
9681 */
9682 updated = update_other_load_avgs(rq);
9683
9684 if (others_have_blocked(rq))
9685 *done = false;
9686
9687 return updated;
9688 }
9689
9690 #ifdef CONFIG_FAIR_GROUP_SCHED
9691
9692 static bool __update_blocked_fair(struct rq *rq, bool *done)
9693 {
9694 struct cfs_rq *cfs_rq, *pos;
9695 bool decayed = false;
9696 int cpu = cpu_of(rq);
9697
9698 /*
9699 * Iterates the task_group tree in a bottom up fashion, see
9700 * list_add_leaf_cfs_rq() for details.
9701 */
9702 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
9703 struct sched_entity *se;
9704
9705 if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
9706 update_tg_load_avg(cfs_rq);
9707
9708 if (cfs_rq->nr_queued == 0)
9709 update_idle_cfs_rq_clock_pelt(cfs_rq);
9710
9711 if (cfs_rq == &rq->cfs)
9712 decayed = true;
9713 }
9714
9715 /* Propagate pending load changes to the parent, if any: */
9716 se = cfs_rq->tg->se[cpu];
9717 if (se && !skip_blocked_update(se))
9718 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
9719
9720 /*
9721 * There can be a lot of idle CPU cgroups. Don't let fully
9722 * decayed cfs_rqs linger on the list.
9723 */
9724 if (cfs_rq_is_decayed(cfs_rq))
9725 list_del_leaf_cfs_rq(cfs_rq);
9726
9727 /* Don't need periodic decay once load/util_avg are null */
9728 if (cfs_rq_has_blocked(cfs_rq))
9729 *done = false;
9730 }
9731
9732 return decayed;
9733 }
9734
9735 /*
9736 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
9737 * This needs to be done in a top-down fashion because the load of a child
9738 * group is a fraction of its parents load.
9739 */
9740 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
9741 {
9742 struct rq *rq = rq_of(cfs_rq);
9743 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
9744 unsigned long now = jiffies;
9745 unsigned long load;
9746
9747 if (cfs_rq->last_h_load_update == now)
9748 return;
9749
9750 WRITE_ONCE(cfs_rq->h_load_next, NULL);
9751 for_each_sched_entity(se) {
9752 cfs_rq = cfs_rq_of(se);
9753 WRITE_ONCE(cfs_rq->h_load_next, se);
9754 if (cfs_rq->last_h_load_update == now)
9755 break;
9756 }
9757
9758 if (!se) {
9759 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
9760 cfs_rq->last_h_load_update = now;
9761 }
9762
9763 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
9764 load = cfs_rq->h_load;
9765 load = div64_ul(load * se->avg.load_avg,
9766 cfs_rq_load_avg(cfs_rq) + 1);
9767 cfs_rq = group_cfs_rq(se);
9768 cfs_rq->h_load = load;
9769 cfs_rq->last_h_load_update = now;
9770 }
9771 }
9772
9773 static unsigned long task_h_load(struct task_struct *p)
9774 {
9775 struct cfs_rq *cfs_rq = task_cfs_rq(p);
9776
9777 update_cfs_rq_h_load(cfs_rq);
9778 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
9779 cfs_rq_load_avg(cfs_rq) + 1);
9780 }
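
/*
 * Worked example (illustrative numbers): if a task's se.avg.load_avg is
 * 300, its group cfs_rq has h_load = 512 and cfs_rq_load_avg() = 600, the
 * task's hierarchical load is 300 * 512 / 601 ~= 255, i.e. the task's own
 * load scaled by the share of rq load its cgroup hierarchy represents.
 */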
9781 #else /* !CONFIG_FAIR_GROUP_SCHED: */
9782 static bool __update_blocked_fair(struct rq *rq, bool *done)
9783 {
9784 struct cfs_rq *cfs_rq = &rq->cfs;
9785 bool decayed;
9786
9787 decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
9788 if (cfs_rq_has_blocked(cfs_rq))
9789 *done = false;
9790
9791 return decayed;
9792 }
9793
9794 static unsigned long task_h_load(struct task_struct *p)
9795 {
9796 return p->se.avg.load_avg;
9797 }
9798 #endif /* !CONFIG_FAIR_GROUP_SCHED */
9799
9800 static void sched_balance_update_blocked_averages(int cpu)
9801 {
9802 bool decayed = false, done = true;
9803 struct rq *rq = cpu_rq(cpu);
9804 struct rq_flags rf;
9805
9806 rq_lock_irqsave(rq, &rf);
9807 update_blocked_load_tick(rq);
9808 update_rq_clock(rq);
9809
9810 decayed |= __update_blocked_others(rq, &done);
9811 decayed |= __update_blocked_fair(rq, &done);
9812
9813 update_blocked_load_status(rq, !done);
9814 if (decayed)
9815 cpufreq_update_util(rq, 0);
9816 rq_unlock_irqrestore(rq, &rf);
9817 }
9818
9819 /********** Helpers for sched_balance_find_src_group ************************/
9820
9821 /*
9822 * sg_lb_stats - stats of a sched_group required for load-balancing:
9823 */
9824 struct sg_lb_stats {
9825 unsigned long avg_load; /* Avg load over the CPUs of the group */
9826 unsigned long group_load; /* Total load over the CPUs of the group */
9827 unsigned long group_capacity; /* Capacity over the CPUs of the group */
9828 unsigned long group_util; /* Total utilization over the CPUs of the group */
9829 unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
9830 unsigned int sum_nr_running; /* Nr of all tasks running in the group */
9831 unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
9832 unsigned int idle_cpus; /* Nr of idle CPUs in the group */
9833 unsigned int group_weight;
9834 enum group_type group_type;
9835 unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
9836 unsigned int group_smt_balance; /* Task on busy SMT be moved */
9837 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
9838 #ifdef CONFIG_NUMA_BALANCING
9839 unsigned int nr_numa_running;
9840 unsigned int nr_preferred_running;
9841 #endif
9842 };
9843
9844 /*
9845 * sd_lb_stats - stats of a sched_domain required for load-balancing:
9846 */
9847 struct sd_lb_stats {
9848 struct sched_group *busiest; /* Busiest group in this sd */
9849 struct sched_group *local; /* Local group in this sd */
9850 unsigned long total_load; /* Total load of all groups in sd */
9851 unsigned long total_capacity; /* Total capacity of all groups in sd */
9852 unsigned long avg_load; /* Average load across all groups in sd */
9853 unsigned int prefer_sibling; /* Tasks should go to sibling first */
9854
9855 struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */
9856 struct sg_lb_stats local_stat; /* Statistics of the local group */
9857 };
9858
9859 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
9860 {
9861 /*
9862 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
9863 * local_stat because update_sg_lb_stats() does a full clear/assignment.
9864 * We must however set busiest_stat::group_type and
9865 * busiest_stat::idle_cpus to the worst busiest group because
9866 * update_sd_pick_busiest() reads these before assignment.
9867 */
9868 *sds = (struct sd_lb_stats){
9869 .busiest = NULL,
9870 .local = NULL,
9871 .total_load = 0UL,
9872 .total_capacity = 0UL,
9873 .busiest_stat = {
9874 .idle_cpus = UINT_MAX,
9875 .group_type = group_has_spare,
9876 },
9877 };
9878 }
9879
9880 static unsigned long scale_rt_capacity(int cpu)
9881 {
9882 unsigned long max = get_actual_cpu_capacity(cpu);
9883 struct rq *rq = cpu_rq(cpu);
9884 unsigned long used, free;
9885 unsigned long irq;
9886
9887 irq = cpu_util_irq(rq);
9888
9889 if (unlikely(irq >= max))
9890 return 1;
9891
9892 /*
9893 * avg_rt.util_avg and avg_dl.util_avg track binary signals
9894 * (running and not running) with weights 0 and 1024 respectively.
9895 */
9896 used = cpu_util_rt(rq);
9897 used += cpu_util_dl(rq);
9898
9899 if (unlikely(used >= max))
9900 return 1;
9901
9902 free = max - used;
9903
9904 return scale_irq_capacity(free, irq, max);
9905 }
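
/*
 * Worked example (illustrative numbers, assuming IRQ time accounting is
 * enabled): with an actual capacity of 1024, RT utilization of 128, no DL
 * utilization and IRQ utilization of 64, free = 1024 - 128 = 896 and the
 * remaining CFS capacity is scaled to roughly 896 * (1024 - 64) / 1024 = 840.
 */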
9906
9907 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
9908 {
9909 unsigned long capacity = scale_rt_capacity(cpu);
9910 struct sched_group *sdg = sd->groups;
9911
9912 if (!capacity)
9913 capacity = 1;
9914
9915 cpu_rq(cpu)->cpu_capacity = capacity;
9916 trace_sched_cpu_capacity_tp(cpu_rq(cpu));
9917
9918 sdg->sgc->capacity = capacity;
9919 sdg->sgc->min_capacity = capacity;
9920 sdg->sgc->max_capacity = capacity;
9921 }
9922
9923 void update_group_capacity(struct sched_domain *sd, int cpu)
9924 {
9925 struct sched_domain *child = sd->child;
9926 struct sched_group *group, *sdg = sd->groups;
9927 unsigned long capacity, min_capacity, max_capacity;
9928 unsigned long interval;
9929
9930 interval = msecs_to_jiffies(sd->balance_interval);
9931 interval = clamp(interval, 1UL, max_load_balance_interval);
9932 sdg->sgc->next_update = jiffies + interval;
9933
9934 if (!child) {
9935 update_cpu_capacity(sd, cpu);
9936 return;
9937 }
9938
9939 capacity = 0;
9940 min_capacity = ULONG_MAX;
9941 max_capacity = 0;
9942
9943 if (child->flags & SD_NUMA) {
9944 /*
9945 * SD_NUMA domains cannot assume that child groups
9946 * span the current group.
9947 */
9948
9949 for_each_cpu(cpu, sched_group_span(sdg)) {
9950 unsigned long cpu_cap = capacity_of(cpu);
9951
9952 capacity += cpu_cap;
9953 min_capacity = min(cpu_cap, min_capacity);
9954 max_capacity = max(cpu_cap, max_capacity);
9955 }
9956 } else {
9957 /*
9958 * !SD_NUMA domains can assume that child groups
9959 * span the current group.
9960 */
9961
9962 group = child->groups;
9963 do {
9964 struct sched_group_capacity *sgc = group->sgc;
9965
9966 capacity += sgc->capacity;
9967 min_capacity = min(sgc->min_capacity, min_capacity);
9968 max_capacity = max(sgc->max_capacity, max_capacity);
9969 group = group->next;
9970 } while (group != child->groups);
9971 }
9972
9973 sdg->sgc->capacity = capacity;
9974 sdg->sgc->min_capacity = min_capacity;
9975 sdg->sgc->max_capacity = max_capacity;
9976 }
9977
9978 /*
9979 * Check whether the capacity of the rq has been noticeably reduced by side
9980 * activity. The imbalance_pct is used for the threshold.
9981 * Return true if the capacity is reduced.
9982 */
9983 static inline int
9984 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
9985 {
9986 return ((rq->cpu_capacity * sd->imbalance_pct) <
9987 (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
9988 }
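
/*
 * Worked example (illustrative numbers): with a typical imbalance_pct of
 * 117 and an original CPU capacity of 1024, this reports reduced capacity
 * once rq->cpu_capacity drops below 1024 * 100 / 117 ~= 875, i.e. once
 * roughly 15% of the CPU is eaten by RT/DL/IRQ pressure.
 */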
9989
9990 /* Check if the rq has a misfit task */
9991 static inline bool check_misfit_status(struct rq *rq)
9992 {
9993 return rq->misfit_task_load;
9994 }
9995
9996 /*
9997 * Group imbalance indicates (and tries to solve) the problem where balancing
9998 * groups is inadequate due to ->cpus_ptr constraints.
9999 *
10000 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
10001 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
10002 * Something like:
10003 *
10004 * { 0 1 2 3 } { 4 5 6 7 }
10005 *         *       * * *
10006 *
10007 * If we were to balance group-wise we'd place two tasks in the first group and
10008 * two tasks in the second group. Clearly this is undesired as it will overload
10009 * cpu 3 and leave one of the CPUs in the second group unused.
10010 *
10011 * The current solution to this issue is detecting the skew in the first group
10012 * by noticing the lower domain failed to reach balance and had difficulty
10013 * moving tasks due to affinity constraints.
10014 *
10015 * When this is so detected, this group becomes a candidate for busiest; see
10016 * update_sd_pick_busiest(). And calculate_imbalance() and
10017 * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it
10018 * to create an effective group imbalance.
10019 *
10020 * This is a somewhat tricky proposition since the next run might not find the
10021 * group imbalance and decide the groups need to be balanced again. A most
10022 * subtle and fragile situation.
10023 */
10024
10025 static inline int sg_imbalanced(struct sched_group *group)
10026 {
10027 return group->sgc->imbalance;
10028 }
10029
10030 /*
10031 * group_has_capacity returns true if the group has spare capacity that could
10032 * be used by some tasks.
10033 * We consider that a group has spare capacity if the number of tasks is
10034 * smaller than the number of CPUs or if the utilization is lower than the
10035 * available capacity for CFS tasks.
10036 * For the latter, we use a threshold to stabilize the state, to take into
10037 * account the variance of the tasks' load and to return true if the available
10038 * capacity is meaningful for the load balancer.
10039 * As an example, an available capacity of 1% can appear but it doesn't bring
10040 * any benefit to the load balancer.
10041 */
10042 static inline bool
10043 group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
10044 {
10045 if (sgs->sum_nr_running < sgs->group_weight)
10046 return true;
10047
10048 if ((sgs->group_capacity * imbalance_pct) <
10049 (sgs->group_runnable * 100))
10050 return false;
10051
10052 if ((sgs->group_capacity * 100) >
10053 (sgs->group_util * imbalance_pct))
10054 return true;
10055
10056 return false;
10057 }
10058
10059 /*
10060 * group_is_overloaded returns true if the group has more tasks than it can
10061 * handle.
10062 * group_is_overloaded is not equal to !group_has_capacity because a group
10063 * with exactly the right number of tasks has no spare capacity left but is not
10064 * overloaded, so both group_has_capacity and group_is_overloaded return
10065 * false.
10066 */
10067 static inline bool
10068 group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
10069 {
10070 if (sgs->sum_nr_running <= sgs->group_weight)
10071 return false;
10072
10073 if ((sgs->group_capacity * 100) <
10074 (sgs->group_util * imbalance_pct))
10075 return true;
10076
10077 if ((sgs->group_capacity * imbalance_pct) <
10078 (sgs->group_runnable * 100))
10079 return true;
10080
10081 return false;
10082 }
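
/*
 * Worked example (illustrative numbers, ignoring the group_runnable
 * checks): for a 4-CPU group with group_capacity = 4096 and a typical
 * imbalance_pct of 117, the utilization cut-off is 4096 * 100 / 117 ~= 3501:
 * below it (or with fewer than 4 tasks) the group still has capacity, above
 * it (and with more than 4 tasks) it is overloaded.
 */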
10083
10084 static inline enum
10085 group_type group_classify(unsigned int imbalance_pct,
10086 struct sched_group *group,
10087 struct sg_lb_stats *sgs)
10088 {
10089 if (group_is_overloaded(imbalance_pct, sgs))
10090 return group_overloaded;
10091
10092 if (sg_imbalanced(group))
10093 return group_imbalanced;
10094
10095 if (sgs->group_asym_packing)
10096 return group_asym_packing;
10097
10098 if (sgs->group_smt_balance)
10099 return group_smt_balance;
10100
10101 if (sgs->group_misfit_task_load)
10102 return group_misfit_task;
10103
10104 if (!group_has_capacity(imbalance_pct, sgs))
10105 return group_fully_busy;
10106
10107 return group_has_spare;
10108 }
10109
10110 /**
10111 * sched_use_asym_prio - Check whether asym_packing priority must be used
10112 * @sd: The scheduling domain of the load balancing
10113 * @cpu: A CPU
10114 *
10115 * Always use CPU priority when balancing load between SMT siblings. When
10116 * balancing load between cores, it is not sufficient that @cpu is idle. Only
10117 * use CPU priority if the whole core is idle.
10118 *
10119 * Returns: True if the priority of @cpu must be followed. False otherwise.
10120 */
10121 static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
10122 {
10123 if (!(sd->flags & SD_ASYM_PACKING))
10124 return false;
10125
10126 if (!sched_smt_active())
10127 return true;
10128
10129 return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
10130 }
10131
10132 static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu)
10133 {
10134 /*
10135 * First check if @dst_cpu can do asym_packing load balance. Only do it
10136 * if it has higher priority than @src_cpu.
10137 */
10138 return sched_use_asym_prio(sd, dst_cpu) &&
10139 sched_asym_prefer(dst_cpu, src_cpu);
10140 }
10141
10142 /**
10143 * sched_group_asym - Check if the destination CPU can do asym_packing balance
10144 * @env: The load balancing environment
10145 * @sgs: Load-balancing statistics of the candidate busiest group
10146 * @group: The candidate busiest group
10147 *
10148 * @env::dst_cpu can do asym_packing if it has higher priority than the
10149 * preferred CPU of @group.
10150 *
10151 * Return: true if @env::dst_cpu can do asym_packing load balance. False
10152 * otherwise.
10153 */
10154 static inline bool
10155 sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group)
10156 {
10157 /*
10158 * CPU priorities do not make sense for SMT cores with more than one
10159 * busy sibling.
10160 */
10161 if ((group->flags & SD_SHARE_CPUCAPACITY) &&
10162 (sgs->group_weight - sgs->idle_cpus != 1))
10163 return false;
10164
10165 return sched_asym(env->sd, env->dst_cpu, READ_ONCE(group->asym_prefer_cpu));
10166 }
10167
10168 /* One group has more than one SMT CPU while the other group does not */
10169 static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
10170 struct sched_group *sg2)
10171 {
10172 if (!sg1 || !sg2)
10173 return false;
10174
10175 return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
10176 (sg2->flags & SD_SHARE_CPUCAPACITY);
10177 }
10178
10179 static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
10180 struct sched_group *group)
10181 {
10182 if (!env->idle)
10183 return false;
10184
10185 /*
10186 * For SMT source group, it is better to move a task
10187 * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
10188 * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
10189 * will not be on.
10190 */
10191 if (group->flags & SD_SHARE_CPUCAPACITY &&
10192 sgs->sum_h_nr_running > 1)
10193 return true;
10194
10195 return false;
10196 }
10197
10198 static inline long sibling_imbalance(struct lb_env *env,
10199 struct sd_lb_stats *sds,
10200 struct sg_lb_stats *busiest,
10201 struct sg_lb_stats *local)
10202 {
10203 int ncores_busiest, ncores_local;
10204 long imbalance;
10205
10206 if (!env->idle || !busiest->sum_nr_running)
10207 return 0;
10208
10209 ncores_busiest = sds->busiest->cores;
10210 ncores_local = sds->local->cores;
10211
10212 if (ncores_busiest == ncores_local) {
10213 imbalance = busiest->sum_nr_running;
10214 lsub_positive(&imbalance, local->sum_nr_running);
10215 return imbalance;
10216 }
10217
10218 /* Balance such that the nr_running/ncores ratio is the same on both groups */
10219 imbalance = ncores_local * busiest->sum_nr_running;
10220 lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
10221 /* Normalize imbalance and do rounding on normalization */
10222 imbalance = 2 * imbalance + ncores_local + ncores_busiest;
10223 imbalance /= ncores_local + ncores_busiest;
10224
10225 /* Take advantage of resources in an empty sched group */
10226 if (imbalance <= 1 && local->sum_nr_running == 0 &&
10227 busiest->sum_nr_running > 1)
10228 imbalance = 2;
10229
10230 return imbalance;
10231 }
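
/*
 * Worked example (illustrative numbers): with equal core counts the result
 * is simply the difference in task counts, e.g. 6 vs 2 tasks -> 4. With
 * ncores_busiest = 4, ncores_local = 2, 6 tasks on the busiest group and 1
 * on the local one, the raw figure is 2*6 - 4*1 = 8, which the normalization
 * and rounding step turns into (2*8 + 2 + 4) / (2 + 4) = 22/6 = 3 with
 * integer division.
 */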
10232
10233 static inline bool
10234 sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
10235 {
10236 /*
10237 * When there is more than 1 task, the group_overloaded case already
10238 * takes care of CPUs with reduced capacity
10239 */
10240 if (rq->cfs.h_nr_runnable != 1)
10241 return false;
10242
10243 return check_cpu_capacity(rq, sd);
10244 }
10245
10246 /**
10247 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
10248 * @env: The load balancing environment.
10249 * @sds: Load-balancing data with statistics of the local group.
10250 * @group: sched_group whose statistics are to be updated.
10251 * @sgs: variable to hold the statistics for this group.
10252 * @sg_overloaded: sched_group is overloaded
10253 * @sg_overutilized: sched_group is overutilized
10254 */
10255 static inline void update_sg_lb_stats(struct lb_env *env,
10256 struct sd_lb_stats *sds,
10257 struct sched_group *group,
10258 struct sg_lb_stats *sgs,
10259 bool *sg_overloaded,
10260 bool *sg_overutilized)
10261 {
10262 int i, nr_running, local_group, sd_flags = env->sd->flags;
10263 bool balancing_at_rd = !env->sd->parent;
10264
10265 memset(sgs, 0, sizeof(*sgs));
10266
10267 local_group = group == sds->local;
10268
10269 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
10270 struct rq *rq = cpu_rq(i);
10271 unsigned long load = cpu_load(rq);
10272
10273 sgs->group_load += load;
10274 sgs->group_util += cpu_util_cfs(i);
10275 sgs->group_runnable += cpu_runnable(rq);
10276 sgs->sum_h_nr_running += rq->cfs.h_nr_runnable;
10277
10278 nr_running = rq->nr_running;
10279 sgs->sum_nr_running += nr_running;
10280
10281 if (cpu_overutilized(i))
10282 *sg_overutilized = 1;
10283
10284 /*
10285 * No need to call idle_cpu() if nr_running is not 0
10286 */
10287 if (!nr_running && idle_cpu(i)) {
10288 sgs->idle_cpus++;
10289 /* Idle cpu can't have misfit task */
10290 continue;
10291 }
10292
10293 /* Overload indicator is only updated at root domain */
10294 if (balancing_at_rd && nr_running > 1)
10295 *sg_overloaded = 1;
10296
10297 #ifdef CONFIG_NUMA_BALANCING
10298 /* Only fbq_classify_group() uses this to classify NUMA groups */
10299 if (sd_flags & SD_NUMA) {
10300 sgs->nr_numa_running += rq->nr_numa_running;
10301 sgs->nr_preferred_running += rq->nr_preferred_running;
10302 }
10303 #endif
10304 if (local_group)
10305 continue;
10306
10307 if (sd_flags & SD_ASYM_CPUCAPACITY) {
10308 /* Check for a misfit task on the cpu */
10309 if (sgs->group_misfit_task_load < rq->misfit_task_load) {
10310 sgs->group_misfit_task_load = rq->misfit_task_load;
10311 *sg_overloaded = 1;
10312 }
10313 } else if (env->idle && sched_reduced_capacity(rq, env->sd)) {
10314 /* Check for a task running on a CPU with reduced capacity */
10315 if (sgs->group_misfit_task_load < load)
10316 sgs->group_misfit_task_load = load;
10317 }
10318 }
10319
10320 sgs->group_capacity = group->sgc->capacity;
10321
10322 sgs->group_weight = group->group_weight;
10323
10324 /* Check if dst CPU is idle and preferred to this group */
10325 if (!local_group && env->idle && sgs->sum_h_nr_running &&
10326 sched_group_asym(env, sgs, group))
10327 sgs->group_asym_packing = 1;
10328
10329 /* Check for loaded SMT group to be balanced to dst CPU */
10330 if (!local_group && smt_balance(env, sgs, group))
10331 sgs->group_smt_balance = 1;
10332
10333 sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
10334
10335 /* Computing avg_load makes sense only when group is overloaded */
10336 if (sgs->group_type == group_overloaded)
10337 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
10338 sgs->group_capacity;
10339 }
10340
10341 /**
10342 * update_sd_pick_busiest - return 1 on busiest group
10343 * @env: The load balancing environment.
10344 * @sds: sched_domain statistics
10345 * @sg: sched_group candidate to be checked for being the busiest
10346 * @sgs: sched_group statistics
10347 *
10348 * Determine if @sg is a busier group than the previously selected
10349 * busiest group.
10350 *
10351 * Return: %true if @sg is a busier group than the previously selected
10352 * busiest group. %false otherwise.
10353 */
10354 static bool update_sd_pick_busiest(struct lb_env *env,
10355 struct sd_lb_stats *sds,
10356 struct sched_group *sg,
10357 struct sg_lb_stats *sgs)
10358 {
10359 struct sg_lb_stats *busiest = &sds->busiest_stat;
10360
10361 /* Make sure that there is at least one task to pull */
10362 if (!sgs->sum_h_nr_running)
10363 return false;
10364
10365 /*
10366 * Don't try to pull misfit tasks we can't help.
10367 * We can use max_capacity here as reduction in capacity on some
10368 * CPUs in the group should either be possible to resolve
10369 * internally or be covered by avg_load imbalance (eventually).
10370 */
10371 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
10372 (sgs->group_type == group_misfit_task) &&
10373 (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
10374 sds->local_stat.group_type != group_has_spare))
10375 return false;
10376
10377 if (sgs->group_type > busiest->group_type)
10378 return true;
10379
10380 if (sgs->group_type < busiest->group_type)
10381 return false;
10382
10383 /*
10384 * The candidate and the current busiest group are the same type of
10385 * group. Let's check which one is the busiest according to the type.
10386 */
10387
10388 switch (sgs->group_type) {
10389 case group_overloaded:
10390 /* Select the overloaded group with highest avg_load. */
10391 return sgs->avg_load > busiest->avg_load;
10392
10393 case group_imbalanced:
10394 /*
10395 * Select the 1st imbalanced group as we don't have any way to
10396 * prefer one over another.
10397 */
10398 return false;
10399
10400 case group_asym_packing:
10401 /* Prefer to move from lowest priority CPU's work */
10402 return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu),
10403 READ_ONCE(sg->asym_prefer_cpu));
10404
10405 case group_misfit_task:
10406 /*
10407 * If we have more than one misfit sg go with the biggest
10408 * misfit.
10409 */
10410 return sgs->group_misfit_task_load > busiest->group_misfit_task_load;
10411
10412 case group_smt_balance:
10413 /*
10414 		 * Check if either SMT group has spare CPUs, to choose between
10415 		 * has-spare and fully-busy handling.
10416 */
10417 if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
10418 goto has_spare;
10419
10420 fallthrough;
10421
10422 case group_fully_busy:
10423 /*
10424 * Select the fully busy group with highest avg_load. In
10425 		 * theory, there is no need to pull tasks from such a
10426 		 * group because the tasks have all the compute capacity they need,
10427 * but we can still improve the overall throughput by reducing
10428 * contention when accessing shared HW resources.
10429 *
10430 * XXX for now avg_load is not computed and always 0 so we
10431 * select the 1st one, except if @sg is composed of SMT
10432 * siblings.
10433 */
10434
10435 if (sgs->avg_load < busiest->avg_load)
10436 return false;
10437
10438 if (sgs->avg_load == busiest->avg_load) {
10439 /*
10440 * SMT sched groups need more help than non-SMT groups.
10441 * If @sg happens to also be SMT, either choice is good.
10442 */
10443 if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
10444 return false;
10445 }
10446
10447 break;
10448
10449 case group_has_spare:
10450 /*
10451 * Do not pick sg with SMT CPUs over sg with pure CPUs,
10452 		 * as we do not want to pull a task off an SMT core with one task
10453 * and make the core idle.
10454 */
10455 if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
10456 if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
10457 return false;
10458 else
10459 return true;
10460 }
10461 has_spare:
10462
10463 /*
10464 		 * Select the non-overloaded group with the lowest number of idle CPUs
10465 		 * and the highest number of running tasks. We could also compare
10466 		 * the spare capacity, which is more stable, but a group can end up
10467 		 * with less spare capacity yet more idle
10468 		 * CPUs, which means fewer opportunities to pull tasks.
10469 */
10470 if (sgs->idle_cpus > busiest->idle_cpus)
10471 return false;
10472 else if ((sgs->idle_cpus == busiest->idle_cpus) &&
10473 (sgs->sum_nr_running <= busiest->sum_nr_running))
10474 return false;
10475
10476 break;
10477 }
10478
10479 /*
10480 * Candidate sg has no more than one task per CPU and has higher
10481 * per-CPU capacity. Migrating tasks to less capable CPUs may harm
10482 * throughput. Maximize throughput, power/energy consequences are not
10483 * considered.
10484 */
10485 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
10486 (sgs->group_type <= group_fully_busy) &&
10487 (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
10488 return false;
10489
10490 return true;
10491 }
10492
10493 #ifdef CONFIG_NUMA_BALANCING
10494 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
10495 {
10496 if (sgs->sum_h_nr_running > sgs->nr_numa_running)
10497 return regular;
10498 if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
10499 return remote;
10500 return all;
10501 }
10502
10503 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
10504 {
10505 if (rq->nr_running > rq->nr_numa_running)
10506 return regular;
10507 if (rq->nr_running > rq->nr_preferred_running)
10508 return remote;
10509 return all;
10510 }
10511 #else /* !CONFIG_NUMA_BALANCING: */
10512 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
10513 {
10514 return all;
10515 }
10516
10517 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
10518 {
10519 return regular;
10520 }
10521 #endif /* !CONFIG_NUMA_BALANCING */
10522
10523
10524 struct sg_lb_stats;
10525
10526 /*
10527 * task_running_on_cpu - return 1 if @p is running on @cpu.
10528 */
10529
10530 static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
10531 {
10532 /* Task has no contribution or is new */
10533 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
10534 return 0;
10535
10536 if (task_on_rq_queued(p))
10537 return 1;
10538
10539 return 0;
10540 }
10541
10542 /**
10543 * idle_cpu_without - would a given CPU be idle without p ?
10544 * @cpu: the processor on which idleness is tested.
10545 * @p: task which should be ignored.
10546 *
10547 * Return: 1 if the CPU would be idle. 0 otherwise.
10548 */
10549 static int idle_cpu_without(int cpu, struct task_struct *p)
10550 {
10551 struct rq *rq = cpu_rq(cpu);
10552
10553 if (rq->curr != rq->idle && rq->curr != p)
10554 return 0;
10555
10556 /*
10557 * rq->nr_running can't be used but an updated version without the
10558 * impact of p on cpu must be used instead. The updated nr_running
10559 	 * needs to be computed and tested before calling idle_cpu_without().
10560 */
10561
10562 if (rq->ttwu_pending)
10563 return 0;
10564
10565 return 1;
10566 }
10567
10568 /*
10569 * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
10570 * @sd: The sched_domain level to look for idlest group.
10571 * @group: sched_group whose statistics are to be updated.
10572 * @sgs: variable to hold the statistics for this group.
10573 * @p: The task for which we look for the idlest group/CPU.
10574 */
10575 static inline void update_sg_wakeup_stats(struct sched_domain *sd,
10576 struct sched_group *group,
10577 struct sg_lb_stats *sgs,
10578 struct task_struct *p)
10579 {
10580 int i, nr_running;
10581
10582 memset(sgs, 0, sizeof(*sgs));
10583
10584 /* Assume that task can't fit any CPU of the group */
10585 if (sd->flags & SD_ASYM_CPUCAPACITY)
10586 sgs->group_misfit_task_load = 1;
10587
10588 for_each_cpu(i, sched_group_span(group)) {
10589 struct rq *rq = cpu_rq(i);
10590 unsigned int local;
10591
10592 sgs->group_load += cpu_load_without(rq, p);
10593 sgs->group_util += cpu_util_without(i, p);
10594 sgs->group_runnable += cpu_runnable_without(rq, p);
10595 local = task_running_on_cpu(i, p);
10596 sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local;
10597
10598 nr_running = rq->nr_running - local;
10599 sgs->sum_nr_running += nr_running;
10600
10601 /*
10602 * No need to call idle_cpu_without() if nr_running is not 0
10603 */
10604 if (!nr_running && idle_cpu_without(i, p))
10605 sgs->idle_cpus++;
10606
10607 /* Check if task fits in the CPU */
10608 if (sd->flags & SD_ASYM_CPUCAPACITY &&
10609 sgs->group_misfit_task_load &&
10610 task_fits_cpu(p, i))
10611 sgs->group_misfit_task_load = 0;
10612
10613 }
10614
10615 sgs->group_capacity = group->sgc->capacity;
10616
10617 sgs->group_weight = group->group_weight;
10618
10619 sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
10620
10621 /*
10622 * Computing avg_load makes sense only when group is fully busy or
10623 * overloaded
10624 */
10625 if (sgs->group_type == group_fully_busy ||
10626 sgs->group_type == group_overloaded)
10627 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
10628 sgs->group_capacity;
10629 }
10630
10631 static bool update_pick_idlest(struct sched_group *idlest,
10632 struct sg_lb_stats *idlest_sgs,
10633 struct sched_group *group,
10634 struct sg_lb_stats *sgs)
10635 {
10636 if (sgs->group_type < idlest_sgs->group_type)
10637 return true;
10638
10639 if (sgs->group_type > idlest_sgs->group_type)
10640 return false;
10641
10642 /*
10643 * The candidate and the current idlest group are the same type of
10644 	 * group. Let's check which one is the idlest according to the type.
10645 */
10646
10647 switch (sgs->group_type) {
10648 case group_overloaded:
10649 case group_fully_busy:
10650 /* Select the group with lowest avg_load. */
10651 if (idlest_sgs->avg_load <= sgs->avg_load)
10652 return false;
10653 break;
10654
10655 case group_imbalanced:
10656 case group_asym_packing:
10657 case group_smt_balance:
10658 /* Those types are not used in the slow wakeup path */
10659 return false;
10660
10661 case group_misfit_task:
10662 /* Select group with the highest max capacity */
10663 if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
10664 return false;
10665 break;
10666
10667 case group_has_spare:
10668 /* Select group with most idle CPUs */
10669 if (idlest_sgs->idle_cpus > sgs->idle_cpus)
10670 return false;
10671
10672 /* Select group with lowest group_util */
10673 if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
10674 idlest_sgs->group_util <= sgs->group_util)
10675 return false;
10676
10677 break;
10678 }
10679
10680 return true;
10681 }
10682
10683 /*
10684 * sched_balance_find_dst_group() finds and returns the least busy CPU group within the
10685 * domain.
10686 *
10687 * Assumes p is allowed on at least one CPU in sd.
10688 */
10689 static struct sched_group *
10690 sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
10691 {
10692 struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
10693 struct sg_lb_stats local_sgs, tmp_sgs;
10694 struct sg_lb_stats *sgs;
10695 unsigned long imbalance;
10696 struct sg_lb_stats idlest_sgs = {
10697 .avg_load = UINT_MAX,
10698 .group_type = group_overloaded,
10699 };
10700
10701 do {
10702 int local_group;
10703
10704 /* Skip over this group if it has no CPUs allowed */
10705 if (!cpumask_intersects(sched_group_span(group),
10706 p->cpus_ptr))
10707 continue;
10708
10709 /* Skip over this group if no cookie matched */
10710 if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
10711 continue;
10712
10713 local_group = cpumask_test_cpu(this_cpu,
10714 sched_group_span(group));
10715
10716 if (local_group) {
10717 sgs = &local_sgs;
10718 local = group;
10719 } else {
10720 sgs = &tmp_sgs;
10721 }
10722
10723 update_sg_wakeup_stats(sd, group, sgs, p);
10724
10725 if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
10726 idlest = group;
10727 idlest_sgs = *sgs;
10728 }
10729
10730 } while (group = group->next, group != sd->groups);
10731
10732
10733 /* There is no idlest group to push tasks to */
10734 if (!idlest)
10735 return NULL;
10736
10737 /* The local group has been skipped because of CPU affinity */
10738 if (!local)
10739 return idlest;
10740
10741 /*
10742 * If the local group is idler than the selected idlest group
10743 * don't try and push the task.
10744 */
10745 if (local_sgs.group_type < idlest_sgs.group_type)
10746 return NULL;
10747
10748 /*
10749 * If the local group is busier than the selected idlest group
10750 * try and push the task.
10751 */
10752 if (local_sgs.group_type > idlest_sgs.group_type)
10753 return idlest;
10754
10755 switch (local_sgs.group_type) {
10756 case group_overloaded:
10757 case group_fully_busy:
10758
10759 /* Calculate allowed imbalance based on load */
10760 imbalance = scale_load_down(NICE_0_LOAD) *
10761 (sd->imbalance_pct-100) / 100;
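		/*
		 * As a rough illustration (assuming scale_load_down(NICE_0_LOAD)
		 * is 1024 and a hypothetical imbalance_pct of 117), the allowed
		 * imbalance works out to 1024 * 17 / 100 = 174 load units.
		 */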
10762
10763 /*
10764 * When comparing groups across NUMA domains, it's possible for
10765 * the local domain to be very lightly loaded relative to the
10766 * remote domains but "imbalance" skews the comparison making
10767 * remote CPUs look much more favourable. When considering
10768 * cross-domain, add imbalance to the load on the remote node
10769 * and consider staying local.
10770 */
10771
10772 if ((sd->flags & SD_NUMA) &&
10773 ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
10774 return NULL;
10775
10776 /*
10777 * If the local group is less loaded than the selected
10778 * idlest group don't try and push any tasks.
10779 */
10780 if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
10781 return NULL;
10782
10783 if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
10784 return NULL;
10785 break;
10786
10787 case group_imbalanced:
10788 case group_asym_packing:
10789 case group_smt_balance:
10790 		/* Those types are not used in the slow wakeup path */
10791 return NULL;
10792
10793 case group_misfit_task:
10794 /* Select group with the highest max capacity */
10795 if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
10796 return NULL;
10797 break;
10798
10799 case group_has_spare:
10800 #ifdef CONFIG_NUMA
10801 if (sd->flags & SD_NUMA) {
10802 int imb_numa_nr = sd->imb_numa_nr;
10803 #ifdef CONFIG_NUMA_BALANCING
10804 int idlest_cpu;
10805 /*
10806 * If there is spare capacity at NUMA, try to select
10807 * the preferred node
10808 */
10809 if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
10810 return NULL;
10811
10812 idlest_cpu = cpumask_first(sched_group_span(idlest));
10813 if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
10814 return idlest;
10815 #endif /* CONFIG_NUMA_BALANCING */
10816 /*
10817 * Otherwise, keep the task close to the wakeup source
10818 * and improve locality if the number of running tasks
10819 		 * would remain below the threshold where an imbalance is
10820 		 * allowed, while accounting for the possibility that the
10821 		 * task is pinned to a subset of CPUs. If there is a
10822 		 * real need for migration, periodic load balancing will
10823 * take care of it.
10824 */
10825 if (p->nr_cpus_allowed != NR_CPUS) {
10826 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
10827
10828 cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
10829 imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
10830 }
10831
10832 imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
10833 if (!adjust_numa_imbalance(imbalance,
10834 local_sgs.sum_nr_running + 1,
10835 imb_numa_nr)) {
10836 return NULL;
10837 }
10838 }
10839 #endif /* CONFIG_NUMA */
10840
10841 /*
10842 * Select group with highest number of idle CPUs. We could also
10843 		 * compare the utilization, which is more stable, but a group can end
10844 		 * up with less spare capacity yet more
10845 		 * idle CPUs, which means more opportunities to run the task.
10846 */
10847 if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
10848 return NULL;
10849 break;
10850 }
10851
10852 return idlest;
10853 }
10854
10855 static void update_idle_cpu_scan(struct lb_env *env,
10856 unsigned long sum_util)
10857 {
10858 struct sched_domain_shared *sd_share;
10859 int llc_weight, pct;
10860 u64 x, y, tmp;
10861 /*
10862 * Update the number of CPUs to scan in LLC domain, which could
10863 * be used as a hint in select_idle_cpu(). The update of sd_share
10864 * could be expensive because it is within a shared cache line.
10865 * So the write of this hint only occurs during periodic load
10866 * balancing, rather than CPU_NEWLY_IDLE, because the latter
10867 * can fire way more frequently than the former.
10868 */
10869 if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
10870 return;
10871
10872 llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
10873 if (env->sd->span_weight != llc_weight)
10874 return;
10875
10876 sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
10877 if (!sd_share)
10878 return;
10879
10880 /*
10881 	 * The number of CPUs to search drops as sum_util increases; when
10882 * sum_util hits 85% or above, the scan stops.
10883 * The reason to choose 85% as the threshold is because this is the
10884 * imbalance_pct(117) when a LLC sched group is overloaded.
10885 *
10886 * let y = SCHED_CAPACITY_SCALE - p * x^2 [1]
10887 * and y'= y / SCHED_CAPACITY_SCALE
10888 *
10889 * x is the ratio of sum_util compared to the CPU capacity:
10890 * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
10891 * y' is the ratio of CPUs to be scanned in the LLC domain,
10892 * and the number of CPUs to scan is calculated by:
10893 *
10894 * nr_scan = llc_weight * y' [2]
10895 *
10896 * When x hits the threshold of overloaded, AKA, when
10897 * x = 100 / pct, y drops to 0. According to [1],
10898 * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
10899 *
10900 * Scale x by SCHED_CAPACITY_SCALE:
10901 * x' = sum_util / llc_weight; [3]
10902 *
10903 * and finally [1] becomes:
10904 * y = SCHED_CAPACITY_SCALE -
10905 * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4]
10906 *
10907 */
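	/*
	 * Worked example with hypothetical values (not from the source):
	 * with llc_weight = 16, imbalance_pct = 117 and sum_util such that
	 * x' = sum_util / llc_weight = 512 (the LLC is about 50% utilized):
	 *   tmp = 512^2 * 117^2 / (10000 * 1024) ~= 350    (equation [4])
	 *   y   = 1024 - 350 = 674
	 *   nr_scan = 16 * 674 / 1024 ~= 10                (equation [2])
	 * so roughly 10 of the 16 LLC CPUs are scanned at ~50% utilization,
	 * and nr_scan shrinks to 0 as x' approaches 1024 * 100 / 117 ~= 875.
	 */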
10908 /* equation [3] */
10909 x = sum_util;
10910 do_div(x, llc_weight);
10911
10912 /* equation [4] */
10913 pct = env->sd->imbalance_pct;
10914 tmp = x * x * pct * pct;
10915 do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
10916 tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
10917 y = SCHED_CAPACITY_SCALE - tmp;
10918
10919 /* equation [2] */
10920 y *= llc_weight;
10921 do_div(y, SCHED_CAPACITY_SCALE);
10922 if ((int)y != sd_share->nr_idle_scan)
10923 WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
10924 }
10925
10926 /**
10927 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
10928 * @env: The load balancing environment.
10929 * @sds: variable to hold the statistics for this sched_domain.
10930 */
10931
10932 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
10933 {
10934 struct sched_group *sg = env->sd->groups;
10935 struct sg_lb_stats *local = &sds->local_stat;
10936 struct sg_lb_stats tmp_sgs;
10937 unsigned long sum_util = 0;
10938 bool sg_overloaded = 0, sg_overutilized = 0;
10939
10940 do {
10941 struct sg_lb_stats *sgs = &tmp_sgs;
10942 int local_group;
10943
10944 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
10945 if (local_group) {
10946 sds->local = sg;
10947 sgs = local;
10948
10949 if (env->idle != CPU_NEWLY_IDLE ||
10950 time_after_eq(jiffies, sg->sgc->next_update))
10951 update_group_capacity(env->sd, env->dst_cpu);
10952 }
10953
10954 update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
10955
10956 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
10957 sds->busiest = sg;
10958 sds->busiest_stat = *sgs;
10959 }
10960
10961 /* Now, start updating sd_lb_stats */
10962 sds->total_load += sgs->group_load;
10963 sds->total_capacity += sgs->group_capacity;
10964
10965 sum_util += sgs->group_util;
10966 sg = sg->next;
10967 } while (sg != env->sd->groups);
10968
10969 /*
10970 * Indicate that the child domain of the busiest group prefers tasks
10971 * go to a child's sibling domains first. NB the flags of a sched group
10972 * are those of the child domain.
10973 */
10974 if (sds->busiest)
10975 sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
10976
10977
10978 if (env->sd->flags & SD_NUMA)
10979 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
10980
10981 if (!env->sd->parent) {
10982 /* update overload indicator if we are at root domain */
10983 set_rd_overloaded(env->dst_rq->rd, sg_overloaded);
10984
10985 /* Update over-utilization (tipping point, U >= 0) indicator */
10986 set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
10987 } else if (sg_overutilized) {
10988 set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
10989 }
10990
10991 update_idle_cpu_scan(env, sum_util);
10992 }
10993
10994 /**
10995 * calculate_imbalance - Calculate the amount of imbalance present within the
10996 * groups of a given sched_domain during load balance.
10997 * @env: load balance environment
10998 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
10999 */
11000 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
11001 {
11002 struct sg_lb_stats *local, *busiest;
11003
11004 local = &sds->local_stat;
11005 busiest = &sds->busiest_stat;
11006
11007 if (busiest->group_type == group_misfit_task) {
11008 if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
11009 /* Set imbalance to allow misfit tasks to be balanced. */
11010 env->migration_type = migrate_misfit;
11011 env->imbalance = 1;
11012 } else {
11013 /*
11014 * Set load imbalance to allow moving task from cpu
11015 * with reduced capacity.
11016 */
11017 env->migration_type = migrate_load;
11018 env->imbalance = busiest->group_misfit_task_load;
11019 }
11020 return;
11021 }
11022
11023 if (busiest->group_type == group_asym_packing) {
11024 /*
11025 * In case of asym capacity, we will try to migrate all load to
11026 * the preferred CPU.
11027 */
11028 env->migration_type = migrate_task;
11029 env->imbalance = busiest->sum_h_nr_running;
11030 return;
11031 }
11032
11033 if (busiest->group_type == group_smt_balance) {
11034 /* Reduce number of tasks sharing CPU capacity */
11035 env->migration_type = migrate_task;
11036 env->imbalance = 1;
11037 return;
11038 }
11039
11040 if (busiest->group_type == group_imbalanced) {
11041 /*
11042 * In the group_imb case we cannot rely on group-wide averages
11043 * to ensure CPU-load equilibrium, try to move any task to fix
11044 * the imbalance. The next load balance will take care of
11045 * balancing back the system.
11046 */
11047 env->migration_type = migrate_task;
11048 env->imbalance = 1;
11049 return;
11050 }
11051
11052 /*
11053 * Try to use spare capacity of local group without overloading it or
11054 * emptying busiest.
11055 */
11056 if (local->group_type == group_has_spare) {
11057 if ((busiest->group_type > group_fully_busy) &&
11058 !(env->sd->flags & SD_SHARE_LLC)) {
11059 /*
11060 * If busiest is overloaded, try to fill spare
11061 * capacity. This might end up creating spare capacity
11062 * in busiest or busiest still being overloaded but
11063 * there is no simple way to directly compute the
11064 * amount of load to migrate in order to balance the
11065 * system.
11066 */
11067 env->migration_type = migrate_util;
11068 env->imbalance = max(local->group_capacity, local->group_util) -
11069 local->group_util;
11070
11071 /*
11072 * In some cases, the group's utilization is max or even
11073 * higher than capacity because of migrations but the
11074 * local CPU is (newly) idle. There is at least one
11075 * waiting task in this overloaded busiest group. Let's
11076 * try to pull it.
11077 */
11078 if (env->idle && env->imbalance == 0) {
11079 env->migration_type = migrate_task;
11080 env->imbalance = 1;
11081 }
11082
11083 return;
11084 }
11085
11086 if (busiest->group_weight == 1 || sds->prefer_sibling) {
11087 /*
11088 			 * When prefer_sibling is set, spread running tasks evenly
11089 			 * across groups.
11090 */
11091 env->migration_type = migrate_task;
11092 env->imbalance = sibling_imbalance(env, sds, busiest, local);
11093 } else {
11094
11095 /*
11096 * If there is no overload, we just want to even the number of
11097 * idle CPUs.
11098 */
11099 env->migration_type = migrate_task;
11100 env->imbalance = max_t(long, 0,
11101 (local->idle_cpus - busiest->idle_cpus));
11102 }
11103
11104 #ifdef CONFIG_NUMA
11105 /* Consider allowing a small imbalance between NUMA groups */
11106 if (env->sd->flags & SD_NUMA) {
11107 env->imbalance = adjust_numa_imbalance(env->imbalance,
11108 local->sum_nr_running + 1,
11109 env->sd->imb_numa_nr);
11110 }
11111 #endif
11112
11113 /* Number of tasks to move to restore balance */
11114 env->imbalance >>= 1;
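		/*
		 * E.g. (made-up numbers): local->idle_cpus = 6 and
		 * busiest->idle_cpus = 0 gives an imbalance of 6, halved to 3,
		 * so both groups meet in the middle instead of ping-ponging the
		 * whole difference on the next balance pass.
		 */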
11115
11116 return;
11117 }
11118
11119 /*
11120 * Local is fully busy but has to take more load to relieve the
11121 * busiest group
11122 */
11123 if (local->group_type < group_overloaded) {
11124 /*
11125 * Local will become overloaded so the avg_load metrics are
11126 * finally needed.
11127 */
11128
11129 local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
11130 local->group_capacity;
11131
11132 /*
11133 * If the local group is more loaded than the selected
11134 * busiest group don't try to pull any tasks.
11135 */
11136 if (local->avg_load >= busiest->avg_load) {
11137 env->imbalance = 0;
11138 return;
11139 }
11140
11141 sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
11142 sds->total_capacity;
11143
11144 /*
11145 * If the local group is more loaded than the average system
11146 * load, don't try to pull any tasks.
11147 */
11148 if (local->avg_load >= sds->avg_load) {
11149 env->imbalance = 0;
11150 return;
11151 }
11152
11153 }
11154
11155 /*
11156 	 * Both groups are or will become overloaded and we're trying to get all
11157 * the CPUs to the average_load, so we don't want to push ourselves
11158 * above the average load, nor do we wish to reduce the max loaded CPU
11159 * below the average load. At the same time, we also don't want to
11160 * reduce the group load below the group capacity. Thus we look for
11161 * the minimum possible imbalance.
11162 */
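	/*
	 * Rough example with made-up numbers: busiest->avg_load = 1400,
	 * local->avg_load = 600, sds->avg_load = 1000 and both group
	 * capacities at 1024 give
	 *   min((1400 - 1000) * 1024, (1000 - 600) * 1024) / 1024 = 400,
	 * i.e. just enough load to bring both groups to the domain average
	 * without overshooting it.
	 */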
11163 env->migration_type = migrate_load;
11164 env->imbalance = min(
11165 (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
11166 (sds->avg_load - local->avg_load) * local->group_capacity
11167 ) / SCHED_CAPACITY_SCALE;
11168 }
11169
11170 /******* sched_balance_find_src_group() helpers end here *********************/
11171
11172 /*
11173 * Decision matrix according to the local and busiest group type:
11174 *
11175 * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
11176 * has_spare nr_idle balanced N/A N/A balanced balanced
11177 * fully_busy nr_idle nr_idle N/A N/A balanced balanced
11178 * misfit_task force N/A N/A N/A N/A N/A
11179 * asym_packing force force N/A N/A force force
11180 * imbalanced force force N/A N/A force force
11181 * overloaded force force N/A N/A force avg_load
11182 *
11183 * N/A : Not Applicable because already filtered while updating
11184 * statistics.
11185 * balanced : The system is balanced for these 2 groups.
11186 * force : Calculate the imbalance as load migration is probably needed.
11187 * avg_load : Only if imbalance is significant enough.
11188 * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
11189 * different in groups.
11190 */
11191
11192 /**
11193 * sched_balance_find_src_group - Returns the busiest group within the sched_domain
11194 * if there is an imbalance.
11195 * @env: The load balancing environment.
11196 *
11197 * Also calculates the amount of runnable load which should be moved
11198 * to restore balance.
11199 *
11200 * Return: - The busiest group if imbalance exists.
11201 */
11202 static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
11203 {
11204 struct sg_lb_stats *local, *busiest;
11205 struct sd_lb_stats sds;
11206
11207 init_sd_lb_stats(&sds);
11208
11209 /*
11210 * Compute the various statistics relevant for load balancing at
11211 * this level.
11212 */
11213 update_sd_lb_stats(env, &sds);
11214
11215 /* There is no busy sibling group to pull tasks from */
11216 if (!sds.busiest)
11217 goto out_balanced;
11218
11219 busiest = &sds.busiest_stat;
11220
11221 /* Misfit tasks should be dealt with regardless of the avg load */
11222 if (busiest->group_type == group_misfit_task)
11223 goto force_balance;
11224
11225 if (!is_rd_overutilized(env->dst_rq->rd) &&
11226 rcu_dereference(env->dst_rq->rd->pd))
11227 goto out_balanced;
11228
11229 /* ASYM feature bypasses nice load balance check */
11230 if (busiest->group_type == group_asym_packing)
11231 goto force_balance;
11232
11233 /*
11234 * If the busiest group is imbalanced the below checks don't
11235 * work because they assume all things are equal, which typically
11236 * isn't true due to cpus_ptr constraints and the like.
11237 */
11238 if (busiest->group_type == group_imbalanced)
11239 goto force_balance;
11240
11241 local = &sds.local_stat;
11242 /*
11243 * If the local group is busier than the selected busiest group
11244 * don't try and pull any tasks.
11245 */
11246 if (local->group_type > busiest->group_type)
11247 goto out_balanced;
11248
11249 /*
11250 * When groups are overloaded, use the avg_load to ensure fairness
11251 * between tasks.
11252 */
11253 if (local->group_type == group_overloaded) {
11254 /*
11255 * If the local group is more loaded than the selected
11256 * busiest group don't try to pull any tasks.
11257 */
11258 if (local->avg_load >= busiest->avg_load)
11259 goto out_balanced;
11260
11261 /* XXX broken for overlapping NUMA groups */
11262 sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
11263 sds.total_capacity;
11264
11265 /*
11266 * Don't pull any tasks if this group is already above the
11267 * domain average load.
11268 */
11269 if (local->avg_load >= sds.avg_load)
11270 goto out_balanced;
11271
11272 /*
11273 * If the busiest group is more loaded, use imbalance_pct to be
11274 * conservative.
11275 */
11276 if (100 * busiest->avg_load <=
11277 env->sd->imbalance_pct * local->avg_load)
11278 goto out_balanced;
11279 }
11280
11281 /*
11282 * Try to move all excess tasks to a sibling domain of the busiest
11283 * group's child domain.
11284 */
11285 if (sds.prefer_sibling && local->group_type == group_has_spare &&
11286 sibling_imbalance(env, &sds, busiest, local) > 1)
11287 goto force_balance;
11288
11289 if (busiest->group_type != group_overloaded) {
11290 if (!env->idle) {
11291 /*
11292 * If the busiest group is not overloaded (and as a
11293 * result the local one too) but this CPU is already
11294 			 * busy, let another idle CPU try to pull a task.
11295 */
11296 goto out_balanced;
11297 }
11298
11299 if (busiest->group_type == group_smt_balance &&
11300 smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
11301 /* Let non SMT CPU pull from SMT CPU sharing with sibling */
11302 goto force_balance;
11303 }
11304
11305 if (busiest->group_weight > 1 &&
11306 local->idle_cpus <= (busiest->idle_cpus + 1)) {
11307 /*
11308 * If the busiest group is not overloaded
11309 * and there is no imbalance between this and busiest
11310 * group wrt idle CPUs, it is balanced. The imbalance
11311 			 * becomes significant if the diff is greater than 1,
11312 			 * otherwise we might end up just moving the imbalance
11313 			 * to another group. Of course this applies only if
11314 * there is more than 1 CPU per group.
11315 */
11316 goto out_balanced;
11317 }
11318
11319 if (busiest->sum_h_nr_running == 1) {
11320 /*
11321 * busiest doesn't have any tasks waiting to run
11322 */
11323 goto out_balanced;
11324 }
11325 }
11326
11327 force_balance:
11328 /* Looks like there is an imbalance. Compute it */
11329 calculate_imbalance(env, &sds);
11330 return env->imbalance ? sds.busiest : NULL;
11331
11332 out_balanced:
11333 env->imbalance = 0;
11334 return NULL;
11335 }
11336
11337 /*
11338 * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group.
11339 */
11340 static struct rq *sched_balance_find_src_rq(struct lb_env *env,
11341 struct sched_group *group)
11342 {
11343 struct rq *busiest = NULL, *rq;
11344 unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
11345 unsigned int busiest_nr = 0;
11346 int i;
11347
11348 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
11349 unsigned long capacity, load, util;
11350 unsigned int nr_running;
11351 enum fbq_type rt;
11352
11353 rq = cpu_rq(i);
11354 rt = fbq_classify_rq(rq);
11355
11356 /*
11357 * We classify groups/runqueues into three groups:
11358 * - regular: there are !numa tasks
11359 * - remote: there are numa tasks that run on the 'wrong' node
11360 * - all: there is no distinction
11361 *
11362 * In order to avoid migrating ideally placed numa tasks,
11363 		 * ignore those when there are better options.
11364 *
11365 * If we ignore the actual busiest queue to migrate another
11366 * task, the next balance pass can still reduce the busiest
11367 * queue by moving tasks around inside the node.
11368 *
11369 * If we cannot move enough load due to this classification
11370 * the next pass will adjust the group classification and
11371 * allow migration of more tasks.
11372 *
11373 * Both cases only affect the total convergence complexity.
11374 */
11375 if (rt > env->fbq_type)
11376 continue;
11377
11378 nr_running = rq->cfs.h_nr_runnable;
11379 if (!nr_running)
11380 continue;
11381
11382 capacity = capacity_of(i);
11383
11384 /*
11385 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
11386 		 * eventually lead to active_balancing from high to low capacity.
11387 * Higher per-CPU capacity is considered better than balancing
11388 * average load.
11389 */
11390 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
11391 !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
11392 nr_running == 1)
11393 continue;
11394
11395 /*
11396 * Make sure we only pull tasks from a CPU of lower priority
11397 * when balancing between SMT siblings.
11398 *
11399 * If balancing between cores, let lower priority CPUs help
11400 * SMT cores with more than one busy sibling.
11401 */
11402 if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1)
11403 continue;
11404
11405 switch (env->migration_type) {
11406 case migrate_load:
11407 /*
11408 * When comparing with load imbalance, use cpu_load()
11409 * which is not scaled with the CPU capacity.
11410 */
11411 load = cpu_load(rq);
11412
11413 if (nr_running == 1 && load > env->imbalance &&
11414 !check_cpu_capacity(rq, env->sd))
11415 break;
11416
11417 /*
11418 * For the load comparisons with the other CPUs,
11419 * consider the cpu_load() scaled with the CPU
11420 * capacity, so that the load can be moved away
11421 * from the CPU that is potentially running at a
11422 * lower capacity.
11423 *
11424 * Thus we're looking for max(load_i / capacity_i),
11425 * crosswise multiplication to rid ourselves of the
11426 * division works out to:
11427 * load_i * capacity_j > load_j * capacity_i;
11428 * where j is our previous maximum.
11429 */
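			/*
			 * Illustration with hypothetical numbers: a CPU with
			 * capacity_i = 512 and load_i = 600 versus the current
			 * maximum j with capacity_j = 1024 and load_j = 800:
			 *   load_i * capacity_j = 600 * 1024 = 614400
			 *   load_j * capacity_i = 800 *  512 = 409600
			 * 614400 > 409600, so the smaller CPU is relatively busier
			 * and becomes the new busiest candidate despite its lower
			 * absolute load.
			 */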
11430 if (load * busiest_capacity > busiest_load * capacity) {
11431 busiest_load = load;
11432 busiest_capacity = capacity;
11433 busiest = rq;
11434 }
11435 break;
11436
11437 case migrate_util:
11438 util = cpu_util_cfs_boost(i);
11439
11440 /*
11441 * Don't try to pull utilization from a CPU with one
11442 * running task. Whatever its utilization, we will fail
11443 			 * to detach the task.
11444 */
11445 if (nr_running <= 1)
11446 continue;
11447
11448 if (busiest_util < util) {
11449 busiest_util = util;
11450 busiest = rq;
11451 }
11452 break;
11453
11454 case migrate_task:
11455 if (busiest_nr < nr_running) {
11456 busiest_nr = nr_running;
11457 busiest = rq;
11458 }
11459 break;
11460
11461 case migrate_misfit:
11462 /*
11463 * For ASYM_CPUCAPACITY domains with misfit tasks we
11464 * simply seek the "biggest" misfit task.
11465 */
11466 if (rq->misfit_task_load > busiest_load) {
11467 busiest_load = rq->misfit_task_load;
11468 busiest = rq;
11469 }
11470
11471 break;
11472
11473 }
11474 }
11475
11476 return busiest;
11477 }
11478
11479 /*
11480 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
11481 * so long as it is large enough.
11482  * it works so long as it is large enough.
11483 #define MAX_PINNED_INTERVAL 512
11484
11485 static inline bool
11486 asym_active_balance(struct lb_env *env)
11487 {
11488 /*
11489 * ASYM_PACKING needs to force migrate tasks from busy but lower
11490 * priority CPUs in order to pack all tasks in the highest priority
11491 	 * CPUs. When done between cores, do it only if the whole core
11492 	 * is idle.
11493 *
11494 * If @env::src_cpu is an SMT core with busy siblings, let
11495 * the lower priority @env::dst_cpu help it. Do not follow
11496 * CPU priority.
11497 */
11498 return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) &&
11499 (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
11500 !sched_use_asym_prio(env->sd, env->src_cpu));
11501 }
11502
11503 static inline bool
11504 imbalanced_active_balance(struct lb_env *env)
11505 {
11506 struct sched_domain *sd = env->sd;
11507
11508 /*
11509 * The imbalanced case includes the case of pinned tasks preventing a fair
11510 	 * distribution of the load on the system, but also preventing an even
11511 	 * distribution of the threads on a system with spare capacity.
11512 */
11513 if ((env->migration_type == migrate_task) &&
11514 (sd->nr_balance_failed > sd->cache_nice_tries+2))
11515 return 1;
11516
11517 return 0;
11518 }
11519
11520 static int need_active_balance(struct lb_env *env)
11521 {
11522 struct sched_domain *sd = env->sd;
11523
11524 if (asym_active_balance(env))
11525 return 1;
11526
11527 if (imbalanced_active_balance(env))
11528 return 1;
11529
11530 /*
11531 	 * The dst_cpu is idle and the src_cpu has only 1 CFS task.
11532 	 * It's worth migrating the task if the src_cpu's capacity is reduced
11533 	 * because of other sched_classes or IRQs, provided more capacity stays
11534 * available on dst_cpu.
11535 */
11536 if (env->idle &&
11537 (env->src_rq->cfs.h_nr_runnable == 1)) {
11538 if ((check_cpu_capacity(env->src_rq, sd)) &&
11539 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
11540 return 1;
11541 }
11542
11543 if (env->migration_type == migrate_misfit)
11544 return 1;
11545
11546 return 0;
11547 }
11548
11549 static int active_load_balance_cpu_stop(void *data);
11550
11551 static int should_we_balance(struct lb_env *env)
11552 {
11553 struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
11554 struct sched_group *sg = env->sd->groups;
11555 int cpu, idle_smt = -1;
11556
11557 /*
11558 	 * Ensure the balancing environment is consistent; inconsistency can
11559 	 * happen when the softirq triggers 'during' hotplug.
11560 */
11561 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
11562 return 0;
11563
11564 /*
11565 * In the newly idle case, we will allow all the CPUs
11566 * to do the newly idle load balance.
11567 *
11568 * However, we bail out if we already have tasks or a wakeup pending,
11569 * to optimize wakeup latency.
11570 */
11571 if (env->idle == CPU_NEWLY_IDLE) {
11572 if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
11573 return 0;
11574 return 1;
11575 }
11576
11577 cpumask_copy(swb_cpus, group_balance_mask(sg));
11578 /* Try to find first idle CPU */
11579 for_each_cpu_and(cpu, swb_cpus, env->cpus) {
11580 if (!idle_cpu(cpu))
11581 continue;
11582
11583 /*
11584 * Don't balance to idle SMT in busy core right away when
11585 * balancing cores, but remember the first idle SMT CPU for
11586 * later consideration. Find CPU on an idle core first.
11587 */
11588 if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
11589 if (idle_smt == -1)
11590 idle_smt = cpu;
11591 /*
11592 		 * If the core is not idle, and the first idle SMT sibling
11593 		 * has been found, then there is no need to check the other
11594 		 * SMT siblings for idleness:
11595 */
11596 #ifdef CONFIG_SCHED_SMT
11597 cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
11598 #endif
11599 continue;
11600 }
11601
11602 /*
11603 * Are we the first idle core in a non-SMT domain or higher,
11604 * or the first idle CPU in a SMT domain?
11605 */
11606 return cpu == env->dst_cpu;
11607 }
11608
11609 /* Are we the first idle CPU with busy siblings? */
11610 if (idle_smt != -1)
11611 return idle_smt == env->dst_cpu;
11612
11613 /* Are we the first CPU of this group ? */
11614 return group_balance_cpu(sg) == env->dst_cpu;
11615 }
11616
11617 static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd,
11618 enum cpu_idle_type idle)
11619 {
11620 if (!schedstat_enabled())
11621 return;
11622
11623 switch (env->migration_type) {
11624 case migrate_load:
11625 __schedstat_add(sd->lb_imbalance_load[idle], env->imbalance);
11626 break;
11627 case migrate_util:
11628 __schedstat_add(sd->lb_imbalance_util[idle], env->imbalance);
11629 break;
11630 case migrate_task:
11631 __schedstat_add(sd->lb_imbalance_task[idle], env->imbalance);
11632 break;
11633 case migrate_misfit:
11634 __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance);
11635 break;
11636 }
11637 }
11638
11639 /*
11640 * Check this_cpu to ensure it is balanced within domain. Attempt to move
11641 * tasks if there is an imbalance.
11642 */
11643 static int sched_balance_rq(int this_cpu, struct rq *this_rq,
11644 struct sched_domain *sd, enum cpu_idle_type idle,
11645 int *continue_balancing)
11646 {
11647 int ld_moved, cur_ld_moved, active_balance = 0;
11648 struct sched_domain *sd_parent = sd->parent;
11649 struct sched_group *group;
11650 struct rq *busiest;
11651 struct rq_flags rf;
11652 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
11653 struct lb_env env = {
11654 .sd = sd,
11655 .dst_cpu = this_cpu,
11656 .dst_rq = this_rq,
11657 .dst_grpmask = group_balance_mask(sd->groups),
11658 .idle = idle,
11659 .loop_break = SCHED_NR_MIGRATE_BREAK,
11660 .cpus = cpus,
11661 .fbq_type = all,
11662 .tasks = LIST_HEAD_INIT(env.tasks),
11663 };
11664
11665 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
11666
11667 schedstat_inc(sd->lb_count[idle]);
11668
11669 redo:
11670 if (!should_we_balance(&env)) {
11671 *continue_balancing = 0;
11672 goto out_balanced;
11673 }
11674
11675 group = sched_balance_find_src_group(&env);
11676 if (!group) {
11677 schedstat_inc(sd->lb_nobusyg[idle]);
11678 goto out_balanced;
11679 }
11680
11681 busiest = sched_balance_find_src_rq(&env, group);
11682 if (!busiest) {
11683 schedstat_inc(sd->lb_nobusyq[idle]);
11684 goto out_balanced;
11685 }
11686
11687 WARN_ON_ONCE(busiest == env.dst_rq);
11688
11689 update_lb_imbalance_stat(&env, sd, idle);
11690
11691 env.src_cpu = busiest->cpu;
11692 env.src_rq = busiest;
11693
11694 ld_moved = 0;
11695 /* Clear this flag as soon as we find a pullable task */
11696 env.flags |= LBF_ALL_PINNED;
11697 if (busiest->nr_running > 1) {
11698 /*
11699 * Attempt to move tasks. If sched_balance_find_src_group has found
11700 * an imbalance but busiest->nr_running <= 1, the group is
11701 * still unbalanced. ld_moved simply stays zero, so it is
11702 * correctly treated as an imbalance.
11703 */
11704 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
11705
11706 more_balance:
11707 rq_lock_irqsave(busiest, &rf);
11708 update_rq_clock(busiest);
11709
11710 /*
11711 * cur_ld_moved - load moved in current iteration
11712 * ld_moved - cumulative load moved across iterations
11713 */
11714 cur_ld_moved = detach_tasks(&env);
11715
11716 /*
11717 * We've detached some tasks from busiest_rq. Every
11718 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
11719 * unlock busiest->lock, and we are able to be sure
11720 * that nobody can manipulate the tasks in parallel.
11721 * See task_rq_lock() family for the details.
11722 */
11723
11724 rq_unlock(busiest, &rf);
11725
11726 if (cur_ld_moved) {
11727 attach_tasks(&env);
11728 ld_moved += cur_ld_moved;
11729 }
11730
11731 local_irq_restore(rf.flags);
11732
11733 if (env.flags & LBF_NEED_BREAK) {
11734 env.flags &= ~LBF_NEED_BREAK;
11735 goto more_balance;
11736 }
11737
11738 /*
11739 * Revisit (affine) tasks on src_cpu that couldn't be moved to
11740 * us and move them to an alternate dst_cpu in our sched_group
11741 * where they can run. The upper limit on how many times we
11742 * iterate on same src_cpu is dependent on number of CPUs in our
11743 * sched_group.
11744 *
11745 * This changes load balance semantics a bit on who can move
11746 * load to a given_cpu. In addition to the given_cpu itself
11747 		 * (or an ilb_cpu acting on its behalf where given_cpu is
11748 * nohz-idle), we now have balance_cpu in a position to move
11749 * load to given_cpu. In rare situations, this may cause
11750 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
11751 * _independently_ and at _same_ time to move some load to
11752 * given_cpu) causing excess load to be moved to given_cpu.
11753 * This however should not happen so much in practice and
11754 * moreover subsequent load balance cycles should correct the
11755 * excess load moved.
11756 */
11757 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
11758
11759 			/* Prevent re-selecting dst_cpu via env's CPUs */
11760 __cpumask_clear_cpu(env.dst_cpu, env.cpus);
11761
11762 env.dst_rq = cpu_rq(env.new_dst_cpu);
11763 env.dst_cpu = env.new_dst_cpu;
11764 env.flags &= ~LBF_DST_PINNED;
11765 env.loop = 0;
11766 env.loop_break = SCHED_NR_MIGRATE_BREAK;
11767
11768 /*
11769 * Go back to "more_balance" rather than "redo" since we
11770 * need to continue with same src_cpu.
11771 */
11772 goto more_balance;
11773 }
11774
11775 /*
11776 * We failed to reach balance because of affinity.
11777 */
11778 if (sd_parent) {
11779 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
11780
11781 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
11782 *group_imbalance = 1;
11783 }
11784
11785 /* All tasks on this runqueue were pinned by CPU affinity */
11786 if (unlikely(env.flags & LBF_ALL_PINNED)) {
11787 __cpumask_clear_cpu(cpu_of(busiest), cpus);
11788 /*
11789 * Attempting to continue load balancing at the current
11790 * sched_domain level only makes sense if there are
11791 * active CPUs remaining as possible busiest CPUs to
11792 * pull load from which are not contained within the
11793 * destination group that is receiving any migrated
11794 * load.
11795 */
11796 if (!cpumask_subset(cpus, env.dst_grpmask)) {
11797 env.loop = 0;
11798 env.loop_break = SCHED_NR_MIGRATE_BREAK;
11799 goto redo;
11800 }
11801 goto out_all_pinned;
11802 }
11803 }
11804
11805 if (!ld_moved) {
11806 schedstat_inc(sd->lb_failed[idle]);
11807 /*
11808 * Increment the failure counter only on periodic balance.
11809 * We do not want newidle balance, which can be very
11810 * frequent, pollute the failure counter causing
11811 * excessive cache_hot migrations and active balances.
11812 *
11813 * Similarly for migration_misfit which is not related to
11814 * load/util migration, don't pollute nr_balance_failed.
11815 */
11816 if (idle != CPU_NEWLY_IDLE &&
11817 env.migration_type != migrate_misfit)
11818 sd->nr_balance_failed++;
11819
11820 if (need_active_balance(&env)) {
11821 unsigned long flags;
11822
11823 raw_spin_rq_lock_irqsave(busiest, flags);
11824
11825 /*
11826 * Don't kick the active_load_balance_cpu_stop,
11827 * if the curr task on busiest CPU can't be
11828 * moved to this_cpu:
11829 */
11830 if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
11831 raw_spin_rq_unlock_irqrestore(busiest, flags);
11832 goto out_one_pinned;
11833 }
11834
11835 /* Record that we found at least one task that could run on this_cpu */
11836 env.flags &= ~LBF_ALL_PINNED;
11837
11838 /*
11839 * ->active_balance synchronizes accesses to
11840 * ->active_balance_work. Once set, it's cleared
11841 * only after active load balance is finished.
11842 */
11843 if (!busiest->active_balance) {
11844 busiest->active_balance = 1;
11845 busiest->push_cpu = this_cpu;
11846 active_balance = 1;
11847 }
11848
11849 preempt_disable();
11850 raw_spin_rq_unlock_irqrestore(busiest, flags);
11851 if (active_balance) {
11852 stop_one_cpu_nowait(cpu_of(busiest),
11853 active_load_balance_cpu_stop, busiest,
11854 &busiest->active_balance_work);
11855 }
11856 preempt_enable();
11857 }
11858 } else {
11859 sd->nr_balance_failed = 0;
11860 }
11861
11862 if (likely(!active_balance) || need_active_balance(&env)) {
11863 /* We were unbalanced, so reset the balancing interval */
11864 sd->balance_interval = sd->min_interval;
11865 }
11866
11867 goto out;
11868
11869 out_balanced:
11870 /*
11871 * We reach balance although we may have faced some affinity
11872 * constraints. Clear the imbalance flag only if other tasks got
11873 * a chance to move and fix the imbalance.
11874 */
11875 if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
11876 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
11877
11878 if (*group_imbalance)
11879 *group_imbalance = 0;
11880 }
11881
11882 out_all_pinned:
11883 /*
11884 * We reach balance because all tasks are pinned at this level so
11885 	 * we can't migrate them. Leave the imbalance flag set so the parent level
11886 * can try to migrate them.
11887 */
11888 schedstat_inc(sd->lb_balanced[idle]);
11889
11890 sd->nr_balance_failed = 0;
11891
11892 out_one_pinned:
11893 ld_moved = 0;
11894
11895 /*
11896 * sched_balance_newidle() disregards balance intervals, so we could
11897 * repeatedly reach this code, which would lead to balance_interval
11898 * skyrocketing in a short amount of time. Skip the balance_interval
11899 * increase logic to avoid that.
11900 *
11901 	 * Similarly, misfit migration is not necessarily an indication of
11902 	 * the system being busy, so it does not require the load balancer to
11903 	 * back off to let things settle down.
11904 */
11905 if (env.idle == CPU_NEWLY_IDLE ||
11906 env.migration_type == migrate_misfit)
11907 goto out;
11908
11909 /* tune up the balancing interval */
11910 if ((env.flags & LBF_ALL_PINNED &&
11911 sd->balance_interval < MAX_PINNED_INTERVAL) ||
11912 sd->balance_interval < sd->max_interval)
11913 sd->balance_interval *= 2;
11914 out:
11915 return ld_moved;
11916 }
11917
11918 static inline unsigned long
11919 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
11920 {
11921 unsigned long interval = sd->balance_interval;
11922
11923 if (cpu_busy)
11924 interval *= sd->busy_factor;
11925
11926 /* scale ms to jiffies */
11927 interval = msecs_to_jiffies(interval);
11928
11929 /*
11930 * Reduce likelihood of busy balancing at higher domains racing with
11931 * balancing at lower domains by preventing their balancing periods
11932 * from being multiples of each other.
11933 */
11934 if (cpu_busy)
11935 interval -= 1;
11936
11937 interval = clamp(interval, 1UL, max_load_balance_interval);
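	/*
	 * Example with hypothetical values: balance_interval = 32 (ms) and
	 * busy_factor = 16 on a busy CPU gives 512ms, converted to jiffies
	 * (512 at HZ=1000), minus one jiffy to stagger sibling domains, then
	 * clamped to [1, max_load_balance_interval].
	 */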
11938
11939 return interval;
11940 }
11941
11942 static inline void
11943 update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
11944 {
11945 unsigned long interval, next;
11946
11947 /* used by idle balance, so cpu_busy = 0 */
11948 interval = get_sd_balance_interval(sd, 0);
11949 next = sd->last_balance + interval;
11950
11951 if (time_after(*next_balance, next))
11952 *next_balance = next;
11953 }
11954
11955 /*
11956 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
11957 * running tasks off the busiest CPU onto idle CPUs. It requires at
11958 * least 1 task to be running on each physical CPU where possible, and
11959 * avoids physical / logical imbalances.
11960 */
11961 static int active_load_balance_cpu_stop(void *data)
11962 {
11963 struct rq *busiest_rq = data;
11964 int busiest_cpu = cpu_of(busiest_rq);
11965 int target_cpu = busiest_rq->push_cpu;
11966 struct rq *target_rq = cpu_rq(target_cpu);
11967 struct sched_domain *sd;
11968 struct task_struct *p = NULL;
11969 struct rq_flags rf;
11970
11971 rq_lock_irq(busiest_rq, &rf);
11972 /*
11973 * Between queueing the stop-work and running it is a hole in which
11974 * CPUs can become inactive. We should not move tasks from or to
11975 * inactive CPUs.
11976 */
11977 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
11978 goto out_unlock;
11979
11980 /* Make sure the requested CPU hasn't gone down in the meantime: */
11981 if (unlikely(busiest_cpu != smp_processor_id() ||
11982 !busiest_rq->active_balance))
11983 goto out_unlock;
11984
11985 /* Is there any task to move? */
11986 if (busiest_rq->nr_running <= 1)
11987 goto out_unlock;
11988
11989 /*
11990 * This condition is "impossible", if it occurs
11991 * we need to fix it. Originally reported by
11992 * Bjorn Helgaas on a 128-CPU setup.
11993 */
11994 WARN_ON_ONCE(busiest_rq == target_rq);
11995
11996 /* Search for an sd spanning us and the target CPU. */
11997 rcu_read_lock();
11998 for_each_domain(target_cpu, sd) {
11999 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
12000 break;
12001 }
12002
12003 if (likely(sd)) {
12004 struct lb_env env = {
12005 .sd = sd,
12006 .dst_cpu = target_cpu,
12007 .dst_rq = target_rq,
12008 .src_cpu = busiest_rq->cpu,
12009 .src_rq = busiest_rq,
12010 .idle = CPU_IDLE,
12011 .flags = LBF_ACTIVE_LB,
12012 };
12013
12014 schedstat_inc(sd->alb_count);
12015 update_rq_clock(busiest_rq);
12016
12017 p = detach_one_task(&env);
12018 if (p) {
12019 schedstat_inc(sd->alb_pushed);
12020 /* Active balancing done, reset the failure counter. */
12021 sd->nr_balance_failed = 0;
12022 } else {
12023 schedstat_inc(sd->alb_failed);
12024 }
12025 }
12026 rcu_read_unlock();
12027 out_unlock:
12028 busiest_rq->active_balance = 0;
12029 rq_unlock(busiest_rq, &rf);
12030
12031 if (p)
12032 attach_one_task(target_rq, p);
12033
12034 local_irq_enable();
12035
12036 return 0;
12037 }
12038
12039 /*
12040 * This flag serializes load-balancing passes over large domains
12041 * (above the NODE topology level) - only one load-balancing instance
12042 * may run at a time, to reduce overhead on very large systems with
12043 * lots of CPUs and large NUMA distances.
12044 *
12045 * - Note that load-balancing passes triggered while another one
12046 * is executing are skipped and not re-tried.
12047 *
12048 * - Also note that this does not serialize rebalance_domains()
12049 * execution, as non-SD_SERIALIZE domains will still be
12050 * load-balanced in parallel.
12051 */
12052 static atomic_t sched_balance_running = ATOMIC_INIT(0);
12053
12054 /*
12055 * Scale the max sched_balance_rq interval with the number of CPUs in the system.
12056 * This trades load-balance latency on larger machines for less cross talk.
12057 */
12058 void update_max_interval(void)
12059 {
12060 max_load_balance_interval = HZ*num_online_cpus()/10;
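	/*
	 * E.g. with HZ = 1000 and 8 online CPUs (a hypothetical configuration)
	 * this yields 1000 * 8 / 10 = 800 jiffies, i.e. an 800ms cap on the
	 * per-domain balance interval.
	 */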
12061 }
12062
12063 static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
12064 {
12065 if (cost > sd->max_newidle_lb_cost) {
12066 /*
12067 * Track max cost of a domain to make sure to not delay the
12068 * next wakeup on the CPU.
12069 *
12070 * sched_balance_newidle() bumps the cost whenever newidle
12071 * balance fails, and we don't want things to grow out of
12072 * control. Use the sysctl_sched_migration_cost as the upper
12073 		 * limit, plus a little extra to avoid off-by-ones.
12074 */
12075 sd->max_newidle_lb_cost =
12076 min(cost, sysctl_sched_migration_cost + 200);
12077 sd->last_decay_max_lb_cost = jiffies;
12078 } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
12079 /*
12080 * Decay the newidle max times by ~1% per second to ensure that
12081 * it is not outdated and the current max cost is actually
12082 * shorter.
12083 */
12084 sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
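		/*
		 * 253/256 ~= 0.988, i.e. slightly over 1% is shaved off per
		 * decay; after about a minute of consecutive decays the tracked
		 * cost drops to roughly half (0.988^60 ~= 0.49).
		 */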
12085 sd->last_decay_max_lb_cost = jiffies;
12086
12087 return true;
12088 }
12089
12090 return false;
12091 }
12092
12093 /*
12094 * It checks each scheduling domain to see if it is due to be balanced,
12095 * and initiates a balancing operation if so.
12096 *
12097 * Balancing parameters are set up in init_sched_domains.
12098 */
12099 static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
12100 {
12101 int continue_balancing = 1;
12102 int cpu = rq->cpu;
12103 int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
12104 unsigned long interval;
12105 struct sched_domain *sd;
12106 /* Earliest time when we have to do rebalance again */
12107 unsigned long next_balance = jiffies + 60*HZ;
12108 int update_next_balance = 0;
12109 int need_serialize, need_decay = 0;
12110 u64 max_cost = 0;
12111
12112 rcu_read_lock();
12113 for_each_domain(cpu, sd) {
12114 /*
12115 * Decay the newidle max times here because this is a regular
12116 * visit to all the domains.
12117 */
12118 need_decay = update_newidle_cost(sd, 0);
12119 max_cost += sd->max_newidle_lb_cost;
12120
12121 /*
12122 * Stop the load balance at this level. There is another
12123 * CPU in our sched group which is doing load balancing more
12124 * actively.
12125 */
12126 if (!continue_balancing) {
12127 if (need_decay)
12128 continue;
12129 break;
12130 }
12131
12132 interval = get_sd_balance_interval(sd, busy);
12133
12134 need_serialize = sd->flags & SD_SERIALIZE;
12135 if (need_serialize) {
12136 if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
12137 goto out;
12138 }
12139
12140 if (time_after_eq(jiffies, sd->last_balance + interval)) {
12141 if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
12142 /*
12143 * The LBF_DST_PINNED logic could have changed
12144 * env->dst_cpu, so we can't know our idle
12145 * state even if we migrated tasks. Update it.
12146 */
12147 idle = idle_cpu(cpu);
12148 busy = !idle && !sched_idle_cpu(cpu);
12149 }
12150 sd->last_balance = jiffies;
12151 interval = get_sd_balance_interval(sd, busy);
12152 }
12153 if (need_serialize)
12154 atomic_set_release(&sched_balance_running, 0);
12155 out:
12156 if (time_after(next_balance, sd->last_balance + interval)) {
12157 next_balance = sd->last_balance + interval;
12158 update_next_balance = 1;
12159 }
12160 }
12161 if (need_decay) {
12162 /*
12163 * Ensure the rq-wide value also decays but keep it at a
12164 * reasonable floor to avoid funnies with rq->avg_idle.
12165 */
12166 rq->max_idle_balance_cost =
12167 max((u64)sysctl_sched_migration_cost, max_cost);
12168 }
12169 rcu_read_unlock();
12170
12171 /*
12172 * next_balance will be updated only when there is a need.
12173 * When the CPU is attached to a NULL domain, for example, it will
12174 * not be updated.
12175 */
12176 if (likely(update_next_balance))
12177 rq->next_balance = next_balance;
12178
12179 }
12180
12181 static inline int on_null_domain(struct rq *rq)
12182 {
12183 return unlikely(!rcu_dereference_sched(rq->sd));
12184 }
12185
12186 #ifdef CONFIG_NO_HZ_COMMON
12187 /*
12188 * NOHZ idle load balancing (ILB) details:
12189 *
12190 * - When one of the busy CPUs notices that idle rebalancing may be
12191 * needed, it kicks the idle load balancer, which then does idle
12192 * load balancing for all the idle CPUs.
12193 */
12194 static inline int find_new_ilb(void)
12195 {
12196 const struct cpumask *hk_mask;
12197 int ilb_cpu;
12198
12199 hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
12200
12201 for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
12202
12203 if (ilb_cpu == smp_processor_id())
12204 continue;
12205
12206 if (idle_cpu(ilb_cpu))
12207 return ilb_cpu;
12208 }
12209
12210 return -1;
12211 }
12212
12213 /*
12214 * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
12215 * SMP function call (IPI).
12216 *
12217 * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
12218 * (if there is one).
12219 */
12220 static void kick_ilb(unsigned int flags)
12221 {
12222 int ilb_cpu;
12223
12224 /*
12225 * Increase nohz.next_balance only when a full ILB is triggered, but
12226 * not if we only update stats.
12227 */
12228 if (flags & NOHZ_BALANCE_KICK)
12229 nohz.next_balance = jiffies+1;
12230
12231 ilb_cpu = find_new_ilb();
12232 if (ilb_cpu < 0)
12233 return;
12234
12235 /*
12236 * Don't bother if no new NOHZ balance work items for ilb_cpu,
12237 * i.e. all bits in flags are already set in ilb_cpu.
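 * For example (illustrative flag combination): if ilb_cpu already has
 * NOHZ_STATS_KICK | NOHZ_BALANCE_KICK pending and we were asked for
 * NOHZ_STATS_KICK alone, there is nothing new to signal.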
12238 */
12239 if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags)
12240 return;
12241
12242 /*
12243 * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
12244 * the first flag owns it; cleared by nohz_csd_func().
12245 */
12246 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
12247 if (flags & NOHZ_KICK_MASK)
12248 return;
12249
12250 /*
12251 * This way we generate an IPI on the target CPU which
12252 * is idle, and the softirq performing NOHZ idle load balancing
12253 * will be run before returning from the IPI.
12254 */
12255 smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
12256 }
12257
12258 /*
12259 * Current decision point for kicking the idle load balancer in the presence
12260 * of idle CPUs in the system.
12261 */
12262 static void nohz_balancer_kick(struct rq *rq)
12263 {
12264 unsigned long now = jiffies;
12265 struct sched_domain_shared *sds;
12266 struct sched_domain *sd;
12267 int nr_busy, i, cpu = rq->cpu;
12268 unsigned int flags = 0;
12269
12270 if (unlikely(rq->idle_balance))
12271 return;
12272
12273 /*
12274 * We may recently have been in ticked or tickless idle mode. At the first
12275 * busy tick after returning from idle, we will update the busy stats.
12276 */
12277 nohz_balance_exit_idle(rq);
12278
12279 /*
12280 * None are in tickless mode and hence no need for NOHZ idle load
12281 * balancing:
12282 */
12283 if (likely(!atomic_read(&nohz.nr_cpus)))
12284 return;
12285
12286 if (READ_ONCE(nohz.has_blocked) &&
12287 time_after(now, READ_ONCE(nohz.next_blocked)))
12288 flags = NOHZ_STATS_KICK;
12289
12290 if (time_before(now, nohz.next_balance))
12291 goto out;
12292
12293 if (rq->nr_running >= 2) {
12294 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12295 goto out;
12296 }
12297
12298 rcu_read_lock();
12299
12300 sd = rcu_dereference(rq->sd);
12301 if (sd) {
12302 /*
12303 * If there's a runnable CFS task and the current CPU has reduced
12304 * capacity, kick the ILB to see if there's a better CPU to run on:
12305 */
12306 if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) {
12307 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12308 goto unlock;
12309 }
12310 }
12311
12312 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
12313 if (sd) {
12314 /*
12315 * When ASYM_PACKING; see if there's a more preferred CPU
12316 * currently idle; in which case, kick the ILB to move tasks
12317 * around.
12318 *
12319 * When balancing between cores, all the SMT siblings of the
12320 * preferred CPU must be idle.
12321 */
12322 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
12323 if (sched_asym(sd, i, cpu)) {
12324 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12325 goto unlock;
12326 }
12327 }
12328 }
12329
12330 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
12331 if (sd) {
12332 /*
12333 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
12334 * to run the misfit task on.
12335 */
12336 if (check_misfit_status(rq)) {
12337 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12338 goto unlock;
12339 }
12340
12341 /*
12342 * For asymmetric systems, we do not want to nicely balance
12343 * cache use, instead we want to embrace asymmetry and only
12344 * ensure tasks have enough CPU capacity.
12345 *
12346 * Skip the LLC logic because it's not relevant in that case.
12347 */
12348 goto unlock;
12349 }
12350
12351 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
12352 if (sds) {
12353 /*
12354 * If there is an imbalance between LLC domains (IOW we could
12355 * increase the overall cache utilization), we need a less-loaded LLC
12356 * domain to pull some load from. Likewise, we may need to spread
12357 * load within the current LLC domain (e.g. packed SMT cores but
12358 * other CPUs are idle). We can't really know from here how busy
12359 * the others are - so just get a NOHZ balance going if it looks
12360 * like this LLC domain has tasks we could move.
12361 */
12362 nr_busy = atomic_read(&sds->nr_busy_cpus);
12363 if (nr_busy > 1) {
12364 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12365 goto unlock;
12366 }
12367 }
12368 unlock:
12369 rcu_read_unlock();
12370 out:
12371 if (READ_ONCE(nohz.needs_update))
12372 flags |= NOHZ_NEXT_KICK;
12373
12374 if (flags)
12375 kick_ilb(flags);
12376 }
12377
12378 static void set_cpu_sd_state_busy(int cpu)
12379 {
12380 struct sched_domain *sd;
12381
12382 rcu_read_lock();
12383 sd = rcu_dereference(per_cpu(sd_llc, cpu));
12384
12385 if (!sd || !sd->nohz_idle)
12386 goto unlock;
12387 sd->nohz_idle = 0;
12388
12389 atomic_inc(&sd->shared->nr_busy_cpus);
12390 unlock:
12391 rcu_read_unlock();
12392 }
12393
12394 void nohz_balance_exit_idle(struct rq *rq)
12395 {
12396 WARN_ON_ONCE(rq != this_rq());
12397
12398 if (likely(!rq->nohz_tick_stopped))
12399 return;
12400
12401 rq->nohz_tick_stopped = 0;
12402 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
12403 atomic_dec(&nohz.nr_cpus);
12404
12405 set_cpu_sd_state_busy(rq->cpu);
12406 }
12407
12408 static void set_cpu_sd_state_idle(int cpu)
12409 {
12410 struct sched_domain *sd;
12411
12412 rcu_read_lock();
12413 sd = rcu_dereference(per_cpu(sd_llc, cpu));
12414
12415 if (!sd || sd->nohz_idle)
12416 goto unlock;
12417 sd->nohz_idle = 1;
12418
12419 atomic_dec(&sd->shared->nr_busy_cpus);
12420 unlock:
12421 rcu_read_unlock();
12422 }
12423
12424 /*
12425 * This routine will record that the CPU is going idle with tick stopped.
12426 * This info will be used in performing idle load balancing in the future.
12427 */
12428 void nohz_balance_enter_idle(int cpu)
12429 {
12430 struct rq *rq = cpu_rq(cpu);
12431
12432 WARN_ON_ONCE(cpu != smp_processor_id());
12433
12434 /* If this CPU is going down, then nothing needs to be done: */
12435 if (!cpu_active(cpu))
12436 return;
12437
12438 /*
12439 * This can be set safely without rq->lock held:
12440 * if a clear happens, it will have evaluated the last additions, because
12441 * rq->lock is held during both the check and the clear.
12442 */
12443 rq->has_blocked_load = 1;
12444
12445 /*
12446 * The tick is still stopped but load could have been added in the
12447 * meantime. We set the nohz.has_blocked flag to trigger a check of the
12448 * *_avg. The CPU is already part of nohz.idle_cpus_mask, so the clearing
12449 * of nohz.has_blocked can only happen after the new load has been checked.
12450 */
12451 if (rq->nohz_tick_stopped)
12452 goto out;
12453
12454 /* If we're a completely isolated CPU, we don't play: */
12455 if (on_null_domain(rq))
12456 return;
12457
12458 rq->nohz_tick_stopped = 1;
12459
12460 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
12461 atomic_inc(&nohz.nr_cpus);
12462
12463 /*
12464 * Ensures that if nohz_idle_balance() fails to observe our
12465 * @idle_cpus_mask store, it must observe the @has_blocked
12466 * and @needs_update stores.
12467 */
12468 smp_mb__after_atomic();
12469
12470 set_cpu_sd_state_idle(cpu);
12471
12472 WRITE_ONCE(nohz.needs_update, 1);
12473 out:
12474 /*
12475 * Each time a CPU enters idle, we assume it has blocked load and
12476 * enable the periodic update of the load of idle CPUs.
12477 */
12478 WRITE_ONCE(nohz.has_blocked, 1);
12479 }
12480
12481 static bool update_nohz_stats(struct rq *rq)
12482 {
12483 unsigned int cpu = rq->cpu;
12484
12485 if (!rq->has_blocked_load)
12486 return false;
12487
12488 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
12489 return false;
12490
12491 if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
12492 return true;
12493
12494 sched_balance_update_blocked_averages(cpu);
12495
12496 return rq->has_blocked_load;
12497 }
12498
12499 /*
12500 * Internal function that runs load balancing for all idle CPUs. The load balance
12501 * can be a simple update of blocked load or a complete load balance with
12502 * task movement, depending on the flags.
12503 */
12504 static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
12505 {
12506 /* Earliest time when we have to do rebalance again */
12507 unsigned long now = jiffies;
12508 unsigned long next_balance = now + 60*HZ;
12509 bool has_blocked_load = false;
12510 int update_next_balance = 0;
12511 int this_cpu = this_rq->cpu;
12512 int balance_cpu;
12513 struct rq *rq;
12514
12515 WARN_ON_ONCE((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
12516
12517 /*
12518 * We assume there will be no idle load after this update and clear
12519 * the has_blocked flag. If a CPU enters idle in the meantime, it will
12520 * set the has_blocked flag and trigger another update of idle load.
12521 * Because a CPU that becomes idle is added to idle_cpus_mask before
12522 * setting the flag, we are sure not to clear the state and not to
12523 * check the load of an idle CPU.
12524 *
12525 * Same applies to idle_cpus_mask vs needs_update.
12526 */
12527 if (flags & NOHZ_STATS_KICK)
12528 WRITE_ONCE(nohz.has_blocked, 0);
12529 if (flags & NOHZ_NEXT_KICK)
12530 WRITE_ONCE(nohz.needs_update, 0);
12531
12532 /*
12533 * Ensures that if we miss the CPU, we must see the has_blocked
12534 * store from nohz_balance_enter_idle().
12535 */
12536 smp_mb();
12537
12538 /*
12539 * Start with the next CPU after this_cpu so we will end with this_cpu and
12540 * give the other idle CPUs a chance to pull load first.
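 * For example (illustrative CPU numbers): with nohz.idle_cpus_mask = {1,3,5}
 * and this_cpu = 3, the walk visits 5, then 1, and finally 3 itself.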
12541 */
12542 for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
12543 if (!idle_cpu(balance_cpu))
12544 continue;
12545
12546 /*
12547 * If this CPU gets work to do, stop the load balancing
12548 * work being done for other CPUs. Next load
12549 * balancing owner will pick it up.
12550 */
12551 if (!idle_cpu(this_cpu) && need_resched()) {
12552 if (flags & NOHZ_STATS_KICK)
12553 has_blocked_load = true;
12554 if (flags & NOHZ_NEXT_KICK)
12555 WRITE_ONCE(nohz.needs_update, 1);
12556 goto abort;
12557 }
12558
12559 rq = cpu_rq(balance_cpu);
12560
12561 if (flags & NOHZ_STATS_KICK)
12562 has_blocked_load |= update_nohz_stats(rq);
12563
12564 /*
12565 * If time for next balance is due,
12566 * do the balance.
12567 */
12568 if (time_after_eq(jiffies, rq->next_balance)) {
12569 struct rq_flags rf;
12570
12571 rq_lock_irqsave(rq, &rf);
12572 update_rq_clock(rq);
12573 rq_unlock_irqrestore(rq, &rf);
12574
12575 if (flags & NOHZ_BALANCE_KICK)
12576 sched_balance_domains(rq, CPU_IDLE);
12577 }
12578
12579 if (time_after(next_balance, rq->next_balance)) {
12580 next_balance = rq->next_balance;
12581 update_next_balance = 1;
12582 }
12583 }
12584
12585 /*
12586 * next_balance will be updated only when there is a need.
12587 * When the CPU is attached to a NULL domain, for example, it will
12588 * not be updated.
12589 */
12590 if (likely(update_next_balance))
12591 nohz.next_balance = next_balance;
12592
12593 if (flags & NOHZ_STATS_KICK)
12594 WRITE_ONCE(nohz.next_blocked,
12595 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
12596
12597 abort:
12598 /* There is still blocked load, enable periodic update */
12599 if (has_blocked_load)
12600 WRITE_ONCE(nohz.has_blocked, 1);
12601 }
12602
12603 /*
12604 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
12605 * rebalancing for all the CPUs for whom scheduler ticks are stopped.
12606 */
12607 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
12608 {
12609 unsigned int flags = this_rq->nohz_idle_balance;
12610
12611 if (!flags)
12612 return false;
12613
12614 this_rq->nohz_idle_balance = 0;
12615
12616 if (idle != CPU_IDLE)
12617 return false;
12618
12619 _nohz_idle_balance(this_rq, flags);
12620
12621 return true;
12622 }
12623
12624 /*
12625 * Check if we need to directly run the ILB for updating blocked load before
12626 * entering idle state. Here we run ILB directly without issuing IPIs.
12627 *
12628 * Note that when this function is called, the tick may not have been stopped
12629 * on this CPU yet. nohz.idle_cpus_mask is updated only when the tick is stopped
12630 * and cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
12631 * don't align with CPUs entering/exiting idle, to avoid bottlenecks at high idle
12632 * entry/exit rates (usecs). So it is possible that _nohz_idle_balance() is
12633 * called from this function on (this) CPU that's not yet in the mask. That's
12634 * OK, because the goal of nohz_run_idle_balance() is to run the ILB only to
12635 * update the blocked load of already idle CPUs, without waking up one of
12636 * those idle CPUs, and outside the preempt-disable / IRQ-off phase of the local
12637 * CPU that is about to enter idle, because that phase can take a long time.
12638 */
12639 void nohz_run_idle_balance(int cpu)
12640 {
12641 unsigned int flags;
12642
12643 flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
12644
12645 /*
12646 * Update the blocked load only if no SCHED_SOFTIRQ is about to happen
12647 * that would do the same update (i.e. NOHZ_STATS_KICK is not also set).
12648 */
12649 if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
12650 _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
12651 }
12652
12653 static void nohz_newidle_balance(struct rq *this_rq)
12654 {
12655 int this_cpu = this_rq->cpu;
12656
12657 /* Will wake up very soon. No time to do anything else. */
12658 if (this_rq->avg_idle < sysctl_sched_migration_cost)
12659 return;
12660
12661 /* Don't need to update the blocked load of idle CPUs */
12662 if (!READ_ONCE(nohz.has_blocked) ||
12663 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
12664 return;
12665
12666 /*
12667 * Set the need to trigger ILB in order to update blocked load
12668 * before entering idle state.
12669 */
12670 atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
12671 }
12672
12673 #else /* !CONFIG_NO_HZ_COMMON: */
12674 static inline void nohz_balancer_kick(struct rq *rq) { }
12675
12676 static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
12677 {
12678 return false;
12679 }
12680
12681 static inline void nohz_newidle_balance(struct rq *this_rq) { }
12682 #endif /* !CONFIG_NO_HZ_COMMON */
12683
12684 /*
12685 * sched_balance_newidle is called by schedule() if this_cpu is about to become
12686 * idle. Attempts to pull tasks from other CPUs.
12687 *
12688 * Returns:
12689 * < 0 - we released the lock and there are !fair tasks present
12690 * 0 - failed, no new tasks
12691 * > 0 - success, new (fair) tasks present
12692 */
12693 static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
12694 {
12695 unsigned long next_balance = jiffies + HZ;
12696 int this_cpu = this_rq->cpu;
12697 int continue_balancing = 1;
12698 u64 t0, t1, curr_cost = 0;
12699 struct sched_domain *sd;
12700 int pulled_task = 0;
12701
12702 update_misfit_status(NULL, this_rq);
12703
12704 /*
12705 * There is a task waiting to run. No need to search for one.
12706 * Return 0; the task will be enqueued when switching to idle.
12707 */
12708 if (this_rq->ttwu_pending)
12709 return 0;
12710
12711 /*
12712 * We must set idle_stamp _before_ calling sched_balance_rq()
12713 * for CPU_NEWLY_IDLE, such that we measure this duration
12714 * as idle time.
12715 */
12716 this_rq->idle_stamp = rq_clock(this_rq);
12717
12718 /*
12719 * Do not pull tasks towards !active CPUs...
12720 */
12721 if (!cpu_active(this_cpu))
12722 return 0;
12723
12724 /*
12725 * This is OK, because current is on_cpu, which avoids it being picked
12726 * for load-balance; preemption/IRQs are still disabled, avoiding
12727 * further scheduler activity on it; and we're being very careful to
12728 * re-start the picking loop.
12729 */
12730 rq_unpin_lock(this_rq, rf);
12731
12732 rcu_read_lock();
12733 sd = rcu_dereference_check_sched_domain(this_rq->sd);
12734
12735 if (!get_rd_overloaded(this_rq->rd) ||
12736 (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
12737
12738 if (sd)
12739 update_next_balance(sd, &next_balance);
12740 rcu_read_unlock();
12741
12742 goto out;
12743 }
12744 rcu_read_unlock();
12745
12746 raw_spin_rq_unlock(this_rq);
12747
12748 t0 = sched_clock_cpu(this_cpu);
12749 sched_balance_update_blocked_averages(this_cpu);
12750
12751 rcu_read_lock();
12752 for_each_domain(this_cpu, sd) {
12753 u64 domain_cost;
12754
12755 update_next_balance(sd, &next_balance);
12756
12757 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
12758 break;
12759
12760 if (sd->flags & SD_BALANCE_NEWIDLE) {
12761
12762 pulled_task = sched_balance_rq(this_cpu, this_rq,
12763 sd, CPU_NEWLY_IDLE,
12764 &continue_balancing);
12765
12766 t1 = sched_clock_cpu(this_cpu);
12767 domain_cost = t1 - t0;
12768 curr_cost += domain_cost;
12769 t0 = t1;
12770
12771 /*
12772 * Failing newidle means it is not effective;
12773 * bump the cost so we end up doing less of it.
12774 */
12775 if (!pulled_task)
12776 domain_cost = (3 * sd->max_newidle_lb_cost) / 2;
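/*
 * E.g. (illustrative numbers): a stored max cost of 400us is fed to
 * update_newidle_cost() as 600us here, so the avg_idle cut-off above
 * trips earlier on subsequent newidle passes (subject to the clamp
 * applied in update_newidle_cost()).
 */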
12777
12778 update_newidle_cost(sd, domain_cost);
12779 }
12780
12781 /*
12782 * Stop searching for tasks to pull if there are
12783 * now runnable tasks on this rq.
12784 */
12785 if (pulled_task || !continue_balancing)
12786 break;
12787 }
12788 rcu_read_unlock();
12789
12790 raw_spin_rq_lock(this_rq);
12791
12792 if (curr_cost > this_rq->max_idle_balance_cost)
12793 this_rq->max_idle_balance_cost = curr_cost;
12794
12795 /*
12796 * While browsing the domains we released the rq lock; a task could
12797 * have been enqueued in the meantime. Since we're not going idle,
12798 * pretend we pulled a task.
12799 */
12800 if (this_rq->cfs.h_nr_queued && !pulled_task)
12801 pulled_task = 1;
12802
12803 /* Is there a task of a higher-priority class? */
12804 if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
12805 pulled_task = -1;
12806
12807 out:
12808 /* Move the next balance forward */
12809 if (time_after(this_rq->next_balance, next_balance))
12810 this_rq->next_balance = next_balance;
12811
12812 if (pulled_task)
12813 this_rq->idle_stamp = 0;
12814 else
12815 nohz_newidle_balance(this_rq);
12816
12817 rq_repin_lock(this_rq, rf);
12818
12819 return pulled_task;
12820 }
12821
12822 /*
12823 * This softirq handler is triggered via SCHED_SOFTIRQ from two places:
12824 *
12825 * - directly from the local sched_tick() for periodic load balancing
12826 *
12827 * - indirectly from a remote sched_tick() for NOHZ idle balancing
12828 * through the SMP cross-call nohz_csd_func()
12829 */
12830 static __latent_entropy void sched_balance_softirq(void)
12831 {
12832 struct rq *this_rq = this_rq();
12833 enum cpu_idle_type idle = this_rq->idle_balance;
12834 /*
12835 * If this CPU has a pending NOHZ_BALANCE_KICK, then do the
12836 * balancing on behalf of the other idle CPUs whose ticks are
12837 * stopped. Do nohz_idle_balance *before* sched_balance_domains to
12838 * give the idle CPUs a chance to load balance. Else we may
12839 * load balance only within the local sched_domain hierarchy
12840 * and abort nohz_idle_balance altogether if we pull some load.
12841 */
12842 if (nohz_idle_balance(this_rq, idle))
12843 return;
12844
12845 /* normal load balance */
12846 sched_balance_update_blocked_averages(this_rq->cpu);
12847 sched_balance_domains(this_rq, idle);
12848 }
12849
12850 /*
12851 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
12852 */
12853 void sched_balance_trigger(struct rq *rq)
12854 {
12855 /*
12856 * No need to rebalance while attached to a NULL domain or while
12857 * the runqueue's CPU is not active.
12858 */
12859 if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
12860 return;
12861
12862 if (time_after_eq(jiffies, rq->next_balance))
12863 raise_softirq(SCHED_SOFTIRQ);
12864
12865 nohz_balancer_kick(rq);
12866 }
12867
12868 static void rq_online_fair(struct rq *rq)
12869 {
12870 update_sysctl();
12871
12872 update_runtime_enabled(rq);
12873 }
12874
12875 static void rq_offline_fair(struct rq *rq)
12876 {
12877 update_sysctl();
12878
12879 /* Ensure any throttled groups are reachable by pick_next_task */
12880 unthrottle_offline_cfs_rqs(rq);
12881
12882 /* Ensure that we remove rq contribution to group share: */
12883 clear_tg_offline_cfs_rqs(rq);
12884 }
12885
12886 #ifdef CONFIG_SCHED_CORE
12887 static inline bool
12888 __entity_slice_used(struct sched_entity *se, int min_nr_tasks)
12889 {
12890 u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
12891 u64 slice = se->slice;
12892
12893 return (rtime * min_nr_tasks > slice);
12894 }
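/*
 * In other words, rtime * min_nr_tasks > slice is rtime > slice / min_nr_tasks:
 * with MIN_NR_TASKS_DURING_FORCEIDLE == 2 below, the check fires once the
 * current task has consumed more than half of its slice.
 */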
12895
12896 #define MIN_NR_TASKS_DURING_FORCEIDLE 2
12897 static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
12898 {
12899 if (!sched_core_enabled(rq))
12900 return;
12901
12902 /*
12903 * If the runqueue has only one task, which has used up its slice, and
12904 * the sibling is forced idle, then trigger a reschedule to give the
12905 * forced-idle task a chance.
12906 *
12907 * sched_slice() considers only this active rq and it gets the
12908 * whole slice. But during force idle, we have siblings acting
12909 * like a single runqueue and hence we need to consider runnable
12910 * tasks on this CPU and the forced idle CPU. Ideally, we should
12911 * go through the forced idle rq, but that would be a perf hit.
12912 * We can assume that the forced idle CPU has at least
12913 * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
12914 * if we need to give up the CPU.
12915 */
12916 if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 &&
12917 __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
12918 resched_curr(rq);
12919 }
12920
12921 /*
12922 * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
12923 */
12924 static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
12925 bool forceidle)
12926 {
12927 for_each_sched_entity(se) {
12928 struct cfs_rq *cfs_rq = cfs_rq_of(se);
12929
12930 if (forceidle) {
12931 if (cfs_rq->forceidle_seq == fi_seq)
12932 break;
12933 cfs_rq->forceidle_seq = fi_seq;
12934 }
12935
12936 cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
12937 }
12938 }
12939
12940 void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
12941 {
12942 struct sched_entity *se = &p->se;
12943
12944 if (p->sched_class != &fair_sched_class)
12945 return;
12946
12947 se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
12948 }
12949
12950 bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
12951 bool in_fi)
12952 {
12953 struct rq *rq = task_rq(a);
12954 const struct sched_entity *sea = &a->se;
12955 const struct sched_entity *seb = &b->se;
12956 struct cfs_rq *cfs_rqa;
12957 struct cfs_rq *cfs_rqb;
12958 s64 delta;
12959
12960 WARN_ON_ONCE(task_rq(b)->core != rq->core);
12961
12962 #ifdef CONFIG_FAIR_GROUP_SCHED
12963 /*
12964 * Find an se in the hierarchy for tasks a and b, such that the se's
12965 * are immediate siblings.
12966 */
12967 while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
12968 int sea_depth = sea->depth;
12969 int seb_depth = seb->depth;
12970
12971 if (sea_depth >= seb_depth)
12972 sea = parent_entity(sea);
12973 if (sea_depth <= seb_depth)
12974 seb = parent_entity(seb);
12975 }
12976
12977 se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
12978 se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
12979
12980 cfs_rqa = sea->cfs_rq;
12981 cfs_rqb = seb->cfs_rq;
12982 #else /* !CONFIG_FAIR_GROUP_SCHED: */
12983 cfs_rqa = &task_rq(a)->cfs;
12984 cfs_rqb = &task_rq(b)->cfs;
12985 #endif /* !CONFIG_FAIR_GROUP_SCHED */
12986
12987 /*
12988 * Find delta after normalizing se's vruntime with its cfs_rq's
12989 * min_vruntime_fi, which would have been updated in prior calls
12990 * to se_fi_update().
12991 */
12992 delta = (s64)(sea->vruntime - seb->vruntime) +
12993 (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
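	/*
	 * I.e. delta = (sea->vruntime - cfs_rqa->min_vruntime_fi) -
	 *              (seb->vruntime - cfs_rqb->min_vruntime_fi):
	 * a positive delta means 'a' has received more normalized service
	 * than 'b', so 'b' is the one that deserves to run.
	 */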
12994
12995 return delta > 0;
12996 }
12997
12998 static int task_is_throttled_fair(struct task_struct *p, int cpu)
12999 {
13000 struct cfs_rq *cfs_rq;
13001
13002 #ifdef CONFIG_FAIR_GROUP_SCHED
13003 cfs_rq = task_group(p)->cfs_rq[cpu];
13004 #else
13005 cfs_rq = &cpu_rq(cpu)->cfs;
13006 #endif
13007 return throttled_hierarchy(cfs_rq);
13008 }
13009 #else /* !CONFIG_SCHED_CORE: */
13010 static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
13011 #endif /* !CONFIG_SCHED_CORE */
13012
13013 /*
13014 * scheduler tick hitting a task of our scheduling class.
13015 *
13016 * NOTE: This function can be called remotely by the tick offload that
13017 * goes along full dynticks. Therefore no local assumption can be made
13018 * and everything must be accessed through the @rq and @curr passed in
13019 * parameters.
13020 */
13021 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
13022 {
13023 struct cfs_rq *cfs_rq;
13024 struct sched_entity *se = &curr->se;
13025
13026 for_each_sched_entity(se) {
13027 cfs_rq = cfs_rq_of(se);
13028 entity_tick(cfs_rq, se, queued);
13029 }
13030
13031 if (static_branch_unlikely(&sched_numa_balancing))
13032 task_tick_numa(rq, curr);
13033
13034 update_misfit_status(curr, rq);
13035 check_update_overutilized_status(task_rq(curr));
13036
13037 task_tick_core(rq, curr);
13038 }
13039
13040 /*
13041 * called on fork with the child task as argument from the parent's context
13042 * - child not yet on the tasklist
13043 * - preemption disabled
13044 */
13045 static void task_fork_fair(struct task_struct *p)
13046 {
13047 set_task_max_allowed_capacity(p);
13048 }
13049
13050 /*
13051 * Priority of the task has changed. Check to see if we preempt
13052 * the current task.
13053 */
13054 static void
13055 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
13056 {
13057 if (!task_on_rq_queued(p))
13058 return;
13059
13060 if (rq->cfs.nr_queued == 1)
13061 return;
13062
13063 /*
13064 * Reschedule if we are currently running on this runqueue and
13065 * our priority decreased, or if we are not currently running on
13066 * this runqueue and our priority is higher than the current's
13067 */
13068 if (task_current_donor(rq, p)) {
13069 if (p->prio > oldprio)
13070 resched_curr(rq);
13071 } else
13072 wakeup_preempt(rq, p, 0);
13073 }
13074
13075 #ifdef CONFIG_FAIR_GROUP_SCHED
13076 /*
13077 * Propagate the changes of the sched_entity across the tg tree to make
13078 * them visible to the root.
13079 */
13080 static void propagate_entity_cfs_rq(struct sched_entity *se)
13081 {
13082 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13083
13084 if (cfs_rq_throttled(cfs_rq))
13085 return;
13086
13087 if (!throttled_hierarchy(cfs_rq))
13088 list_add_leaf_cfs_rq(cfs_rq);
13089
13090 /* Start to propagate at parent */
13091 se = se->parent;
13092
13093 for_each_sched_entity(se) {
13094 cfs_rq = cfs_rq_of(se);
13095
13096 update_load_avg(cfs_rq, se, UPDATE_TG);
13097
13098 if (cfs_rq_throttled(cfs_rq))
13099 break;
13100
13101 if (!throttled_hierarchy(cfs_rq))
13102 list_add_leaf_cfs_rq(cfs_rq);
13103 }
13104 }
13105 #else /* !CONFIG_FAIR_GROUP_SCHED: */
13106 static void propagate_entity_cfs_rq(struct sched_entity *se) { }
13107 #endif /* !CONFIG_FAIR_GROUP_SCHED */
13108
13109 static void detach_entity_cfs_rq(struct sched_entity *se)
13110 {
13111 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13112
13113 /*
13114 * In case the task sched_avg hasn't been attached:
13115 * - A forked task which hasn't been woken up by wake_up_new_task().
13116 * - A task which has been woken up by try_to_wake_up() but is
13117 * waiting for actually being woken up by sched_ttwu_pending().
13118 */
13119 if (!se->avg.last_update_time)
13120 return;
13121
13122 /* Catch up with the cfs_rq and remove our load when we leave */
13123 update_load_avg(cfs_rq, se, 0);
13124 detach_entity_load_avg(cfs_rq, se);
13125 update_tg_load_avg(cfs_rq);
13126 propagate_entity_cfs_rq(se);
13127 }
13128
13129 static void attach_entity_cfs_rq(struct sched_entity *se)
13130 {
13131 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13132
13133 /* Synchronize entity with its cfs_rq */
13134 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
13135 attach_entity_load_avg(cfs_rq, se);
13136 update_tg_load_avg(cfs_rq);
13137 propagate_entity_cfs_rq(se);
13138 }
13139
13140 static void detach_task_cfs_rq(struct task_struct *p)
13141 {
13142 struct sched_entity *se = &p->se;
13143
13144 detach_entity_cfs_rq(se);
13145 }
13146
13147 static void attach_task_cfs_rq(struct task_struct *p)
13148 {
13149 struct sched_entity *se = &p->se;
13150
13151 attach_entity_cfs_rq(se);
13152 }
13153
13154 static void switched_from_fair(struct rq *rq, struct task_struct *p)
13155 {
13156 detach_task_cfs_rq(p);
13157 }
13158
13159 static void switched_to_fair(struct rq *rq, struct task_struct *p)
13160 {
13161 WARN_ON_ONCE(p->se.sched_delayed);
13162
13163 attach_task_cfs_rq(p);
13164
13165 set_task_max_allowed_capacity(p);
13166
13167 if (task_on_rq_queued(p)) {
13168 /*
13169 * We were most likely switched from sched_rt, so
13170 * kick off the schedule if running, otherwise just see
13171 * if we can still preempt the current task.
13172 */
13173 if (task_current_donor(rq, p))
13174 resched_curr(rq);
13175 else
13176 wakeup_preempt(rq, p, 0);
13177 }
13178 }
13179
13180 static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
13181 {
13182 struct sched_entity *se = &p->se;
13183
13184 if (task_on_rq_queued(p)) {
13185 /*
13186 * Move the next running task to the front of the list, so that our
13187 * cfs_tasks list stays in MRU order.
13188 */
13189 list_move(&se->group_node, &rq->cfs_tasks);
13190 }
13191 if (!first)
13192 return;
13193
13194 WARN_ON_ONCE(se->sched_delayed);
13195
13196 if (hrtick_enabled_fair(rq))
13197 hrtick_start_fair(rq, p);
13198
13199 update_misfit_status(p, rq);
13200 sched_fair_update_stop_tick(rq, p);
13201 }
13202
13203 /*
13204 * Account for a task changing its policy or group.
13205 *
13206 * This routine is mostly called to set cfs_rq->curr field when a task
13207 * migrates between groups/classes.
13208 */
13209 static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
13210 {
13211 struct sched_entity *se = &p->se;
13212
13213 for_each_sched_entity(se) {
13214 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13215
13216 set_next_entity(cfs_rq, se);
13217 /* ensure bandwidth has been allocated on our new cfs_rq */
13218 account_cfs_rq_runtime(cfs_rq, 0);
13219 }
13220
13221 __set_next_task_fair(rq, p, first);
13222 }
13223
13224 void init_cfs_rq(struct cfs_rq *cfs_rq)
13225 {
13226 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
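	/*
	 * Presumably, starting min_vruntime just below zero makes the u64
	 * wrap-around happen early, so comparison bugs around vruntime
	 * surface quickly rather than only after a very long uptime.
	 */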
13227 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
13228 raw_spin_lock_init(&cfs_rq->removed.lock);
13229 }
13230
13231 #ifdef CONFIG_FAIR_GROUP_SCHED
13232 static void task_change_group_fair(struct task_struct *p)
13233 {
13234 /*
13235 * We can't detach or attach a forked task that hasn't yet been
13236 * woken up by wake_up_new_task().
13237 */
13238 if (READ_ONCE(p->__state) == TASK_NEW)
13239 return;
13240
13241 detach_task_cfs_rq(p);
13242
13243 /* Record that the se's cfs_rq has changed -- i.e. it has migrated */
13244 p->se.avg.last_update_time = 0;
13245 set_task_rq(p, task_cpu(p));
13246 attach_task_cfs_rq(p);
13247 }
13248
13249 void free_fair_sched_group(struct task_group *tg)
13250 {
13251 int i;
13252
13253 for_each_possible_cpu(i) {
13254 if (tg->cfs_rq)
13255 kfree(tg->cfs_rq[i]);
13256 if (tg->se)
13257 kfree(tg->se[i]);
13258 }
13259
13260 kfree(tg->cfs_rq);
13261 kfree(tg->se);
13262 }
13263
13264 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
13265 {
13266 struct sched_entity *se;
13267 struct cfs_rq *cfs_rq;
13268 int i;
13269
13270 tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
13271 if (!tg->cfs_rq)
13272 goto err;
13273 tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
13274 if (!tg->se)
13275 goto err;
13276
13277 tg->shares = NICE_0_LOAD;
13278
13279 init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
13280
13281 for_each_possible_cpu(i) {
13282 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
13283 GFP_KERNEL, cpu_to_node(i));
13284 if (!cfs_rq)
13285 goto err;
13286
13287 se = kzalloc_node(sizeof(struct sched_entity_stats),
13288 GFP_KERNEL, cpu_to_node(i));
13289 if (!se)
13290 goto err_free_rq;
13291
13292 init_cfs_rq(cfs_rq);
13293 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
13294 init_entity_runnable_average(se);
13295 }
13296
13297 return 1;
13298
13299 err_free_rq:
13300 kfree(cfs_rq);
13301 err:
13302 return 0;
13303 }
13304
13305 void online_fair_sched_group(struct task_group *tg)
13306 {
13307 struct sched_entity *se;
13308 struct rq_flags rf;
13309 struct rq *rq;
13310 int i;
13311
13312 for_each_possible_cpu(i) {
13313 rq = cpu_rq(i);
13314 se = tg->se[i];
13315 rq_lock_irq(rq, &rf);
13316 update_rq_clock(rq);
13317 attach_entity_cfs_rq(se);
13318 sync_throttle(tg, i);
13319 rq_unlock_irq(rq, &rf);
13320 }
13321 }
13322
13323 void unregister_fair_sched_group(struct task_group *tg)
13324 {
13325 int cpu;
13326
13327 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
13328
13329 for_each_possible_cpu(cpu) {
13330 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
13331 struct sched_entity *se = tg->se[cpu];
13332 struct rq *rq = cpu_rq(cpu);
13333
13334 if (se) {
13335 if (se->sched_delayed) {
13336 guard(rq_lock_irqsave)(rq);
13337 if (se->sched_delayed) {
13338 update_rq_clock(rq);
13339 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
13340 }
13341 list_del_leaf_cfs_rq(cfs_rq);
13342 }
13343 remove_entity_load_avg(se);
13344 }
13345
13346 /*
13347 * Only empty task groups can be destroyed, so we can speculatively
13348 * check on_list without danger of it being re-added.
13349 */
13350 if (cfs_rq->on_list) {
13351 guard(rq_lock_irqsave)(rq);
13352 list_del_leaf_cfs_rq(cfs_rq);
13353 }
13354 }
13355 }
13356
13357 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
13358 struct sched_entity *se, int cpu,
13359 struct sched_entity *parent)
13360 {
13361 struct rq *rq = cpu_rq(cpu);
13362
13363 cfs_rq->tg = tg;
13364 cfs_rq->rq = rq;
13365 init_cfs_rq_runtime(cfs_rq);
13366
13367 tg->cfs_rq[cpu] = cfs_rq;
13368 tg->se[cpu] = se;
13369
13370 /* se could be NULL for root_task_group */
13371 if (!se)
13372 return;
13373
13374 if (!parent) {
13375 se->cfs_rq = &rq->cfs;
13376 se->depth = 0;
13377 } else {
13378 se->cfs_rq = parent->my_q;
13379 se->depth = parent->depth + 1;
13380 }
13381
13382 se->my_q = cfs_rq;
13383 /* guarantee group entities always have weight */
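	/*
	 * The effective weight is recomputed later from tg->shares via
	 * update_cfs_group(); starting at NICE_0_LOAD simply avoids a
	 * zero-weight window for the new group entity.
	 */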
13384 update_load_set(&se->load, NICE_0_LOAD);
13385 se->parent = parent;
13386 }
13387
13388 static DEFINE_MUTEX(shares_mutex);
13389
13390 static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
13391 {
13392 int i;
13393
13394 lockdep_assert_held(&shares_mutex);
13395
13396 /*
13397 * We can't change the weight of the root cgroup.
13398 */
13399 if (!tg->se[0])
13400 return -EINVAL;
13401
13402 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
13403
13404 if (tg->shares == shares)
13405 return 0;
13406
13407 tg->shares = shares;
13408 for_each_possible_cpu(i) {
13409 struct rq *rq = cpu_rq(i);
13410 struct sched_entity *se = tg->se[i];
13411 struct rq_flags rf;
13412
13413 /* Propagate contribution to hierarchy */
13414 rq_lock_irqsave(rq, &rf);
13415 update_rq_clock(rq);
13416 for_each_sched_entity(se) {
13417 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
13418 update_cfs_group(se);
13419 }
13420 rq_unlock_irqrestore(rq, &rf);
13421 }
13422
13423 return 0;
13424 }
13425
13426 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
13427 {
13428 int ret;
13429
13430 mutex_lock(&shares_mutex);
13431 if (tg_is_idle(tg))
13432 ret = -EINVAL;
13433 else
13434 ret = __sched_group_set_shares(tg, shares);
13435 mutex_unlock(&shares_mutex);
13436
13437 return ret;
13438 }
13439
13440 int sched_group_set_idle(struct task_group *tg, long idle)
13441 {
13442 int i;
13443
13444 if (tg == &root_task_group)
13445 return -EINVAL;
13446
13447 if (idle < 0 || idle > 1)
13448 return -EINVAL;
13449
13450 mutex_lock(&shares_mutex);
13451
13452 if (tg->idle == idle) {
13453 mutex_unlock(&shares_mutex);
13454 return 0;
13455 }
13456
13457 tg->idle = idle;
13458
13459 for_each_possible_cpu(i) {
13460 struct rq *rq = cpu_rq(i);
13461 struct sched_entity *se = tg->se[i];
13462 struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
13463 bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
13464 long idle_task_delta;
13465 struct rq_flags rf;
13466
13467 rq_lock_irqsave(rq, &rf);
13468
13469 grp_cfs_rq->idle = idle;
13470 if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
13471 goto next_cpu;
13472
13473 idle_task_delta = grp_cfs_rq->h_nr_queued -
13474 grp_cfs_rq->h_nr_idle;
13475 if (!cfs_rq_is_idle(grp_cfs_rq))
13476 idle_task_delta *= -1;
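		/*
		 * Illustrative example: a group cfs_rq with h_nr_queued == 4
		 * and h_nr_idle == 1 that is being switched to idle makes its
		 * 3 non-idle tasks count as idle in every ancestor, so the
		 * delta is +3; switching it back to non-idle negates it to -3.
		 */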
13477
13478 for_each_sched_entity(se) {
13479 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13480
13481 if (!se->on_rq)
13482 break;
13483
13484 cfs_rq->h_nr_idle += idle_task_delta;
13485
13486 /* Already accounted at parent level and above. */
13487 if (cfs_rq_is_idle(cfs_rq))
13488 break;
13489 }
13490
13491 next_cpu:
13492 rq_unlock_irqrestore(rq, &rf);
13493 }
13494
13495 /* Idle groups have minimum weight. */
13496 if (tg_is_idle(tg))
13497 __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
13498 else
13499 __sched_group_set_shares(tg, NICE_0_LOAD);
13500
13501 mutex_unlock(&shares_mutex);
13502 return 0;
13503 }
13504
13505 #endif /* CONFIG_FAIR_GROUP_SCHED */
13506
13507
13508 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
13509 {
13510 struct sched_entity *se = &task->se;
13511 unsigned int rr_interval = 0;
13512
13513 /*
13514 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
13515 * idle runqueue:
13516 */
13517 if (rq->cfs.load.weight)
13518 rr_interval = NS_TO_JIFFIES(se->slice);
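	/*
	 * E.g. (illustrative numbers): a 3ms slice with HZ=1000 is reported
	 * as an rr_interval of 3 jiffies through this class hook.
	 */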
13519
13520 return rr_interval;
13521 }
13522
13523 /*
13524 * All the scheduling class methods:
13525 */
13526 DEFINE_SCHED_CLASS(fair) = {
13527
13528 .enqueue_task = enqueue_task_fair,
13529 .dequeue_task = dequeue_task_fair,
13530 .yield_task = yield_task_fair,
13531 .yield_to_task = yield_to_task_fair,
13532
13533 .wakeup_preempt = check_preempt_wakeup_fair,
13534
13535 .pick_task = pick_task_fair,
13536 .pick_next_task = __pick_next_task_fair,
13537 .put_prev_task = put_prev_task_fair,
13538 .set_next_task = set_next_task_fair,
13539
13540 .balance = balance_fair,
13541 .select_task_rq = select_task_rq_fair,
13542 .migrate_task_rq = migrate_task_rq_fair,
13543
13544 .rq_online = rq_online_fair,
13545 .rq_offline = rq_offline_fair,
13546
13547 .task_dead = task_dead_fair,
13548 .set_cpus_allowed = set_cpus_allowed_fair,
13549
13550 .task_tick = task_tick_fair,
13551 .task_fork = task_fork_fair,
13552
13553 .reweight_task = reweight_task_fair,
13554 .prio_changed = prio_changed_fair,
13555 .switched_from = switched_from_fair,
13556 .switched_to = switched_to_fair,
13557
13558 .get_rr_interval = get_rr_interval_fair,
13559
13560 .update_curr = update_curr_fair,
13561
13562 #ifdef CONFIG_FAIR_GROUP_SCHED
13563 .task_change_group = task_change_group_fair,
13564 #endif
13565
13566 #ifdef CONFIG_SCHED_CORE
13567 .task_is_throttled = task_is_throttled_fair,
13568 #endif
13569
13570 #ifdef CONFIG_UCLAMP_TASK
13571 .uclamp_enabled = 1,
13572 #endif
13573 };
13574
13575 void print_cfs_stats(struct seq_file *m, int cpu)
13576 {
13577 struct cfs_rq *cfs_rq, *pos;
13578
13579 rcu_read_lock();
13580 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
13581 print_cfs_rq(m, cpu, cfs_rq);
13582 rcu_read_unlock();
13583 }
13584
13585 #ifdef CONFIG_NUMA_BALANCING
13586 void show_numa_stats(struct task_struct *p, struct seq_file *m)
13587 {
13588 int node;
13589 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
13590 struct numa_group *ng;
13591
13592 rcu_read_lock();
13593 ng = rcu_dereference(p->numa_group);
13594 for_each_online_node(node) {
13595 if (p->numa_faults) {
13596 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
13597 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
13598 }
13599 if (ng) {
13600 gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
13601 gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
13602 }
13603 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
13604 }
13605 rcu_read_unlock();
13606 }
13607 #endif /* CONFIG_NUMA_BALANCING */
13608
13609 __init void init_sched_fair_class(void)
13610 {
13611 int i;
13612
13613 for_each_possible_cpu(i) {
13614 zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
13615 zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
13616 zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
13617 GFP_KERNEL, cpu_to_node(i));
13618
13619 #ifdef CONFIG_CFS_BANDWIDTH
13620 INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
13621 INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
13622 #endif
13623 }
13624
13625 open_softirq(SCHED_SOFTIRQ, sched_balance_softirq);
13626
13627 #ifdef CONFIG_NO_HZ_COMMON
13628 nohz.next_balance = jiffies;
13629 nohz.next_blocked = jiffies;
13630 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
13631 #endif
13632 }
13633