xref: /linux/kernel/sched/fair.c (revision dd7bd1093622621a910cbb6a77c7addeb20c9984)
1 /*
2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3  *
4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5  *
6  *  Interactivity improvements by Mike Galbraith
7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
8  *
9  *  Various enhancements by Dmitry Adamushko.
10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11  *
12  *  Group scheduling enhancements by Srivatsa Vaddagiri
13  *  Copyright IBM Corporation, 2007
14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15  *
16  *  Scaled math optimizations by Thomas Gleixner
17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18  *
19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
21  */
22 
23 #include <linux/sched.h>
24 #include <linux/latencytop.h>
25 #include <linux/cpumask.h>
26 #include <linux/cpuidle.h>
27 #include <linux/slab.h>
28 #include <linux/profile.h>
29 #include <linux/interrupt.h>
30 #include <linux/mempolicy.h>
31 #include <linux/migrate.h>
32 #include <linux/task_work.h>
33 
34 #include <trace/events/sched.h>
35 
36 #include "sched.h"
37 
38 /*
39  * Targeted preemption latency for CPU-bound tasks:
40  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
41  *
42  * NOTE: this latency value is not the same as the concept of
43  * 'timeslice length' - timeslices in CFS are of variable length
44  * and have no persistent notion like in traditional, time-slice
45  * based scheduling concepts.
46  *
47  * (to see the precise effective timeslice length of your workload,
48  *  run vmstat and monitor the context-switches (cs) field)
49  */
50 unsigned int sysctl_sched_latency = 6000000ULL;
51 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
52 
53 /*
54  * The initial- and re-scaling of tunables is configurable
55  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
56  *
57  * Options are:
58  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
59  * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
60  * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
61  */
62 enum sched_tunable_scaling sysctl_sched_tunable_scaling
63 	= SCHED_TUNABLESCALING_LOG;
64 
65 /*
66  * Minimal preemption granularity for CPU-bound tasks:
67  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
68  */
69 unsigned int sysctl_sched_min_granularity = 750000ULL;
70 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
71 
72 /*
73  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
74  */
75 static unsigned int sched_nr_latency = 8;
76 
77 /*
78  * After fork, child runs first. If set to 0 (default) then
79  * parent will (try to) run first.
80  */
81 unsigned int sysctl_sched_child_runs_first __read_mostly;
82 
83 /*
84  * SCHED_OTHER wake-up granularity.
85  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
86  *
87  * This option delays the preemption effects of decoupled workloads
88  * and reduces their over-scheduling. Synchronous workloads will still
89  * have immediate wakeup/sleep latencies.
90  */
91 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
92 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
93 
94 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
95 
96 /*
97  * The exponential sliding  window over which load is averaged for shares
98  * distribution.
99  * (default: 10msec)
100  */
101 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
102 
103 #ifdef CONFIG_CFS_BANDWIDTH
104 /*
105  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
106  * each time a cfs_rq requests quota.
107  *
108  * Note: in the case that the slice exceeds the runtime remaining (either due
109  * to consumption or the quota being specified to be smaller than the slice)
110  * we will always only issue the remaining available time.
111  *
112  * default: 5 msec, units: microseconds
113   */
114 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
115 #endif
116 
117 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
118 {
119 	lw->weight += inc;
120 	lw->inv_weight = 0;
121 }
122 
123 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
124 {
125 	lw->weight -= dec;
126 	lw->inv_weight = 0;
127 }
128 
129 static inline void update_load_set(struct load_weight *lw, unsigned long w)
130 {
131 	lw->weight = w;
132 	lw->inv_weight = 0;
133 }
134 
135 /*
136  * Increase the granularity value when there are more CPUs,
137  * because with more CPUs the 'effective latency' as visible
138  * to users decreases. But the relationship is not linear,
139  * so pick a second-best guess by going with the log2 of the
140  * number of CPUs.
141  *
142  * This idea comes from the SD scheduler of Con Kolivas:
143  */
144 static unsigned int get_update_sysctl_factor(void)
145 {
146 	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
147 	unsigned int factor;
148 
149 	switch (sysctl_sched_tunable_scaling) {
150 	case SCHED_TUNABLESCALING_NONE:
151 		factor = 1;
152 		break;
153 	case SCHED_TUNABLESCALING_LINEAR:
154 		factor = cpus;
155 		break;
156 	case SCHED_TUNABLESCALING_LOG:
157 	default:
158 		factor = 1 + ilog2(cpus);
159 		break;
160 	}
161 
162 	return factor;
163 }
164 
165 static void update_sysctl(void)
166 {
167 	unsigned int factor = get_update_sysctl_factor();
168 
169 #define SET_SYSCTL(name) \
170 	(sysctl_##name = (factor) * normalized_sysctl_##name)
171 	SET_SYSCTL(sched_min_granularity);
172 	SET_SYSCTL(sched_latency);
173 	SET_SYSCTL(sched_wakeup_granularity);
174 #undef SET_SYSCTL
175 }
176 
177 void sched_init_granularity(void)
178 {
179 	update_sysctl();
180 }
181 
182 #define WMULT_CONST	(~0U)
183 #define WMULT_SHIFT	32
184 
185 static void __update_inv_weight(struct load_weight *lw)
186 {
187 	unsigned long w;
188 
189 	if (likely(lw->inv_weight))
190 		return;
191 
192 	w = scale_load_down(lw->weight);
193 
194 	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
195 		lw->inv_weight = 1;
196 	else if (unlikely(!w))
197 		lw->inv_weight = WMULT_CONST;
198 	else
199 		lw->inv_weight = WMULT_CONST / w;
200 }
201 
202 /*
203  * delta_exec * weight / lw.weight
204  *   OR
205  * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
206  *
207  * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
208  * we're guaranteed shift stays positive because inv_weight is guaranteed to
209  * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
210  *
211  * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
212  * weight/lw.weight <= 1, and therefore our shift will also be positive.
213  */
214 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
215 {
216 	u64 fact = scale_load_down(weight);
217 	int shift = WMULT_SHIFT;
218 
219 	__update_inv_weight(lw);
220 
221 	if (unlikely(fact >> 32)) {
222 		while (fact >> 32) {
223 			fact >>= 1;
224 			shift--;
225 		}
226 	}
227 
228 	/* hint to use a 32x32->64 mul */
229 	fact = (u64)(u32)fact * lw->inv_weight;
230 
231 	while (fact >> 32) {
232 		fact >>= 1;
233 		shift--;
234 	}
235 
236 	return mul_u64_u32_shr(delta_exec, fact, shift);
237 }
238 
239 
240 const struct sched_class fair_sched_class;
241 
242 /**************************************************************
243  * CFS operations on generic schedulable entities:
244  */
245 
246 #ifdef CONFIG_FAIR_GROUP_SCHED
247 
248 /* cpu runqueue to which this cfs_rq is attached */
249 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
250 {
251 	return cfs_rq->rq;
252 }
253 
254 /* An entity is a task if it doesn't "own" a runqueue */
255 #define entity_is_task(se)	(!se->my_q)
256 
257 static inline struct task_struct *task_of(struct sched_entity *se)
258 {
259 #ifdef CONFIG_SCHED_DEBUG
260 	WARN_ON_ONCE(!entity_is_task(se));
261 #endif
262 	return container_of(se, struct task_struct, se);
263 }
264 
265 /* Walk up scheduling entities hierarchy */
266 #define for_each_sched_entity(se) \
267 		for (; se; se = se->parent)
268 
269 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
270 {
271 	return p->se.cfs_rq;
272 }
273 
274 /* runqueue on which this entity is (to be) queued */
275 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
276 {
277 	return se->cfs_rq;
278 }
279 
280 /* runqueue "owned" by this group */
281 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
282 {
283 	return grp->my_q;
284 }
285 
286 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287 {
288 	if (!cfs_rq->on_list) {
289 		/*
290 		 * Ensure we either appear before our parent (if already
291 		 * enqueued) or force our parent to appear after us when it is
292 		 * enqueued.  The fact that we always enqueue bottom-up
293 		 * reduces this to two cases.
294 		 */
295 		if (cfs_rq->tg->parent &&
296 		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
297 			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
298 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
299 		} else {
300 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
301 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
302 		}
303 
304 		cfs_rq->on_list = 1;
305 	}
306 }
307 
308 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
309 {
310 	if (cfs_rq->on_list) {
311 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
312 		cfs_rq->on_list = 0;
313 	}
314 }
315 
316 /* Iterate thr' all leaf cfs_rq's on a runqueue */
317 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
318 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
319 
320 /* Do the two (enqueued) entities belong to the same group ? */
321 static inline struct cfs_rq *
322 is_same_group(struct sched_entity *se, struct sched_entity *pse)
323 {
324 	if (se->cfs_rq == pse->cfs_rq)
325 		return se->cfs_rq;
326 
327 	return NULL;
328 }
329 
330 static inline struct sched_entity *parent_entity(struct sched_entity *se)
331 {
332 	return se->parent;
333 }
334 
335 static void
336 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
337 {
338 	int se_depth, pse_depth;
339 
340 	/*
341 	 * preemption test can be made between sibling entities who are in the
342 	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
343 	 * both tasks until we find their ancestors who are siblings of common
344 	 * parent.
345 	 */
346 
347 	/* First walk up until both entities are at same depth */
348 	se_depth = (*se)->depth;
349 	pse_depth = (*pse)->depth;
350 
351 	while (se_depth > pse_depth) {
352 		se_depth--;
353 		*se = parent_entity(*se);
354 	}
355 
356 	while (pse_depth > se_depth) {
357 		pse_depth--;
358 		*pse = parent_entity(*pse);
359 	}
360 
361 	while (!is_same_group(*se, *pse)) {
362 		*se = parent_entity(*se);
363 		*pse = parent_entity(*pse);
364 	}
365 }
366 
367 #else	/* !CONFIG_FAIR_GROUP_SCHED */
368 
369 static inline struct task_struct *task_of(struct sched_entity *se)
370 {
371 	return container_of(se, struct task_struct, se);
372 }
373 
374 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
375 {
376 	return container_of(cfs_rq, struct rq, cfs);
377 }
378 
379 #define entity_is_task(se)	1
380 
381 #define for_each_sched_entity(se) \
382 		for (; se; se = NULL)
383 
384 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
385 {
386 	return &task_rq(p)->cfs;
387 }
388 
389 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
390 {
391 	struct task_struct *p = task_of(se);
392 	struct rq *rq = task_rq(p);
393 
394 	return &rq->cfs;
395 }
396 
397 /* runqueue "owned" by this group */
398 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
399 {
400 	return NULL;
401 }
402 
403 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
404 {
405 }
406 
407 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
408 {
409 }
410 
411 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
412 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
413 
414 static inline struct sched_entity *parent_entity(struct sched_entity *se)
415 {
416 	return NULL;
417 }
418 
419 static inline void
420 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
421 {
422 }
423 
424 #endif	/* CONFIG_FAIR_GROUP_SCHED */
425 
426 static __always_inline
427 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
428 
429 /**************************************************************
430  * Scheduling class tree data structure manipulation methods:
431  */
432 
433 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
434 {
435 	s64 delta = (s64)(vruntime - max_vruntime);
436 	if (delta > 0)
437 		max_vruntime = vruntime;
438 
439 	return max_vruntime;
440 }
441 
442 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
443 {
444 	s64 delta = (s64)(vruntime - min_vruntime);
445 	if (delta < 0)
446 		min_vruntime = vruntime;
447 
448 	return min_vruntime;
449 }
450 
451 static inline int entity_before(struct sched_entity *a,
452 				struct sched_entity *b)
453 {
454 	return (s64)(a->vruntime - b->vruntime) < 0;
455 }
456 
457 static void update_min_vruntime(struct cfs_rq *cfs_rq)
458 {
459 	u64 vruntime = cfs_rq->min_vruntime;
460 
461 	if (cfs_rq->curr)
462 		vruntime = cfs_rq->curr->vruntime;
463 
464 	if (cfs_rq->rb_leftmost) {
465 		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
466 						   struct sched_entity,
467 						   run_node);
468 
469 		if (!cfs_rq->curr)
470 			vruntime = se->vruntime;
471 		else
472 			vruntime = min_vruntime(vruntime, se->vruntime);
473 	}
474 
475 	/* ensure we never gain time by being placed backwards. */
476 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
477 #ifndef CONFIG_64BIT
478 	smp_wmb();
479 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
480 #endif
481 }
482 
483 /*
484  * Enqueue an entity into the rb-tree:
485  */
486 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
487 {
488 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
489 	struct rb_node *parent = NULL;
490 	struct sched_entity *entry;
491 	int leftmost = 1;
492 
493 	/*
494 	 * Find the right place in the rbtree:
495 	 */
496 	while (*link) {
497 		parent = *link;
498 		entry = rb_entry(parent, struct sched_entity, run_node);
499 		/*
500 		 * We dont care about collisions. Nodes with
501 		 * the same key stay together.
502 		 */
503 		if (entity_before(se, entry)) {
504 			link = &parent->rb_left;
505 		} else {
506 			link = &parent->rb_right;
507 			leftmost = 0;
508 		}
509 	}
510 
511 	/*
512 	 * Maintain a cache of leftmost tree entries (it is frequently
513 	 * used):
514 	 */
515 	if (leftmost)
516 		cfs_rq->rb_leftmost = &se->run_node;
517 
518 	rb_link_node(&se->run_node, parent, link);
519 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
520 }
521 
522 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
523 {
524 	if (cfs_rq->rb_leftmost == &se->run_node) {
525 		struct rb_node *next_node;
526 
527 		next_node = rb_next(&se->run_node);
528 		cfs_rq->rb_leftmost = next_node;
529 	}
530 
531 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
532 }
533 
534 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
535 {
536 	struct rb_node *left = cfs_rq->rb_leftmost;
537 
538 	if (!left)
539 		return NULL;
540 
541 	return rb_entry(left, struct sched_entity, run_node);
542 }
543 
544 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
545 {
546 	struct rb_node *next = rb_next(&se->run_node);
547 
548 	if (!next)
549 		return NULL;
550 
551 	return rb_entry(next, struct sched_entity, run_node);
552 }
553 
554 #ifdef CONFIG_SCHED_DEBUG
555 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
556 {
557 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
558 
559 	if (!last)
560 		return NULL;
561 
562 	return rb_entry(last, struct sched_entity, run_node);
563 }
564 
565 /**************************************************************
566  * Scheduling class statistics methods:
567  */
568 
569 int sched_proc_update_handler(struct ctl_table *table, int write,
570 		void __user *buffer, size_t *lenp,
571 		loff_t *ppos)
572 {
573 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
574 	unsigned int factor = get_update_sysctl_factor();
575 
576 	if (ret || !write)
577 		return ret;
578 
579 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
580 					sysctl_sched_min_granularity);
581 
582 #define WRT_SYSCTL(name) \
583 	(normalized_sysctl_##name = sysctl_##name / (factor))
584 	WRT_SYSCTL(sched_min_granularity);
585 	WRT_SYSCTL(sched_latency);
586 	WRT_SYSCTL(sched_wakeup_granularity);
587 #undef WRT_SYSCTL
588 
589 	return 0;
590 }
591 #endif
592 
593 /*
594  * delta /= w
595  */
596 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
597 {
598 	if (unlikely(se->load.weight != NICE_0_LOAD))
599 		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
600 
601 	return delta;
602 }
603 
604 /*
605  * The idea is to set a period in which each task runs once.
606  *
607  * When there are too many tasks (sched_nr_latency) we have to stretch
608  * this period because otherwise the slices get too small.
609  *
610  * p = (nr <= nl) ? l : l*nr/nl
611  */
612 static u64 __sched_period(unsigned long nr_running)
613 {
614 	if (unlikely(nr_running > sched_nr_latency))
615 		return nr_running * sysctl_sched_min_granularity;
616 	else
617 		return sysctl_sched_latency;
618 }
619 
620 /*
621  * We calculate the wall-time slice from the period by taking a part
622  * proportional to the weight.
623  *
624  * s = p*P[w/rw]
625  */
626 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
627 {
628 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
629 
630 	for_each_sched_entity(se) {
631 		struct load_weight *load;
632 		struct load_weight lw;
633 
634 		cfs_rq = cfs_rq_of(se);
635 		load = &cfs_rq->load;
636 
637 		if (unlikely(!se->on_rq)) {
638 			lw = cfs_rq->load;
639 
640 			update_load_add(&lw, se->load.weight);
641 			load = &lw;
642 		}
643 		slice = __calc_delta(slice, se->load.weight, load);
644 	}
645 	return slice;
646 }
647 
648 /*
649  * We calculate the vruntime slice of a to-be-inserted task.
650  *
651  * vs = s/w
652  */
653 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
654 {
655 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
656 }
657 
658 #ifdef CONFIG_SMP
659 static int select_idle_sibling(struct task_struct *p, int cpu);
660 static unsigned long task_h_load(struct task_struct *p);
661 
662 /*
663  * We choose a half-life close to 1 scheduling period.
664  * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
665  * dependent on this value.
666  */
667 #define LOAD_AVG_PERIOD 32
668 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
669 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
670 
671 /* Give new sched_entity start runnable values to heavy its load in infant time */
672 void init_entity_runnable_average(struct sched_entity *se)
673 {
674 	struct sched_avg *sa = &se->avg;
675 
676 	sa->last_update_time = 0;
677 	/*
678 	 * sched_avg's period_contrib should be strictly less then 1024, so
679 	 * we give it 1023 to make sure it is almost a period (1024us), and
680 	 * will definitely be update (after enqueue).
681 	 */
682 	sa->period_contrib = 1023;
683 	sa->load_avg = scale_load_down(se->load.weight);
684 	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
685 	/*
686 	 * At this point, util_avg won't be used in select_task_rq_fair anyway
687 	 */
688 	sa->util_avg = 0;
689 	sa->util_sum = 0;
690 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
691 }
692 
693 /*
694  * With new tasks being created, their initial util_avgs are extrapolated
695  * based on the cfs_rq's current util_avg:
696  *
697  *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
698  *
699  * However, in many cases, the above util_avg does not give a desired
700  * value. Moreover, the sum of the util_avgs may be divergent, such
701  * as when the series is a harmonic series.
702  *
703  * To solve this problem, we also cap the util_avg of successive tasks to
704  * only 1/2 of the left utilization budget:
705  *
706  *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
707  *
708  * where n denotes the nth task.
709  *
710  * For example, a simplest series from the beginning would be like:
711  *
712  *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
713  * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
714  *
715  * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
716  * if util_avg > util_avg_cap.
717  */
718 void post_init_entity_util_avg(struct sched_entity *se)
719 {
720 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
721 	struct sched_avg *sa = &se->avg;
722 	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
723 
724 	if (cap > 0) {
725 		if (cfs_rq->avg.util_avg != 0) {
726 			sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
727 			sa->util_avg /= (cfs_rq->avg.load_avg + 1);
728 
729 			if (sa->util_avg > cap)
730 				sa->util_avg = cap;
731 		} else {
732 			sa->util_avg = cap;
733 		}
734 		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
735 	}
736 }
737 
738 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
739 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
740 #else
741 void init_entity_runnable_average(struct sched_entity *se)
742 {
743 }
744 void post_init_entity_util_avg(struct sched_entity *se)
745 {
746 }
747 #endif
748 
749 /*
750  * Update the current task's runtime statistics.
751  */
752 static void update_curr(struct cfs_rq *cfs_rq)
753 {
754 	struct sched_entity *curr = cfs_rq->curr;
755 	u64 now = rq_clock_task(rq_of(cfs_rq));
756 	u64 delta_exec;
757 
758 	if (unlikely(!curr))
759 		return;
760 
761 	delta_exec = now - curr->exec_start;
762 	if (unlikely((s64)delta_exec <= 0))
763 		return;
764 
765 	curr->exec_start = now;
766 
767 	schedstat_set(curr->statistics.exec_max,
768 		      max(delta_exec, curr->statistics.exec_max));
769 
770 	curr->sum_exec_runtime += delta_exec;
771 	schedstat_add(cfs_rq, exec_clock, delta_exec);
772 
773 	curr->vruntime += calc_delta_fair(delta_exec, curr);
774 	update_min_vruntime(cfs_rq);
775 
776 	if (entity_is_task(curr)) {
777 		struct task_struct *curtask = task_of(curr);
778 
779 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
780 		cpuacct_charge(curtask, delta_exec);
781 		account_group_exec_runtime(curtask, delta_exec);
782 	}
783 
784 	account_cfs_rq_runtime(cfs_rq, delta_exec);
785 }
786 
787 static void update_curr_fair(struct rq *rq)
788 {
789 	update_curr(cfs_rq_of(&rq->curr->se));
790 }
791 
792 #ifdef CONFIG_SCHEDSTATS
793 static inline void
794 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
795 {
796 	u64 wait_start = rq_clock(rq_of(cfs_rq));
797 
798 	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
799 	    likely(wait_start > se->statistics.wait_start))
800 		wait_start -= se->statistics.wait_start;
801 
802 	se->statistics.wait_start = wait_start;
803 }
804 
805 static void
806 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
807 {
808 	struct task_struct *p;
809 	u64 delta;
810 
811 	delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
812 
813 	if (entity_is_task(se)) {
814 		p = task_of(se);
815 		if (task_on_rq_migrating(p)) {
816 			/*
817 			 * Preserve migrating task's wait time so wait_start
818 			 * time stamp can be adjusted to accumulate wait time
819 			 * prior to migration.
820 			 */
821 			se->statistics.wait_start = delta;
822 			return;
823 		}
824 		trace_sched_stat_wait(p, delta);
825 	}
826 
827 	se->statistics.wait_max = max(se->statistics.wait_max, delta);
828 	se->statistics.wait_count++;
829 	se->statistics.wait_sum += delta;
830 	se->statistics.wait_start = 0;
831 }
832 
833 /*
834  * Task is being enqueued - update stats:
835  */
836 static inline void
837 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
838 {
839 	/*
840 	 * Are we enqueueing a waiting task? (for current tasks
841 	 * a dequeue/enqueue event is a NOP)
842 	 */
843 	if (se != cfs_rq->curr)
844 		update_stats_wait_start(cfs_rq, se);
845 }
846 
847 static inline void
848 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
849 {
850 	/*
851 	 * Mark the end of the wait period if dequeueing a
852 	 * waiting task:
853 	 */
854 	if (se != cfs_rq->curr)
855 		update_stats_wait_end(cfs_rq, se);
856 
857 	if (flags & DEQUEUE_SLEEP) {
858 		if (entity_is_task(se)) {
859 			struct task_struct *tsk = task_of(se);
860 
861 			if (tsk->state & TASK_INTERRUPTIBLE)
862 				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
863 			if (tsk->state & TASK_UNINTERRUPTIBLE)
864 				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
865 		}
866 	}
867 
868 }
869 #else
870 static inline void
871 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
872 {
873 }
874 
875 static inline void
876 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
877 {
878 }
879 
880 static inline void
881 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
882 {
883 }
884 
885 static inline void
886 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
887 {
888 }
889 #endif
890 
891 /*
892  * We are picking a new current task - update its stats:
893  */
894 static inline void
895 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
896 {
897 	/*
898 	 * We are starting a new run period:
899 	 */
900 	se->exec_start = rq_clock_task(rq_of(cfs_rq));
901 }
902 
903 /**************************************************
904  * Scheduling class queueing methods:
905  */
906 
907 #ifdef CONFIG_NUMA_BALANCING
908 /*
909  * Approximate time to scan a full NUMA task in ms. The task scan period is
910  * calculated based on the tasks virtual memory size and
911  * numa_balancing_scan_size.
912  */
913 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
914 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
915 
916 /* Portion of address space to scan in MB */
917 unsigned int sysctl_numa_balancing_scan_size = 256;
918 
919 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
920 unsigned int sysctl_numa_balancing_scan_delay = 1000;
921 
922 static unsigned int task_nr_scan_windows(struct task_struct *p)
923 {
924 	unsigned long rss = 0;
925 	unsigned long nr_scan_pages;
926 
927 	/*
928 	 * Calculations based on RSS as non-present and empty pages are skipped
929 	 * by the PTE scanner and NUMA hinting faults should be trapped based
930 	 * on resident pages
931 	 */
932 	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
933 	rss = get_mm_rss(p->mm);
934 	if (!rss)
935 		rss = nr_scan_pages;
936 
937 	rss = round_up(rss, nr_scan_pages);
938 	return rss / nr_scan_pages;
939 }
940 
941 /* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
942 #define MAX_SCAN_WINDOW 2560
943 
944 static unsigned int task_scan_min(struct task_struct *p)
945 {
946 	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
947 	unsigned int scan, floor;
948 	unsigned int windows = 1;
949 
950 	if (scan_size < MAX_SCAN_WINDOW)
951 		windows = MAX_SCAN_WINDOW / scan_size;
952 	floor = 1000 / windows;
953 
954 	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
955 	return max_t(unsigned int, floor, scan);
956 }
957 
958 static unsigned int task_scan_max(struct task_struct *p)
959 {
960 	unsigned int smin = task_scan_min(p);
961 	unsigned int smax;
962 
963 	/* Watch for min being lower than max due to floor calculations */
964 	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
965 	return max(smin, smax);
966 }
967 
968 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
969 {
970 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
971 	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
972 }
973 
974 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
975 {
976 	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
977 	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
978 }
979 
980 struct numa_group {
981 	atomic_t refcount;
982 
983 	spinlock_t lock; /* nr_tasks, tasks */
984 	int nr_tasks;
985 	pid_t gid;
986 	int active_nodes;
987 
988 	struct rcu_head rcu;
989 	unsigned long total_faults;
990 	unsigned long max_faults_cpu;
991 	/*
992 	 * Faults_cpu is used to decide whether memory should move
993 	 * towards the CPU. As a consequence, these stats are weighted
994 	 * more by CPU use than by memory faults.
995 	 */
996 	unsigned long *faults_cpu;
997 	unsigned long faults[0];
998 };
999 
1000 /* Shared or private faults. */
1001 #define NR_NUMA_HINT_FAULT_TYPES 2
1002 
1003 /* Memory and CPU locality */
1004 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1005 
1006 /* Averaged statistics, and temporary buffers. */
1007 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1008 
1009 pid_t task_numa_group_id(struct task_struct *p)
1010 {
1011 	return p->numa_group ? p->numa_group->gid : 0;
1012 }
1013 
1014 /*
1015  * The averaged statistics, shared & private, memory & cpu,
1016  * occupy the first half of the array. The second half of the
1017  * array is for current counters, which are averaged into the
1018  * first set by task_numa_placement.
1019  */
1020 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1021 {
1022 	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1023 }
1024 
1025 static inline unsigned long task_faults(struct task_struct *p, int nid)
1026 {
1027 	if (!p->numa_faults)
1028 		return 0;
1029 
1030 	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1031 		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1032 }
1033 
1034 static inline unsigned long group_faults(struct task_struct *p, int nid)
1035 {
1036 	if (!p->numa_group)
1037 		return 0;
1038 
1039 	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1040 		p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1041 }
1042 
1043 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1044 {
1045 	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1046 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1047 }
1048 
1049 /*
1050  * A node triggering more than 1/3 as many NUMA faults as the maximum is
1051  * considered part of a numa group's pseudo-interleaving set. Migrations
1052  * between these nodes are slowed down, to allow things to settle down.
1053  */
1054 #define ACTIVE_NODE_FRACTION 3
1055 
1056 static bool numa_is_active_node(int nid, struct numa_group *ng)
1057 {
1058 	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1059 }
1060 
1061 /* Handle placement on systems where not all nodes are directly connected. */
1062 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1063 					int maxdist, bool task)
1064 {
1065 	unsigned long score = 0;
1066 	int node;
1067 
1068 	/*
1069 	 * All nodes are directly connected, and the same distance
1070 	 * from each other. No need for fancy placement algorithms.
1071 	 */
1072 	if (sched_numa_topology_type == NUMA_DIRECT)
1073 		return 0;
1074 
1075 	/*
1076 	 * This code is called for each node, introducing N^2 complexity,
1077 	 * which should be ok given the number of nodes rarely exceeds 8.
1078 	 */
1079 	for_each_online_node(node) {
1080 		unsigned long faults;
1081 		int dist = node_distance(nid, node);
1082 
1083 		/*
1084 		 * The furthest away nodes in the system are not interesting
1085 		 * for placement; nid was already counted.
1086 		 */
1087 		if (dist == sched_max_numa_distance || node == nid)
1088 			continue;
1089 
1090 		/*
1091 		 * On systems with a backplane NUMA topology, compare groups
1092 		 * of nodes, and move tasks towards the group with the most
1093 		 * memory accesses. When comparing two nodes at distance
1094 		 * "hoplimit", only nodes closer by than "hoplimit" are part
1095 		 * of each group. Skip other nodes.
1096 		 */
1097 		if (sched_numa_topology_type == NUMA_BACKPLANE &&
1098 					dist > maxdist)
1099 			continue;
1100 
1101 		/* Add up the faults from nearby nodes. */
1102 		if (task)
1103 			faults = task_faults(p, node);
1104 		else
1105 			faults = group_faults(p, node);
1106 
1107 		/*
1108 		 * On systems with a glueless mesh NUMA topology, there are
1109 		 * no fixed "groups of nodes". Instead, nodes that are not
1110 		 * directly connected bounce traffic through intermediate
1111 		 * nodes; a numa_group can occupy any set of nodes.
1112 		 * The further away a node is, the less the faults count.
1113 		 * This seems to result in good task placement.
1114 		 */
1115 		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1116 			faults *= (sched_max_numa_distance - dist);
1117 			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1118 		}
1119 
1120 		score += faults;
1121 	}
1122 
1123 	return score;
1124 }
1125 
1126 /*
1127  * These return the fraction of accesses done by a particular task, or
1128  * task group, on a particular numa node.  The group weight is given a
1129  * larger multiplier, in order to group tasks together that are almost
1130  * evenly spread out between numa nodes.
1131  */
1132 static inline unsigned long task_weight(struct task_struct *p, int nid,
1133 					int dist)
1134 {
1135 	unsigned long faults, total_faults;
1136 
1137 	if (!p->numa_faults)
1138 		return 0;
1139 
1140 	total_faults = p->total_numa_faults;
1141 
1142 	if (!total_faults)
1143 		return 0;
1144 
1145 	faults = task_faults(p, nid);
1146 	faults += score_nearby_nodes(p, nid, dist, true);
1147 
1148 	return 1000 * faults / total_faults;
1149 }
1150 
1151 static inline unsigned long group_weight(struct task_struct *p, int nid,
1152 					 int dist)
1153 {
1154 	unsigned long faults, total_faults;
1155 
1156 	if (!p->numa_group)
1157 		return 0;
1158 
1159 	total_faults = p->numa_group->total_faults;
1160 
1161 	if (!total_faults)
1162 		return 0;
1163 
1164 	faults = group_faults(p, nid);
1165 	faults += score_nearby_nodes(p, nid, dist, false);
1166 
1167 	return 1000 * faults / total_faults;
1168 }
1169 
1170 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1171 				int src_nid, int dst_cpu)
1172 {
1173 	struct numa_group *ng = p->numa_group;
1174 	int dst_nid = cpu_to_node(dst_cpu);
1175 	int last_cpupid, this_cpupid;
1176 
1177 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1178 
1179 	/*
1180 	 * Multi-stage node selection is used in conjunction with a periodic
1181 	 * migration fault to build a temporal task<->page relation. By using
1182 	 * a two-stage filter we remove short/unlikely relations.
1183 	 *
1184 	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1185 	 * a task's usage of a particular page (n_p) per total usage of this
1186 	 * page (n_t) (in a given time-span) to a probability.
1187 	 *
1188 	 * Our periodic faults will sample this probability and getting the
1189 	 * same result twice in a row, given these samples are fully
1190 	 * independent, is then given by P(n)^2, provided our sample period
1191 	 * is sufficiently short compared to the usage pattern.
1192 	 *
1193 	 * This quadric squishes small probabilities, making it less likely we
1194 	 * act on an unlikely task<->page relation.
1195 	 */
1196 	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1197 	if (!cpupid_pid_unset(last_cpupid) &&
1198 				cpupid_to_nid(last_cpupid) != dst_nid)
1199 		return false;
1200 
1201 	/* Always allow migrate on private faults */
1202 	if (cpupid_match_pid(p, last_cpupid))
1203 		return true;
1204 
1205 	/* A shared fault, but p->numa_group has not been set up yet. */
1206 	if (!ng)
1207 		return true;
1208 
1209 	/*
1210 	 * Destination node is much more heavily used than the source
1211 	 * node? Allow migration.
1212 	 */
1213 	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1214 					ACTIVE_NODE_FRACTION)
1215 		return true;
1216 
1217 	/*
1218 	 * Distribute memory according to CPU & memory use on each node,
1219 	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
1220 	 *
1221 	 * faults_cpu(dst)   3   faults_cpu(src)
1222 	 * --------------- * - > ---------------
1223 	 * faults_mem(dst)   4   faults_mem(src)
1224 	 */
1225 	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1226 	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1227 }
1228 
1229 static unsigned long weighted_cpuload(const int cpu);
1230 static unsigned long source_load(int cpu, int type);
1231 static unsigned long target_load(int cpu, int type);
1232 static unsigned long capacity_of(int cpu);
1233 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1234 
1235 /* Cached statistics for all CPUs within a node */
1236 struct numa_stats {
1237 	unsigned long nr_running;
1238 	unsigned long load;
1239 
1240 	/* Total compute capacity of CPUs on a node */
1241 	unsigned long compute_capacity;
1242 
1243 	/* Approximate capacity in terms of runnable tasks on a node */
1244 	unsigned long task_capacity;
1245 	int has_free_capacity;
1246 };
1247 
1248 /*
1249  * XXX borrowed from update_sg_lb_stats
1250  */
1251 static void update_numa_stats(struct numa_stats *ns, int nid)
1252 {
1253 	int smt, cpu, cpus = 0;
1254 	unsigned long capacity;
1255 
1256 	memset(ns, 0, sizeof(*ns));
1257 	for_each_cpu(cpu, cpumask_of_node(nid)) {
1258 		struct rq *rq = cpu_rq(cpu);
1259 
1260 		ns->nr_running += rq->nr_running;
1261 		ns->load += weighted_cpuload(cpu);
1262 		ns->compute_capacity += capacity_of(cpu);
1263 
1264 		cpus++;
1265 	}
1266 
1267 	/*
1268 	 * If we raced with hotplug and there are no CPUs left in our mask
1269 	 * the @ns structure is NULL'ed and task_numa_compare() will
1270 	 * not find this node attractive.
1271 	 *
1272 	 * We'll either bail at !has_free_capacity, or we'll detect a huge
1273 	 * imbalance and bail there.
1274 	 */
1275 	if (!cpus)
1276 		return;
1277 
1278 	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1279 	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1280 	capacity = cpus / smt; /* cores */
1281 
1282 	ns->task_capacity = min_t(unsigned, capacity,
1283 		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1284 	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1285 }
1286 
1287 struct task_numa_env {
1288 	struct task_struct *p;
1289 
1290 	int src_cpu, src_nid;
1291 	int dst_cpu, dst_nid;
1292 
1293 	struct numa_stats src_stats, dst_stats;
1294 
1295 	int imbalance_pct;
1296 	int dist;
1297 
1298 	struct task_struct *best_task;
1299 	long best_imp;
1300 	int best_cpu;
1301 };
1302 
1303 static void task_numa_assign(struct task_numa_env *env,
1304 			     struct task_struct *p, long imp)
1305 {
1306 	if (env->best_task)
1307 		put_task_struct(env->best_task);
1308 
1309 	env->best_task = p;
1310 	env->best_imp = imp;
1311 	env->best_cpu = env->dst_cpu;
1312 }
1313 
1314 static bool load_too_imbalanced(long src_load, long dst_load,
1315 				struct task_numa_env *env)
1316 {
1317 	long imb, old_imb;
1318 	long orig_src_load, orig_dst_load;
1319 	long src_capacity, dst_capacity;
1320 
1321 	/*
1322 	 * The load is corrected for the CPU capacity available on each node.
1323 	 *
1324 	 * src_load        dst_load
1325 	 * ------------ vs ---------
1326 	 * src_capacity    dst_capacity
1327 	 */
1328 	src_capacity = env->src_stats.compute_capacity;
1329 	dst_capacity = env->dst_stats.compute_capacity;
1330 
1331 	/* We care about the slope of the imbalance, not the direction. */
1332 	if (dst_load < src_load)
1333 		swap(dst_load, src_load);
1334 
1335 	/* Is the difference below the threshold? */
1336 	imb = dst_load * src_capacity * 100 -
1337 	      src_load * dst_capacity * env->imbalance_pct;
1338 	if (imb <= 0)
1339 		return false;
1340 
1341 	/*
1342 	 * The imbalance is above the allowed threshold.
1343 	 * Compare it with the old imbalance.
1344 	 */
1345 	orig_src_load = env->src_stats.load;
1346 	orig_dst_load = env->dst_stats.load;
1347 
1348 	if (orig_dst_load < orig_src_load)
1349 		swap(orig_dst_load, orig_src_load);
1350 
1351 	old_imb = orig_dst_load * src_capacity * 100 -
1352 		  orig_src_load * dst_capacity * env->imbalance_pct;
1353 
1354 	/* Would this change make things worse? */
1355 	return (imb > old_imb);
1356 }
1357 
1358 /*
1359  * This checks if the overall compute and NUMA accesses of the system would
1360  * be improved if the source tasks was migrated to the target dst_cpu taking
1361  * into account that it might be best if task running on the dst_cpu should
1362  * be exchanged with the source task
1363  */
1364 static void task_numa_compare(struct task_numa_env *env,
1365 			      long taskimp, long groupimp)
1366 {
1367 	struct rq *src_rq = cpu_rq(env->src_cpu);
1368 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
1369 	struct task_struct *cur;
1370 	long src_load, dst_load;
1371 	long load;
1372 	long imp = env->p->numa_group ? groupimp : taskimp;
1373 	long moveimp = imp;
1374 	int dist = env->dist;
1375 	bool assigned = false;
1376 
1377 	rcu_read_lock();
1378 
1379 	raw_spin_lock_irq(&dst_rq->lock);
1380 	cur = dst_rq->curr;
1381 	/*
1382 	 * No need to move the exiting task or idle task.
1383 	 */
1384 	if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1385 		cur = NULL;
1386 	else {
1387 		/*
1388 		 * The task_struct must be protected here to protect the
1389 		 * p->numa_faults access in the task_weight since the
1390 		 * numa_faults could already be freed in the following path:
1391 		 * finish_task_switch()
1392 		 *     --> put_task_struct()
1393 		 *         --> __put_task_struct()
1394 		 *             --> task_numa_free()
1395 		 */
1396 		get_task_struct(cur);
1397 	}
1398 
1399 	raw_spin_unlock_irq(&dst_rq->lock);
1400 
1401 	/*
1402 	 * Because we have preemption enabled we can get migrated around and
1403 	 * end try selecting ourselves (current == env->p) as a swap candidate.
1404 	 */
1405 	if (cur == env->p)
1406 		goto unlock;
1407 
1408 	/*
1409 	 * "imp" is the fault differential for the source task between the
1410 	 * source and destination node. Calculate the total differential for
1411 	 * the source task and potential destination task. The more negative
1412 	 * the value is, the more rmeote accesses that would be expected to
1413 	 * be incurred if the tasks were swapped.
1414 	 */
1415 	if (cur) {
1416 		/* Skip this swap candidate if cannot move to the source cpu */
1417 		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1418 			goto unlock;
1419 
1420 		/*
1421 		 * If dst and source tasks are in the same NUMA group, or not
1422 		 * in any group then look only at task weights.
1423 		 */
1424 		if (cur->numa_group == env->p->numa_group) {
1425 			imp = taskimp + task_weight(cur, env->src_nid, dist) -
1426 			      task_weight(cur, env->dst_nid, dist);
1427 			/*
1428 			 * Add some hysteresis to prevent swapping the
1429 			 * tasks within a group over tiny differences.
1430 			 */
1431 			if (cur->numa_group)
1432 				imp -= imp/16;
1433 		} else {
1434 			/*
1435 			 * Compare the group weights. If a task is all by
1436 			 * itself (not part of a group), use the task weight
1437 			 * instead.
1438 			 */
1439 			if (cur->numa_group)
1440 				imp += group_weight(cur, env->src_nid, dist) -
1441 				       group_weight(cur, env->dst_nid, dist);
1442 			else
1443 				imp += task_weight(cur, env->src_nid, dist) -
1444 				       task_weight(cur, env->dst_nid, dist);
1445 		}
1446 	}
1447 
1448 	if (imp <= env->best_imp && moveimp <= env->best_imp)
1449 		goto unlock;
1450 
1451 	if (!cur) {
1452 		/* Is there capacity at our destination? */
1453 		if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1454 		    !env->dst_stats.has_free_capacity)
1455 			goto unlock;
1456 
1457 		goto balance;
1458 	}
1459 
1460 	/* Balance doesn't matter much if we're running a task per cpu */
1461 	if (imp > env->best_imp && src_rq->nr_running == 1 &&
1462 			dst_rq->nr_running == 1)
1463 		goto assign;
1464 
1465 	/*
1466 	 * In the overloaded case, try and keep the load balanced.
1467 	 */
1468 balance:
1469 	load = task_h_load(env->p);
1470 	dst_load = env->dst_stats.load + load;
1471 	src_load = env->src_stats.load - load;
1472 
1473 	if (moveimp > imp && moveimp > env->best_imp) {
1474 		/*
1475 		 * If the improvement from just moving env->p direction is
1476 		 * better than swapping tasks around, check if a move is
1477 		 * possible. Store a slightly smaller score than moveimp,
1478 		 * so an actually idle CPU will win.
1479 		 */
1480 		if (!load_too_imbalanced(src_load, dst_load, env)) {
1481 			imp = moveimp - 1;
1482 			put_task_struct(cur);
1483 			cur = NULL;
1484 			goto assign;
1485 		}
1486 	}
1487 
1488 	if (imp <= env->best_imp)
1489 		goto unlock;
1490 
1491 	if (cur) {
1492 		load = task_h_load(cur);
1493 		dst_load -= load;
1494 		src_load += load;
1495 	}
1496 
1497 	if (load_too_imbalanced(src_load, dst_load, env))
1498 		goto unlock;
1499 
1500 	/*
1501 	 * One idle CPU per node is evaluated for a task numa move.
1502 	 * Call select_idle_sibling to maybe find a better one.
1503 	 */
1504 	if (!cur)
1505 		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1506 
1507 assign:
1508 	assigned = true;
1509 	task_numa_assign(env, cur, imp);
1510 unlock:
1511 	rcu_read_unlock();
1512 	/*
1513 	 * The dst_rq->curr isn't assigned. The protection for task_struct is
1514 	 * finished.
1515 	 */
1516 	if (cur && !assigned)
1517 		put_task_struct(cur);
1518 }
1519 
1520 static void task_numa_find_cpu(struct task_numa_env *env,
1521 				long taskimp, long groupimp)
1522 {
1523 	int cpu;
1524 
1525 	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1526 		/* Skip this CPU if the source task cannot migrate */
1527 		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1528 			continue;
1529 
1530 		env->dst_cpu = cpu;
1531 		task_numa_compare(env, taskimp, groupimp);
1532 	}
1533 }
1534 
1535 /* Only move tasks to a NUMA node less busy than the current node. */
1536 static bool numa_has_capacity(struct task_numa_env *env)
1537 {
1538 	struct numa_stats *src = &env->src_stats;
1539 	struct numa_stats *dst = &env->dst_stats;
1540 
1541 	if (src->has_free_capacity && !dst->has_free_capacity)
1542 		return false;
1543 
1544 	/*
1545 	 * Only consider a task move if the source has a higher load
1546 	 * than the destination, corrected for CPU capacity on each node.
1547 	 *
1548 	 *      src->load                dst->load
1549 	 * --------------------- vs ---------------------
1550 	 * src->compute_capacity    dst->compute_capacity
1551 	 */
1552 	if (src->load * dst->compute_capacity * env->imbalance_pct >
1553 
1554 	    dst->load * src->compute_capacity * 100)
1555 		return true;
1556 
1557 	return false;
1558 }
1559 
1560 static int task_numa_migrate(struct task_struct *p)
1561 {
1562 	struct task_numa_env env = {
1563 		.p = p,
1564 
1565 		.src_cpu = task_cpu(p),
1566 		.src_nid = task_node(p),
1567 
1568 		.imbalance_pct = 112,
1569 
1570 		.best_task = NULL,
1571 		.best_imp = 0,
1572 		.best_cpu = -1,
1573 	};
1574 	struct sched_domain *sd;
1575 	unsigned long taskweight, groupweight;
1576 	int nid, ret, dist;
1577 	long taskimp, groupimp;
1578 
1579 	/*
1580 	 * Pick the lowest SD_NUMA domain, as that would have the smallest
1581 	 * imbalance and would be the first to start moving tasks about.
1582 	 *
1583 	 * And we want to avoid any moving of tasks about, as that would create
1584 	 * random movement of tasks -- counter the numa conditions we're trying
1585 	 * to satisfy here.
1586 	 */
1587 	rcu_read_lock();
1588 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1589 	if (sd)
1590 		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1591 	rcu_read_unlock();
1592 
1593 	/*
1594 	 * Cpusets can break the scheduler domain tree into smaller
1595 	 * balance domains, some of which do not cross NUMA boundaries.
1596 	 * Tasks that are "trapped" in such domains cannot be migrated
1597 	 * elsewhere, so there is no point in (re)trying.
1598 	 */
1599 	if (unlikely(!sd)) {
1600 		p->numa_preferred_nid = task_node(p);
1601 		return -EINVAL;
1602 	}
1603 
1604 	env.dst_nid = p->numa_preferred_nid;
1605 	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1606 	taskweight = task_weight(p, env.src_nid, dist);
1607 	groupweight = group_weight(p, env.src_nid, dist);
1608 	update_numa_stats(&env.src_stats, env.src_nid);
1609 	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1610 	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1611 	update_numa_stats(&env.dst_stats, env.dst_nid);
1612 
1613 	/* Try to find a spot on the preferred nid. */
1614 	if (numa_has_capacity(&env))
1615 		task_numa_find_cpu(&env, taskimp, groupimp);
1616 
1617 	/*
1618 	 * Look at other nodes in these cases:
1619 	 * - there is no space available on the preferred_nid
1620 	 * - the task is part of a numa_group that is interleaved across
1621 	 *   multiple NUMA nodes; in order to better consolidate the group,
1622 	 *   we need to check other locations.
1623 	 */
1624 	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
1625 		for_each_online_node(nid) {
1626 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
1627 				continue;
1628 
1629 			dist = node_distance(env.src_nid, env.dst_nid);
1630 			if (sched_numa_topology_type == NUMA_BACKPLANE &&
1631 						dist != env.dist) {
1632 				taskweight = task_weight(p, env.src_nid, dist);
1633 				groupweight = group_weight(p, env.src_nid, dist);
1634 			}
1635 
1636 			/* Only consider nodes where both task and groups benefit */
1637 			taskimp = task_weight(p, nid, dist) - taskweight;
1638 			groupimp = group_weight(p, nid, dist) - groupweight;
1639 			if (taskimp < 0 && groupimp < 0)
1640 				continue;
1641 
1642 			env.dist = dist;
1643 			env.dst_nid = nid;
1644 			update_numa_stats(&env.dst_stats, env.dst_nid);
1645 			if (numa_has_capacity(&env))
1646 				task_numa_find_cpu(&env, taskimp, groupimp);
1647 		}
1648 	}
1649 
1650 	/*
1651 	 * If the task is part of a workload that spans multiple NUMA nodes,
1652 	 * and is migrating into one of the workload's active nodes, remember
1653 	 * this node as the task's preferred numa node, so the workload can
1654 	 * settle down.
1655 	 * A task that migrated to a second choice node will be better off
1656 	 * trying for a better one later. Do not set the preferred node here.
1657 	 */
1658 	if (p->numa_group) {
1659 		struct numa_group *ng = p->numa_group;
1660 
1661 		if (env.best_cpu == -1)
1662 			nid = env.src_nid;
1663 		else
1664 			nid = env.dst_nid;
1665 
1666 		if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
1667 			sched_setnuma(p, env.dst_nid);
1668 	}
1669 
1670 	/* No better CPU than the current one was found. */
1671 	if (env.best_cpu == -1)
1672 		return -EAGAIN;
1673 
1674 	/*
1675 	 * Reset the scan period if the task is being rescheduled on an
1676 	 * alternative node to recheck if the tasks is now properly placed.
1677 	 */
1678 	p->numa_scan_period = task_scan_min(p);
1679 
1680 	if (env.best_task == NULL) {
1681 		ret = migrate_task_to(p, env.best_cpu);
1682 		if (ret != 0)
1683 			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1684 		return ret;
1685 	}
1686 
1687 	ret = migrate_swap(p, env.best_task);
1688 	if (ret != 0)
1689 		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1690 	put_task_struct(env.best_task);
1691 	return ret;
1692 }
1693 
1694 /* Attempt to migrate a task to a CPU on the preferred node. */
1695 static void numa_migrate_preferred(struct task_struct *p)
1696 {
1697 	unsigned long interval = HZ;
1698 
1699 	/* This task has no NUMA fault statistics yet */
1700 	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1701 		return;
1702 
1703 	/* Periodically retry migrating the task to the preferred node */
1704 	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1705 	p->numa_migrate_retry = jiffies + interval;
1706 
1707 	/* Success if task is already running on preferred CPU */
1708 	if (task_node(p) == p->numa_preferred_nid)
1709 		return;
1710 
1711 	/* Otherwise, try migrate to a CPU on the preferred node */
1712 	task_numa_migrate(p);
1713 }
1714 
1715 /*
1716  * Find out how many nodes on the workload is actively running on. Do this by
1717  * tracking the nodes from which NUMA hinting faults are triggered. This can
1718  * be different from the set of nodes where the workload's memory is currently
1719  * located.
1720  */
1721 static void numa_group_count_active_nodes(struct numa_group *numa_group)
1722 {
1723 	unsigned long faults, max_faults = 0;
1724 	int nid, active_nodes = 0;
1725 
1726 	for_each_online_node(nid) {
1727 		faults = group_faults_cpu(numa_group, nid);
1728 		if (faults > max_faults)
1729 			max_faults = faults;
1730 	}
1731 
1732 	for_each_online_node(nid) {
1733 		faults = group_faults_cpu(numa_group, nid);
1734 		if (faults * ACTIVE_NODE_FRACTION > max_faults)
1735 			active_nodes++;
1736 	}
1737 
1738 	numa_group->max_faults_cpu = max_faults;
1739 	numa_group->active_nodes = active_nodes;
1740 }
1741 
1742 /*
1743  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1744  * increments. The more local the fault statistics are, the higher the scan
1745  * period will be for the next scan window. If local/(local+remote) ratio is
1746  * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1747  * the scan period will decrease. Aim for 70% local accesses.
1748  */
1749 #define NUMA_PERIOD_SLOTS 10
1750 #define NUMA_PERIOD_THRESHOLD 7
1751 
1752 /*
1753  * Increase the scan period (slow down scanning) if the majority of
1754  * our memory is already on our local node, or if the majority of
1755  * the page accesses are shared with other processes.
1756  * Otherwise, decrease the scan period.
1757  */
1758 static void update_task_scan_period(struct task_struct *p,
1759 			unsigned long shared, unsigned long private)
1760 {
1761 	unsigned int period_slot;
1762 	int ratio;
1763 	int diff;
1764 
1765 	unsigned long remote = p->numa_faults_locality[0];
1766 	unsigned long local = p->numa_faults_locality[1];
1767 
1768 	/*
1769 	 * If there were no record hinting faults then either the task is
1770 	 * completely idle or all activity is areas that are not of interest
1771 	 * to automatic numa balancing. Related to that, if there were failed
1772 	 * migration then it implies we are migrating too quickly or the local
1773 	 * node is overloaded. In either case, scan slower
1774 	 */
1775 	if (local + shared == 0 || p->numa_faults_locality[2]) {
1776 		p->numa_scan_period = min(p->numa_scan_period_max,
1777 			p->numa_scan_period << 1);
1778 
1779 		p->mm->numa_next_scan = jiffies +
1780 			msecs_to_jiffies(p->numa_scan_period);
1781 
1782 		return;
1783 	}
1784 
1785 	/*
1786 	 * Prepare to scale scan period relative to the current period.
1787 	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
1788 	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1789 	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1790 	 */
1791 	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1792 	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1793 	if (ratio >= NUMA_PERIOD_THRESHOLD) {
1794 		int slot = ratio - NUMA_PERIOD_THRESHOLD;
1795 		if (!slot)
1796 			slot = 1;
1797 		diff = slot * period_slot;
1798 	} else {
1799 		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1800 
1801 		/*
1802 		 * Scale scan rate increases based on sharing. There is an
1803 		 * inverse relationship between the degree of sharing and
1804 		 * the adjustment made to the scanning period. Broadly
1805 		 * speaking the intent is that there is little point
1806 		 * scanning faster if shared accesses dominate as it may
1807 		 * simply bounce migrations uselessly
1808 		 */
1809 		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1810 		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1811 	}
1812 
1813 	p->numa_scan_period = clamp(p->numa_scan_period + diff,
1814 			task_scan_min(p), task_scan_max(p));
1815 	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1816 }
1817 
1818 /*
1819  * Get the fraction of time the task has been running since the last
1820  * NUMA placement cycle. The scheduler keeps similar statistics, but
1821  * decays those on a 32ms period, which is orders of magnitude off
1822  * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1823  * stats only if the task is so new there are no NUMA statistics yet.
1824  */
1825 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1826 {
1827 	u64 runtime, delta, now;
1828 	/* Use the start of this time slice to avoid calculations. */
1829 	now = p->se.exec_start;
1830 	runtime = p->se.sum_exec_runtime;
1831 
1832 	if (p->last_task_numa_placement) {
1833 		delta = runtime - p->last_sum_exec_runtime;
1834 		*period = now - p->last_task_numa_placement;
1835 	} else {
1836 		delta = p->se.avg.load_sum / p->se.load.weight;
1837 		*period = LOAD_AVG_MAX;
1838 	}
1839 
1840 	p->last_sum_exec_runtime = runtime;
1841 	p->last_task_numa_placement = now;
1842 
1843 	return delta;
1844 }
1845 
1846 /*
1847  * Determine the preferred nid for a task in a numa_group. This needs to
1848  * be done in a way that produces consistent results with group_weight,
1849  * otherwise workloads might not converge.
1850  */
1851 static int preferred_group_nid(struct task_struct *p, int nid)
1852 {
1853 	nodemask_t nodes;
1854 	int dist;
1855 
1856 	/* Direct connections between all NUMA nodes. */
1857 	if (sched_numa_topology_type == NUMA_DIRECT)
1858 		return nid;
1859 
1860 	/*
1861 	 * On a system with glueless mesh NUMA topology, group_weight
1862 	 * scores nodes according to the number of NUMA hinting faults on
1863 	 * both the node itself, and on nearby nodes.
1864 	 */
1865 	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1866 		unsigned long score, max_score = 0;
1867 		int node, max_node = nid;
1868 
1869 		dist = sched_max_numa_distance;
1870 
1871 		for_each_online_node(node) {
1872 			score = group_weight(p, node, dist);
1873 			if (score > max_score) {
1874 				max_score = score;
1875 				max_node = node;
1876 			}
1877 		}
1878 		return max_node;
1879 	}
1880 
1881 	/*
1882 	 * Finding the preferred nid in a system with NUMA backplane
1883 	 * interconnect topology is more involved. The goal is to locate
1884 	 * tasks from numa_groups near each other in the system, and
1885 	 * untangle workloads from different sides of the system. This requires
1886 	 * searching down the hierarchy of node groups, recursively searching
1887 	 * inside the highest scoring group of nodes. The nodemask tricks
1888 	 * keep the complexity of the search down.
1889 	 */
1890 	nodes = node_online_map;
1891 	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1892 		unsigned long max_faults = 0;
1893 		nodemask_t max_group = NODE_MASK_NONE;
1894 		int a, b;
1895 
1896 		/* Are there nodes at this distance from each other? */
1897 		if (!find_numa_distance(dist))
1898 			continue;
1899 
1900 		for_each_node_mask(a, nodes) {
1901 			unsigned long faults = 0;
1902 			nodemask_t this_group;
1903 			nodes_clear(this_group);
1904 
1905 			/* Sum group's NUMA faults; includes a==b case. */
1906 			for_each_node_mask(b, nodes) {
1907 				if (node_distance(a, b) < dist) {
1908 					faults += group_faults(p, b);
1909 					node_set(b, this_group);
1910 					node_clear(b, nodes);
1911 				}
1912 			}
1913 
1914 			/* Remember the top group. */
1915 			if (faults > max_faults) {
1916 				max_faults = faults;
1917 				max_group = this_group;
1918 				/*
1919 				 * subtle: at the smallest distance there is
1920 				 * just one node left in each "group", the
1921 				 * winner is the preferred nid.
1922 				 */
1923 				nid = a;
1924 			}
1925 		}
1926 		/* Next round, evaluate the nodes within max_group. */
1927 		if (!max_faults)
1928 			break;
1929 		nodes = max_group;
1930 	}
1931 	return nid;
1932 }
1933 
1934 static void task_numa_placement(struct task_struct *p)
1935 {
1936 	int seq, nid, max_nid = -1, max_group_nid = -1;
1937 	unsigned long max_faults = 0, max_group_faults = 0;
1938 	unsigned long fault_types[2] = { 0, 0 };
1939 	unsigned long total_faults;
1940 	u64 runtime, period;
1941 	spinlock_t *group_lock = NULL;
1942 
1943 	/*
1944 	 * The p->mm->numa_scan_seq field gets updated without
1945 	 * exclusive access. Use READ_ONCE() here to ensure
1946 	 * that the field is read in a single access:
1947 	 */
1948 	seq = READ_ONCE(p->mm->numa_scan_seq);
1949 	if (p->numa_scan_seq == seq)
1950 		return;
1951 	p->numa_scan_seq = seq;
1952 	p->numa_scan_period_max = task_scan_max(p);
1953 
1954 	total_faults = p->numa_faults_locality[0] +
1955 		       p->numa_faults_locality[1];
1956 	runtime = numa_get_avg_runtime(p, &period);
1957 
1958 	/* If the task is part of a group prevent parallel updates to group stats */
1959 	if (p->numa_group) {
1960 		group_lock = &p->numa_group->lock;
1961 		spin_lock_irq(group_lock);
1962 	}
1963 
1964 	/* Find the node with the highest number of faults */
1965 	for_each_online_node(nid) {
1966 		/* Keep track of the offsets in numa_faults array */
1967 		int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
1968 		unsigned long faults = 0, group_faults = 0;
1969 		int priv;
1970 
1971 		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1972 			long diff, f_diff, f_weight;
1973 
1974 			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
1975 			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
1976 			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
1977 			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
1978 
1979 			/* Decay existing window, copy faults since last scan */
1980 			diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
1981 			fault_types[priv] += p->numa_faults[membuf_idx];
1982 			p->numa_faults[membuf_idx] = 0;
1983 
1984 			/*
1985 			 * Normalize the faults_from, so all tasks in a group
1986 			 * count according to CPU use, instead of by the raw
1987 			 * number of faults. Tasks with little runtime have
1988 			 * little over-all impact on throughput, and thus their
1989 			 * faults are less important.
1990 			 */
1991 			f_weight = div64_u64(runtime << 16, period + 1);
1992 			f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
1993 				   (total_faults + 1);
1994 			f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
1995 			p->numa_faults[cpubuf_idx] = 0;
1996 
1997 			p->numa_faults[mem_idx] += diff;
1998 			p->numa_faults[cpu_idx] += f_diff;
1999 			faults += p->numa_faults[mem_idx];
2000 			p->total_numa_faults += diff;
2001 			if (p->numa_group) {
2002 				/*
2003 				 * safe because we can only change our own group
2004 				 *
2005 				 * mem_idx represents the offset for a given
2006 				 * nid and priv in a specific region because it
2007 				 * is at the beginning of the numa_faults array.
2008 				 */
2009 				p->numa_group->faults[mem_idx] += diff;
2010 				p->numa_group->faults_cpu[mem_idx] += f_diff;
2011 				p->numa_group->total_faults += diff;
2012 				group_faults += p->numa_group->faults[mem_idx];
2013 			}
2014 		}
2015 
2016 		if (faults > max_faults) {
2017 			max_faults = faults;
2018 			max_nid = nid;
2019 		}
2020 
2021 		if (group_faults > max_group_faults) {
2022 			max_group_faults = group_faults;
2023 			max_group_nid = nid;
2024 		}
2025 	}
2026 
2027 	update_task_scan_period(p, fault_types[0], fault_types[1]);
2028 
2029 	if (p->numa_group) {
2030 		numa_group_count_active_nodes(p->numa_group);
2031 		spin_unlock_irq(group_lock);
2032 		max_nid = preferred_group_nid(p, max_group_nid);
2033 	}
2034 
2035 	if (max_faults) {
2036 		/* Set the new preferred node */
2037 		if (max_nid != p->numa_preferred_nid)
2038 			sched_setnuma(p, max_nid);
2039 
2040 		if (task_node(p) != p->numa_preferred_nid)
2041 			numa_migrate_preferred(p);
2042 	}
2043 }
2044 
2045 static inline int get_numa_group(struct numa_group *grp)
2046 {
2047 	return atomic_inc_not_zero(&grp->refcount);
2048 }
2049 
2050 static inline void put_numa_group(struct numa_group *grp)
2051 {
2052 	if (atomic_dec_and_test(&grp->refcount))
2053 		kfree_rcu(grp, rcu);
2054 }
2055 
2056 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2057 			int *priv)
2058 {
2059 	struct numa_group *grp, *my_grp;
2060 	struct task_struct *tsk;
2061 	bool join = false;
2062 	int cpu = cpupid_to_cpu(cpupid);
2063 	int i;
2064 
2065 	if (unlikely(!p->numa_group)) {
2066 		unsigned int size = sizeof(struct numa_group) +
2067 				    4*nr_node_ids*sizeof(unsigned long);
2068 
2069 		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2070 		if (!grp)
2071 			return;
2072 
2073 		atomic_set(&grp->refcount, 1);
2074 		grp->active_nodes = 1;
2075 		grp->max_faults_cpu = 0;
2076 		spin_lock_init(&grp->lock);
2077 		grp->gid = p->pid;
2078 		/* Second half of the array tracks nids where faults happen */
2079 		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2080 						nr_node_ids;
2081 
2082 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2083 			grp->faults[i] = p->numa_faults[i];
2084 
2085 		grp->total_faults = p->total_numa_faults;
2086 
2087 		grp->nr_tasks++;
2088 		rcu_assign_pointer(p->numa_group, grp);
2089 	}
2090 
2091 	rcu_read_lock();
2092 	tsk = READ_ONCE(cpu_rq(cpu)->curr);
2093 
2094 	if (!cpupid_match_pid(tsk, cpupid))
2095 		goto no_join;
2096 
2097 	grp = rcu_dereference(tsk->numa_group);
2098 	if (!grp)
2099 		goto no_join;
2100 
2101 	my_grp = p->numa_group;
2102 	if (grp == my_grp)
2103 		goto no_join;
2104 
2105 	/*
2106 	 * Only join the other group if its bigger; if we're the bigger group,
2107 	 * the other task will join us.
2108 	 */
2109 	if (my_grp->nr_tasks > grp->nr_tasks)
2110 		goto no_join;
2111 
2112 	/*
2113 	 * Tie-break on the grp address.
2114 	 */
2115 	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2116 		goto no_join;
2117 
2118 	/* Always join threads in the same process. */
2119 	if (tsk->mm == current->mm)
2120 		join = true;
2121 
2122 	/* Simple filter to avoid false positives due to PID collisions */
2123 	if (flags & TNF_SHARED)
2124 		join = true;
2125 
2126 	/* Update priv based on whether false sharing was detected */
2127 	*priv = !join;
2128 
2129 	if (join && !get_numa_group(grp))
2130 		goto no_join;
2131 
2132 	rcu_read_unlock();
2133 
2134 	if (!join)
2135 		return;
2136 
2137 	BUG_ON(irqs_disabled());
2138 	double_lock_irq(&my_grp->lock, &grp->lock);
2139 
2140 	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2141 		my_grp->faults[i] -= p->numa_faults[i];
2142 		grp->faults[i] += p->numa_faults[i];
2143 	}
2144 	my_grp->total_faults -= p->total_numa_faults;
2145 	grp->total_faults += p->total_numa_faults;
2146 
2147 	my_grp->nr_tasks--;
2148 	grp->nr_tasks++;
2149 
2150 	spin_unlock(&my_grp->lock);
2151 	spin_unlock_irq(&grp->lock);
2152 
2153 	rcu_assign_pointer(p->numa_group, grp);
2154 
2155 	put_numa_group(my_grp);
2156 	return;
2157 
2158 no_join:
2159 	rcu_read_unlock();
2160 	return;
2161 }
2162 
2163 void task_numa_free(struct task_struct *p)
2164 {
2165 	struct numa_group *grp = p->numa_group;
2166 	void *numa_faults = p->numa_faults;
2167 	unsigned long flags;
2168 	int i;
2169 
2170 	if (grp) {
2171 		spin_lock_irqsave(&grp->lock, flags);
2172 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2173 			grp->faults[i] -= p->numa_faults[i];
2174 		grp->total_faults -= p->total_numa_faults;
2175 
2176 		grp->nr_tasks--;
2177 		spin_unlock_irqrestore(&grp->lock, flags);
2178 		RCU_INIT_POINTER(p->numa_group, NULL);
2179 		put_numa_group(grp);
2180 	}
2181 
2182 	p->numa_faults = NULL;
2183 	kfree(numa_faults);
2184 }
2185 
2186 /*
2187  * Got a PROT_NONE fault for a page on @node.
2188  */
2189 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2190 {
2191 	struct task_struct *p = current;
2192 	bool migrated = flags & TNF_MIGRATED;
2193 	int cpu_node = task_node(current);
2194 	int local = !!(flags & TNF_FAULT_LOCAL);
2195 	struct numa_group *ng;
2196 	int priv;
2197 
2198 	if (!static_branch_likely(&sched_numa_balancing))
2199 		return;
2200 
2201 	/* for example, ksmd faulting in a user's mm */
2202 	if (!p->mm)
2203 		return;
2204 
2205 	/* Allocate buffer to track faults on a per-node basis */
2206 	if (unlikely(!p->numa_faults)) {
2207 		int size = sizeof(*p->numa_faults) *
2208 			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2209 
2210 		p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2211 		if (!p->numa_faults)
2212 			return;
2213 
2214 		p->total_numa_faults = 0;
2215 		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2216 	}
2217 
2218 	/*
2219 	 * First accesses are treated as private, otherwise consider accesses
2220 	 * to be private if the accessing pid has not changed
2221 	 */
2222 	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2223 		priv = 1;
2224 	} else {
2225 		priv = cpupid_match_pid(p, last_cpupid);
2226 		if (!priv && !(flags & TNF_NO_GROUP))
2227 			task_numa_group(p, last_cpupid, flags, &priv);
2228 	}
2229 
2230 	/*
2231 	 * If a workload spans multiple NUMA nodes, a shared fault that
2232 	 * occurs wholly within the set of nodes that the workload is
2233 	 * actively using should be counted as local. This allows the
2234 	 * scan rate to slow down when a workload has settled down.
2235 	 */
2236 	ng = p->numa_group;
2237 	if (!priv && !local && ng && ng->active_nodes > 1 &&
2238 				numa_is_active_node(cpu_node, ng) &&
2239 				numa_is_active_node(mem_node, ng))
2240 		local = 1;
2241 
2242 	task_numa_placement(p);
2243 
2244 	/*
2245 	 * Retry task to preferred node migration periodically, in case it
2246 	 * case it previously failed, or the scheduler moved us.
2247 	 */
2248 	if (time_after(jiffies, p->numa_migrate_retry))
2249 		numa_migrate_preferred(p);
2250 
2251 	if (migrated)
2252 		p->numa_pages_migrated += pages;
2253 	if (flags & TNF_MIGRATE_FAIL)
2254 		p->numa_faults_locality[2] += pages;
2255 
2256 	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2257 	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2258 	p->numa_faults_locality[local] += pages;
2259 }
2260 
2261 static void reset_ptenuma_scan(struct task_struct *p)
2262 {
2263 	/*
2264 	 * We only did a read acquisition of the mmap sem, so
2265 	 * p->mm->numa_scan_seq is written to without exclusive access
2266 	 * and the update is not guaranteed to be atomic. That's not
2267 	 * much of an issue though, since this is just used for
2268 	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2269 	 * expensive, to avoid any form of compiler optimizations:
2270 	 */
2271 	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2272 	p->mm->numa_scan_offset = 0;
2273 }
2274 
2275 /*
2276  * The expensive part of numa migration is done from task_work context.
2277  * Triggered from task_tick_numa().
2278  */
2279 void task_numa_work(struct callback_head *work)
2280 {
2281 	unsigned long migrate, next_scan, now = jiffies;
2282 	struct task_struct *p = current;
2283 	struct mm_struct *mm = p->mm;
2284 	u64 runtime = p->se.sum_exec_runtime;
2285 	struct vm_area_struct *vma;
2286 	unsigned long start, end;
2287 	unsigned long nr_pte_updates = 0;
2288 	long pages, virtpages;
2289 
2290 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2291 
2292 	work->next = work; /* protect against double add */
2293 	/*
2294 	 * Who cares about NUMA placement when they're dying.
2295 	 *
2296 	 * NOTE: make sure not to dereference p->mm before this check,
2297 	 * exit_task_work() happens _after_ exit_mm() so we could be called
2298 	 * without p->mm even though we still had it when we enqueued this
2299 	 * work.
2300 	 */
2301 	if (p->flags & PF_EXITING)
2302 		return;
2303 
2304 	if (!mm->numa_next_scan) {
2305 		mm->numa_next_scan = now +
2306 			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2307 	}
2308 
2309 	/*
2310 	 * Enforce maximal scan/migration frequency..
2311 	 */
2312 	migrate = mm->numa_next_scan;
2313 	if (time_before(now, migrate))
2314 		return;
2315 
2316 	if (p->numa_scan_period == 0) {
2317 		p->numa_scan_period_max = task_scan_max(p);
2318 		p->numa_scan_period = task_scan_min(p);
2319 	}
2320 
2321 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2322 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2323 		return;
2324 
2325 	/*
2326 	 * Delay this task enough that another task of this mm will likely win
2327 	 * the next time around.
2328 	 */
2329 	p->node_stamp += 2 * TICK_NSEC;
2330 
2331 	start = mm->numa_scan_offset;
2332 	pages = sysctl_numa_balancing_scan_size;
2333 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2334 	virtpages = pages * 8;	   /* Scan up to this much virtual space */
2335 	if (!pages)
2336 		return;
2337 
2338 
2339 	down_read(&mm->mmap_sem);
2340 	vma = find_vma(mm, start);
2341 	if (!vma) {
2342 		reset_ptenuma_scan(p);
2343 		start = 0;
2344 		vma = mm->mmap;
2345 	}
2346 	for (; vma; vma = vma->vm_next) {
2347 		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2348 			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2349 			continue;
2350 		}
2351 
2352 		/*
2353 		 * Shared library pages mapped by multiple processes are not
2354 		 * migrated as it is expected they are cache replicated. Avoid
2355 		 * hinting faults in read-only file-backed mappings or the vdso
2356 		 * as migrating the pages will be of marginal benefit.
2357 		 */
2358 		if (!vma->vm_mm ||
2359 		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2360 			continue;
2361 
2362 		/*
2363 		 * Skip inaccessible VMAs to avoid any confusion between
2364 		 * PROT_NONE and NUMA hinting ptes
2365 		 */
2366 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2367 			continue;
2368 
2369 		do {
2370 			start = max(start, vma->vm_start);
2371 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2372 			end = min(end, vma->vm_end);
2373 			nr_pte_updates = change_prot_numa(vma, start, end);
2374 
2375 			/*
2376 			 * Try to scan sysctl_numa_balancing_size worth of
2377 			 * hpages that have at least one present PTE that
2378 			 * is not already pte-numa. If the VMA contains
2379 			 * areas that are unused or already full of prot_numa
2380 			 * PTEs, scan up to virtpages, to skip through those
2381 			 * areas faster.
2382 			 */
2383 			if (nr_pte_updates)
2384 				pages -= (end - start) >> PAGE_SHIFT;
2385 			virtpages -= (end - start) >> PAGE_SHIFT;
2386 
2387 			start = end;
2388 			if (pages <= 0 || virtpages <= 0)
2389 				goto out;
2390 
2391 			cond_resched();
2392 		} while (end != vma->vm_end);
2393 	}
2394 
2395 out:
2396 	/*
2397 	 * It is possible to reach the end of the VMA list but the last few
2398 	 * VMAs are not guaranteed to the vma_migratable. If they are not, we
2399 	 * would find the !migratable VMA on the next scan but not reset the
2400 	 * scanner to the start so check it now.
2401 	 */
2402 	if (vma)
2403 		mm->numa_scan_offset = start;
2404 	else
2405 		reset_ptenuma_scan(p);
2406 	up_read(&mm->mmap_sem);
2407 
2408 	/*
2409 	 * Make sure tasks use at least 32x as much time to run other code
2410 	 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2411 	 * Usually update_task_scan_period slows down scanning enough; on an
2412 	 * overloaded system we need to limit overhead on a per task basis.
2413 	 */
2414 	if (unlikely(p->se.sum_exec_runtime != runtime)) {
2415 		u64 diff = p->se.sum_exec_runtime - runtime;
2416 		p->node_stamp += 32 * diff;
2417 	}
2418 }
2419 
2420 /*
2421  * Drive the periodic memory faults..
2422  */
2423 void task_tick_numa(struct rq *rq, struct task_struct *curr)
2424 {
2425 	struct callback_head *work = &curr->numa_work;
2426 	u64 period, now;
2427 
2428 	/*
2429 	 * We don't care about NUMA placement if we don't have memory.
2430 	 */
2431 	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2432 		return;
2433 
2434 	/*
2435 	 * Using runtime rather than walltime has the dual advantage that
2436 	 * we (mostly) drive the selection from busy threads and that the
2437 	 * task needs to have done some actual work before we bother with
2438 	 * NUMA placement.
2439 	 */
2440 	now = curr->se.sum_exec_runtime;
2441 	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2442 
2443 	if (now > curr->node_stamp + period) {
2444 		if (!curr->node_stamp)
2445 			curr->numa_scan_period = task_scan_min(curr);
2446 		curr->node_stamp += period;
2447 
2448 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2449 			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2450 			task_work_add(curr, work, true);
2451 		}
2452 	}
2453 }
2454 #else
2455 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2456 {
2457 }
2458 
2459 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2460 {
2461 }
2462 
2463 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2464 {
2465 }
2466 #endif /* CONFIG_NUMA_BALANCING */
2467 
2468 static void
2469 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2470 {
2471 	update_load_add(&cfs_rq->load, se->load.weight);
2472 	if (!parent_entity(se))
2473 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2474 #ifdef CONFIG_SMP
2475 	if (entity_is_task(se)) {
2476 		struct rq *rq = rq_of(cfs_rq);
2477 
2478 		account_numa_enqueue(rq, task_of(se));
2479 		list_add(&se->group_node, &rq->cfs_tasks);
2480 	}
2481 #endif
2482 	cfs_rq->nr_running++;
2483 }
2484 
2485 static void
2486 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2487 {
2488 	update_load_sub(&cfs_rq->load, se->load.weight);
2489 	if (!parent_entity(se))
2490 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2491 #ifdef CONFIG_SMP
2492 	if (entity_is_task(se)) {
2493 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2494 		list_del_init(&se->group_node);
2495 	}
2496 #endif
2497 	cfs_rq->nr_running--;
2498 }
2499 
2500 #ifdef CONFIG_FAIR_GROUP_SCHED
2501 # ifdef CONFIG_SMP
2502 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
2503 {
2504 	long tg_weight;
2505 
2506 	/*
2507 	 * Use this CPU's real-time load instead of the last load contribution
2508 	 * as the updating of the contribution is delayed, and we will use the
2509 	 * the real-time load to calc the share. See update_tg_load_avg().
2510 	 */
2511 	tg_weight = atomic_long_read(&tg->load_avg);
2512 	tg_weight -= cfs_rq->tg_load_avg_contrib;
2513 	tg_weight += cfs_rq->load.weight;
2514 
2515 	return tg_weight;
2516 }
2517 
2518 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2519 {
2520 	long tg_weight, load, shares;
2521 
2522 	tg_weight = calc_tg_weight(tg, cfs_rq);
2523 	load = cfs_rq->load.weight;
2524 
2525 	shares = (tg->shares * load);
2526 	if (tg_weight)
2527 		shares /= tg_weight;
2528 
2529 	if (shares < MIN_SHARES)
2530 		shares = MIN_SHARES;
2531 	if (shares > tg->shares)
2532 		shares = tg->shares;
2533 
2534 	return shares;
2535 }
2536 # else /* CONFIG_SMP */
2537 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2538 {
2539 	return tg->shares;
2540 }
2541 # endif /* CONFIG_SMP */
2542 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2543 			    unsigned long weight)
2544 {
2545 	if (se->on_rq) {
2546 		/* commit outstanding execution time */
2547 		if (cfs_rq->curr == se)
2548 			update_curr(cfs_rq);
2549 		account_entity_dequeue(cfs_rq, se);
2550 	}
2551 
2552 	update_load_set(&se->load, weight);
2553 
2554 	if (se->on_rq)
2555 		account_entity_enqueue(cfs_rq, se);
2556 }
2557 
2558 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2559 
2560 static void update_cfs_shares(struct cfs_rq *cfs_rq)
2561 {
2562 	struct task_group *tg;
2563 	struct sched_entity *se;
2564 	long shares;
2565 
2566 	tg = cfs_rq->tg;
2567 	se = tg->se[cpu_of(rq_of(cfs_rq))];
2568 	if (!se || throttled_hierarchy(cfs_rq))
2569 		return;
2570 #ifndef CONFIG_SMP
2571 	if (likely(se->load.weight == tg->shares))
2572 		return;
2573 #endif
2574 	shares = calc_cfs_shares(cfs_rq, tg);
2575 
2576 	reweight_entity(cfs_rq_of(se), se, shares);
2577 }
2578 #else /* CONFIG_FAIR_GROUP_SCHED */
2579 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2580 {
2581 }
2582 #endif /* CONFIG_FAIR_GROUP_SCHED */
2583 
2584 #ifdef CONFIG_SMP
2585 /* Precomputed fixed inverse multiplies for multiplication by y^n */
2586 static const u32 runnable_avg_yN_inv[] = {
2587 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2588 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2589 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2590 	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2591 	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2592 	0x85aac367, 0x82cd8698,
2593 };
2594 
2595 /*
2596  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
2597  * over-estimates when re-combining.
2598  */
2599 static const u32 runnable_avg_yN_sum[] = {
2600 	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2601 	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2602 	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2603 };
2604 
2605 /*
2606  * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
2607  * lower integers. See Documentation/scheduler/sched-avg.txt how these
2608  * were generated:
2609  */
2610 static const u32 __accumulated_sum_N32[] = {
2611 	    0, 23371, 35056, 40899, 43820, 45281,
2612 	46011, 46376, 46559, 46650, 46696, 46719,
2613 };
2614 
2615 /*
2616  * Approximate:
2617  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
2618  */
2619 static __always_inline u64 decay_load(u64 val, u64 n)
2620 {
2621 	unsigned int local_n;
2622 
2623 	if (!n)
2624 		return val;
2625 	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2626 		return 0;
2627 
2628 	/* after bounds checking we can collapse to 32-bit */
2629 	local_n = n;
2630 
2631 	/*
2632 	 * As y^PERIOD = 1/2, we can combine
2633 	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2634 	 * With a look-up table which covers y^n (n<PERIOD)
2635 	 *
2636 	 * To achieve constant time decay_load.
2637 	 */
2638 	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2639 		val >>= local_n / LOAD_AVG_PERIOD;
2640 		local_n %= LOAD_AVG_PERIOD;
2641 	}
2642 
2643 	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2644 	return val;
2645 }
2646 
2647 /*
2648  * For updates fully spanning n periods, the contribution to runnable
2649  * average will be: \Sum 1024*y^n
2650  *
2651  * We can compute this reasonably efficiently by combining:
2652  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
2653  */
2654 static u32 __compute_runnable_contrib(u64 n)
2655 {
2656 	u32 contrib = 0;
2657 
2658 	if (likely(n <= LOAD_AVG_PERIOD))
2659 		return runnable_avg_yN_sum[n];
2660 	else if (unlikely(n >= LOAD_AVG_MAX_N))
2661 		return LOAD_AVG_MAX;
2662 
2663 	/* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
2664 	contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
2665 	n %= LOAD_AVG_PERIOD;
2666 	contrib = decay_load(contrib, n);
2667 	return contrib + runnable_avg_yN_sum[n];
2668 }
2669 
2670 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
2671 
2672 /*
2673  * We can represent the historical contribution to runnable average as the
2674  * coefficients of a geometric series.  To do this we sub-divide our runnable
2675  * history into segments of approximately 1ms (1024us); label the segment that
2676  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2677  *
2678  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2679  *      p0            p1           p2
2680  *     (now)       (~1ms ago)  (~2ms ago)
2681  *
2682  * Let u_i denote the fraction of p_i that the entity was runnable.
2683  *
2684  * We then designate the fractions u_i as our co-efficients, yielding the
2685  * following representation of historical load:
2686  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2687  *
2688  * We choose y based on the with of a reasonably scheduling period, fixing:
2689  *   y^32 = 0.5
2690  *
2691  * This means that the contribution to load ~32ms ago (u_32) will be weighted
2692  * approximately half as much as the contribution to load within the last ms
2693  * (u_0).
2694  *
2695  * When a period "rolls over" and we have new u_0`, multiplying the previous
2696  * sum again by y is sufficient to update:
2697  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2698  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2699  */
2700 static __always_inline int
2701 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2702 		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
2703 {
2704 	u64 delta, scaled_delta, periods;
2705 	u32 contrib;
2706 	unsigned int delta_w, scaled_delta_w, decayed = 0;
2707 	unsigned long scale_freq, scale_cpu;
2708 
2709 	delta = now - sa->last_update_time;
2710 	/*
2711 	 * This should only happen when time goes backwards, which it
2712 	 * unfortunately does during sched clock init when we swap over to TSC.
2713 	 */
2714 	if ((s64)delta < 0) {
2715 		sa->last_update_time = now;
2716 		return 0;
2717 	}
2718 
2719 	/*
2720 	 * Use 1024ns as the unit of measurement since it's a reasonable
2721 	 * approximation of 1us and fast to compute.
2722 	 */
2723 	delta >>= 10;
2724 	if (!delta)
2725 		return 0;
2726 	sa->last_update_time = now;
2727 
2728 	scale_freq = arch_scale_freq_capacity(NULL, cpu);
2729 	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2730 
2731 	/* delta_w is the amount already accumulated against our next period */
2732 	delta_w = sa->period_contrib;
2733 	if (delta + delta_w >= 1024) {
2734 		decayed = 1;
2735 
2736 		/* how much left for next period will start over, we don't know yet */
2737 		sa->period_contrib = 0;
2738 
2739 		/*
2740 		 * Now that we know we're crossing a period boundary, figure
2741 		 * out how much from delta we need to complete the current
2742 		 * period and accrue it.
2743 		 */
2744 		delta_w = 1024 - delta_w;
2745 		scaled_delta_w = cap_scale(delta_w, scale_freq);
2746 		if (weight) {
2747 			sa->load_sum += weight * scaled_delta_w;
2748 			if (cfs_rq) {
2749 				cfs_rq->runnable_load_sum +=
2750 						weight * scaled_delta_w;
2751 			}
2752 		}
2753 		if (running)
2754 			sa->util_sum += scaled_delta_w * scale_cpu;
2755 
2756 		delta -= delta_w;
2757 
2758 		/* Figure out how many additional periods this update spans */
2759 		periods = delta / 1024;
2760 		delta %= 1024;
2761 
2762 		sa->load_sum = decay_load(sa->load_sum, periods + 1);
2763 		if (cfs_rq) {
2764 			cfs_rq->runnable_load_sum =
2765 				decay_load(cfs_rq->runnable_load_sum, periods + 1);
2766 		}
2767 		sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
2768 
2769 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
2770 		contrib = __compute_runnable_contrib(periods);
2771 		contrib = cap_scale(contrib, scale_freq);
2772 		if (weight) {
2773 			sa->load_sum += weight * contrib;
2774 			if (cfs_rq)
2775 				cfs_rq->runnable_load_sum += weight * contrib;
2776 		}
2777 		if (running)
2778 			sa->util_sum += contrib * scale_cpu;
2779 	}
2780 
2781 	/* Remainder of delta accrued against u_0` */
2782 	scaled_delta = cap_scale(delta, scale_freq);
2783 	if (weight) {
2784 		sa->load_sum += weight * scaled_delta;
2785 		if (cfs_rq)
2786 			cfs_rq->runnable_load_sum += weight * scaled_delta;
2787 	}
2788 	if (running)
2789 		sa->util_sum += scaled_delta * scale_cpu;
2790 
2791 	sa->period_contrib += delta;
2792 
2793 	if (decayed) {
2794 		sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
2795 		if (cfs_rq) {
2796 			cfs_rq->runnable_load_avg =
2797 				div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2798 		}
2799 		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2800 	}
2801 
2802 	return decayed;
2803 }
2804 
2805 #ifdef CONFIG_FAIR_GROUP_SCHED
2806 /*
2807  * Updating tg's load_avg is necessary before update_cfs_share (which is done)
2808  * and effective_load (which is not done because it is too costly).
2809  */
2810 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2811 {
2812 	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
2813 
2814 	/*
2815 	 * No need to update load_avg for root_task_group as it is not used.
2816 	 */
2817 	if (cfs_rq->tg == &root_task_group)
2818 		return;
2819 
2820 	if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
2821 		atomic_long_add(delta, &cfs_rq->tg->load_avg);
2822 		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
2823 	}
2824 }
2825 
2826 /*
2827  * Called within set_task_rq() right before setting a task's cpu. The
2828  * caller only guarantees p->pi_lock is held; no other assumptions,
2829  * including the state of rq->lock, should be made.
2830  */
2831 void set_task_rq_fair(struct sched_entity *se,
2832 		      struct cfs_rq *prev, struct cfs_rq *next)
2833 {
2834 	if (!sched_feat(ATTACH_AGE_LOAD))
2835 		return;
2836 
2837 	/*
2838 	 * We are supposed to update the task to "current" time, then its up to
2839 	 * date and ready to go to new CPU/cfs_rq. But we have difficulty in
2840 	 * getting what current time is, so simply throw away the out-of-date
2841 	 * time. This will result in the wakee task is less decayed, but giving
2842 	 * the wakee more load sounds not bad.
2843 	 */
2844 	if (se->avg.last_update_time && prev) {
2845 		u64 p_last_update_time;
2846 		u64 n_last_update_time;
2847 
2848 #ifndef CONFIG_64BIT
2849 		u64 p_last_update_time_copy;
2850 		u64 n_last_update_time_copy;
2851 
2852 		do {
2853 			p_last_update_time_copy = prev->load_last_update_time_copy;
2854 			n_last_update_time_copy = next->load_last_update_time_copy;
2855 
2856 			smp_rmb();
2857 
2858 			p_last_update_time = prev->avg.last_update_time;
2859 			n_last_update_time = next->avg.last_update_time;
2860 
2861 		} while (p_last_update_time != p_last_update_time_copy ||
2862 			 n_last_update_time != n_last_update_time_copy);
2863 #else
2864 		p_last_update_time = prev->avg.last_update_time;
2865 		n_last_update_time = next->avg.last_update_time;
2866 #endif
2867 		__update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
2868 				  &se->avg, 0, 0, NULL);
2869 		se->avg.last_update_time = n_last_update_time;
2870 	}
2871 }
2872 #else /* CONFIG_FAIR_GROUP_SCHED */
2873 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2874 #endif /* CONFIG_FAIR_GROUP_SCHED */
2875 
2876 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2877 
2878 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2879 {
2880 	struct rq *rq = rq_of(cfs_rq);
2881 	int cpu = cpu_of(rq);
2882 
2883 	if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
2884 		unsigned long max = rq->cpu_capacity_orig;
2885 
2886 		/*
2887 		 * There are a few boundary cases this might miss but it should
2888 		 * get called often enough that that should (hopefully) not be
2889 		 * a real problem -- added to that it only calls on the local
2890 		 * CPU, so if we enqueue remotely we'll miss an update, but
2891 		 * the next tick/schedule should update.
2892 		 *
2893 		 * It will not get called when we go idle, because the idle
2894 		 * thread is a different class (!fair), nor will the utilization
2895 		 * number include things like RT tasks.
2896 		 *
2897 		 * As is, the util number is not freq-invariant (we'd have to
2898 		 * implement arch_scale_freq_capacity() for that).
2899 		 *
2900 		 * See cpu_util().
2901 		 */
2902 		cpufreq_update_util(rq_clock(rq),
2903 				    min(cfs_rq->avg.util_avg, max), max);
2904 	}
2905 }
2906 
2907 /*
2908  * Unsigned subtract and clamp on underflow.
2909  *
2910  * Explicitly do a load-store to ensure the intermediate value never hits
2911  * memory. This allows lockless observations without ever seeing the negative
2912  * values.
2913  */
2914 #define sub_positive(_ptr, _val) do {				\
2915 	typeof(_ptr) ptr = (_ptr);				\
2916 	typeof(*ptr) val = (_val);				\
2917 	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
2918 	res = var - val;					\
2919 	if (res > var)						\
2920 		res = 0;					\
2921 	WRITE_ONCE(*ptr, res);					\
2922 } while (0)
2923 
2924 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
2925 static inline int
2926 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
2927 {
2928 	struct sched_avg *sa = &cfs_rq->avg;
2929 	int decayed, removed_load = 0, removed_util = 0;
2930 
2931 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
2932 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
2933 		sub_positive(&sa->load_avg, r);
2934 		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
2935 		removed_load = 1;
2936 	}
2937 
2938 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2939 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2940 		sub_positive(&sa->util_avg, r);
2941 		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
2942 		removed_util = 1;
2943 	}
2944 
2945 	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2946 		scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
2947 
2948 #ifndef CONFIG_64BIT
2949 	smp_wmb();
2950 	cfs_rq->load_last_update_time_copy = sa->last_update_time;
2951 #endif
2952 
2953 	if (update_freq && (decayed || removed_util))
2954 		cfs_rq_util_change(cfs_rq);
2955 
2956 	return decayed || removed_load;
2957 }
2958 
2959 /* Update task and its cfs_rq load average */
2960 static inline void update_load_avg(struct sched_entity *se, int update_tg)
2961 {
2962 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2963 	u64 now = cfs_rq_clock_task(cfs_rq);
2964 	struct rq *rq = rq_of(cfs_rq);
2965 	int cpu = cpu_of(rq);
2966 
2967 	/*
2968 	 * Track task load average for carrying it to new CPU after migrated, and
2969 	 * track group sched_entity load average for task_h_load calc in migration
2970 	 */
2971 	__update_load_avg(now, cpu, &se->avg,
2972 			  se->on_rq * scale_load_down(se->load.weight),
2973 			  cfs_rq->curr == se, NULL);
2974 
2975 	if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
2976 		update_tg_load_avg(cfs_rq, 0);
2977 }
2978 
2979 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2980 {
2981 	if (!sched_feat(ATTACH_AGE_LOAD))
2982 		goto skip_aging;
2983 
2984 	/*
2985 	 * If we got migrated (either between CPUs or between cgroups) we'll
2986 	 * have aged the average right before clearing @last_update_time.
2987 	 */
2988 	if (se->avg.last_update_time) {
2989 		__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2990 				  &se->avg, 0, 0, NULL);
2991 
2992 		/*
2993 		 * XXX: we could have just aged the entire load away if we've been
2994 		 * absent from the fair class for too long.
2995 		 */
2996 	}
2997 
2998 skip_aging:
2999 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
3000 	cfs_rq->avg.load_avg += se->avg.load_avg;
3001 	cfs_rq->avg.load_sum += se->avg.load_sum;
3002 	cfs_rq->avg.util_avg += se->avg.util_avg;
3003 	cfs_rq->avg.util_sum += se->avg.util_sum;
3004 
3005 	cfs_rq_util_change(cfs_rq);
3006 }
3007 
3008 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3009 {
3010 	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
3011 			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
3012 			  cfs_rq->curr == se, NULL);
3013 
3014 	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3015 	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
3016 	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3017 	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3018 
3019 	cfs_rq_util_change(cfs_rq);
3020 }
3021 
3022 /* Add the load generated by se into cfs_rq's load average */
3023 static inline void
3024 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3025 {
3026 	struct sched_avg *sa = &se->avg;
3027 	u64 now = cfs_rq_clock_task(cfs_rq);
3028 	int migrated, decayed;
3029 
3030 	migrated = !sa->last_update_time;
3031 	if (!migrated) {
3032 		__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
3033 			se->on_rq * scale_load_down(se->load.weight),
3034 			cfs_rq->curr == se, NULL);
3035 	}
3036 
3037 	decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
3038 
3039 	cfs_rq->runnable_load_avg += sa->load_avg;
3040 	cfs_rq->runnable_load_sum += sa->load_sum;
3041 
3042 	if (migrated)
3043 		attach_entity_load_avg(cfs_rq, se);
3044 
3045 	if (decayed || migrated)
3046 		update_tg_load_avg(cfs_rq, 0);
3047 }
3048 
3049 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
3050 static inline void
3051 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3052 {
3053 	update_load_avg(se, 1);
3054 
3055 	cfs_rq->runnable_load_avg =
3056 		max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
3057 	cfs_rq->runnable_load_sum =
3058 		max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
3059 }
3060 
3061 #ifndef CONFIG_64BIT
3062 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3063 {
3064 	u64 last_update_time_copy;
3065 	u64 last_update_time;
3066 
3067 	do {
3068 		last_update_time_copy = cfs_rq->load_last_update_time_copy;
3069 		smp_rmb();
3070 		last_update_time = cfs_rq->avg.last_update_time;
3071 	} while (last_update_time != last_update_time_copy);
3072 
3073 	return last_update_time;
3074 }
3075 #else
3076 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3077 {
3078 	return cfs_rq->avg.last_update_time;
3079 }
3080 #endif
3081 
3082 /*
3083  * Task first catches up with cfs_rq, and then subtract
3084  * itself from the cfs_rq (task must be off the queue now).
3085  */
3086 void remove_entity_load_avg(struct sched_entity *se)
3087 {
3088 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3089 	u64 last_update_time;
3090 
3091 	/*
3092 	 * Newly created task or never used group entity should not be removed
3093 	 * from its (source) cfs_rq
3094 	 */
3095 	if (se->avg.last_update_time == 0)
3096 		return;
3097 
3098 	last_update_time = cfs_rq_last_update_time(cfs_rq);
3099 
3100 	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
3101 	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
3102 	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
3103 }
3104 
3105 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3106 {
3107 	return cfs_rq->runnable_load_avg;
3108 }
3109 
3110 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3111 {
3112 	return cfs_rq->avg.load_avg;
3113 }
3114 
3115 static int idle_balance(struct rq *this_rq);
3116 
3117 #else /* CONFIG_SMP */
3118 
3119 static inline void update_load_avg(struct sched_entity *se, int not_used)
3120 {
3121 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3122 	struct rq *rq = rq_of(cfs_rq);
3123 
3124 	cpufreq_trigger_update(rq_clock(rq));
3125 }
3126 
3127 static inline void
3128 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3129 static inline void
3130 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3131 static inline void remove_entity_load_avg(struct sched_entity *se) {}
3132 
3133 static inline void
3134 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3135 static inline void
3136 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3137 
3138 static inline int idle_balance(struct rq *rq)
3139 {
3140 	return 0;
3141 }
3142 
3143 #endif /* CONFIG_SMP */
3144 
3145 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
3146 {
3147 #ifdef CONFIG_SCHEDSTATS
3148 	struct task_struct *tsk = NULL;
3149 
3150 	if (entity_is_task(se))
3151 		tsk = task_of(se);
3152 
3153 	if (se->statistics.sleep_start) {
3154 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
3155 
3156 		if ((s64)delta < 0)
3157 			delta = 0;
3158 
3159 		if (unlikely(delta > se->statistics.sleep_max))
3160 			se->statistics.sleep_max = delta;
3161 
3162 		se->statistics.sleep_start = 0;
3163 		se->statistics.sum_sleep_runtime += delta;
3164 
3165 		if (tsk) {
3166 			account_scheduler_latency(tsk, delta >> 10, 1);
3167 			trace_sched_stat_sleep(tsk, delta);
3168 		}
3169 	}
3170 	if (se->statistics.block_start) {
3171 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
3172 
3173 		if ((s64)delta < 0)
3174 			delta = 0;
3175 
3176 		if (unlikely(delta > se->statistics.block_max))
3177 			se->statistics.block_max = delta;
3178 
3179 		se->statistics.block_start = 0;
3180 		se->statistics.sum_sleep_runtime += delta;
3181 
3182 		if (tsk) {
3183 			if (tsk->in_iowait) {
3184 				se->statistics.iowait_sum += delta;
3185 				se->statistics.iowait_count++;
3186 				trace_sched_stat_iowait(tsk, delta);
3187 			}
3188 
3189 			trace_sched_stat_blocked(tsk, delta);
3190 
3191 			/*
3192 			 * Blocking time is in units of nanosecs, so shift by
3193 			 * 20 to get a milliseconds-range estimation of the
3194 			 * amount of time that the task spent sleeping:
3195 			 */
3196 			if (unlikely(prof_on == SLEEP_PROFILING)) {
3197 				profile_hits(SLEEP_PROFILING,
3198 						(void *)get_wchan(tsk),
3199 						delta >> 20);
3200 			}
3201 			account_scheduler_latency(tsk, delta >> 10, 0);
3202 		}
3203 	}
3204 #endif
3205 }
3206 
3207 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3208 {
3209 #ifdef CONFIG_SCHED_DEBUG
3210 	s64 d = se->vruntime - cfs_rq->min_vruntime;
3211 
3212 	if (d < 0)
3213 		d = -d;
3214 
3215 	if (d > 3*sysctl_sched_latency)
3216 		schedstat_inc(cfs_rq, nr_spread_over);
3217 #endif
3218 }
3219 
3220 static void
3221 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3222 {
3223 	u64 vruntime = cfs_rq->min_vruntime;
3224 
3225 	/*
3226 	 * The 'current' period is already promised to the current tasks,
3227 	 * however the extra weight of the new task will slow them down a
3228 	 * little, place the new task so that it fits in the slot that
3229 	 * stays open at the end.
3230 	 */
3231 	if (initial && sched_feat(START_DEBIT))
3232 		vruntime += sched_vslice(cfs_rq, se);
3233 
3234 	/* sleeps up to a single latency don't count. */
3235 	if (!initial) {
3236 		unsigned long thresh = sysctl_sched_latency;
3237 
3238 		/*
3239 		 * Halve their sleep time's effect, to allow
3240 		 * for a gentler effect of sleepers:
3241 		 */
3242 		if (sched_feat(GENTLE_FAIR_SLEEPERS))
3243 			thresh >>= 1;
3244 
3245 		vruntime -= thresh;
3246 	}
3247 
3248 	/* ensure we never gain time by being placed backwards. */
3249 	se->vruntime = max_vruntime(se->vruntime, vruntime);
3250 }
3251 
3252 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3253 
3254 static inline void check_schedstat_required(void)
3255 {
3256 #ifdef CONFIG_SCHEDSTATS
3257 	if (schedstat_enabled())
3258 		return;
3259 
3260 	/* Force schedstat enabled if a dependent tracepoint is active */
3261 	if (trace_sched_stat_wait_enabled()    ||
3262 			trace_sched_stat_sleep_enabled()   ||
3263 			trace_sched_stat_iowait_enabled()  ||
3264 			trace_sched_stat_blocked_enabled() ||
3265 			trace_sched_stat_runtime_enabled())  {
3266 		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3267 			     "stat_blocked and stat_runtime require the "
3268 			     "kernel parameter schedstats=enabled or "
3269 			     "kernel.sched_schedstats=1\n");
3270 	}
3271 #endif
3272 }
3273 
3274 
3275 /*
3276  * MIGRATION
3277  *
3278  *	dequeue
3279  *	  update_curr()
3280  *	    update_min_vruntime()
3281  *	  vruntime -= min_vruntime
3282  *
3283  *	enqueue
3284  *	  update_curr()
3285  *	    update_min_vruntime()
3286  *	  vruntime += min_vruntime
3287  *
3288  * this way the vruntime transition between RQs is done when both
3289  * min_vruntime are up-to-date.
3290  *
3291  * WAKEUP (remote)
3292  *
3293  *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
3294  *	  vruntime -= min_vruntime
3295  *
3296  *	enqueue
3297  *	  update_curr()
3298  *	    update_min_vruntime()
3299  *	  vruntime += min_vruntime
3300  *
3301  * this way we don't have the most up-to-date min_vruntime on the originating
3302  * CPU and an up-to-date min_vruntime on the destination CPU.
3303  */
3304 
3305 static void
3306 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3307 {
3308 	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
3309 	bool curr = cfs_rq->curr == se;
3310 
3311 	/*
3312 	 * If we're the current task, we must renormalise before calling
3313 	 * update_curr().
3314 	 */
3315 	if (renorm && curr)
3316 		se->vruntime += cfs_rq->min_vruntime;
3317 
3318 	update_curr(cfs_rq);
3319 
3320 	/*
3321 	 * Otherwise, renormalise after, such that we're placed at the current
3322 	 * moment in time, instead of some random moment in the past. Being
3323 	 * placed in the past could significantly boost this task to the
3324 	 * fairness detriment of existing tasks.
3325 	 */
3326 	if (renorm && !curr)
3327 		se->vruntime += cfs_rq->min_vruntime;
3328 
3329 	enqueue_entity_load_avg(cfs_rq, se);
3330 	account_entity_enqueue(cfs_rq, se);
3331 	update_cfs_shares(cfs_rq);
3332 
3333 	if (flags & ENQUEUE_WAKEUP) {
3334 		place_entity(cfs_rq, se, 0);
3335 		if (schedstat_enabled())
3336 			enqueue_sleeper(cfs_rq, se);
3337 	}
3338 
3339 	check_schedstat_required();
3340 	if (schedstat_enabled()) {
3341 		update_stats_enqueue(cfs_rq, se);
3342 		check_spread(cfs_rq, se);
3343 	}
3344 	if (!curr)
3345 		__enqueue_entity(cfs_rq, se);
3346 	se->on_rq = 1;
3347 
3348 	if (cfs_rq->nr_running == 1) {
3349 		list_add_leaf_cfs_rq(cfs_rq);
3350 		check_enqueue_throttle(cfs_rq);
3351 	}
3352 }
3353 
3354 static void __clear_buddies_last(struct sched_entity *se)
3355 {
3356 	for_each_sched_entity(se) {
3357 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3358 		if (cfs_rq->last != se)
3359 			break;
3360 
3361 		cfs_rq->last = NULL;
3362 	}
3363 }
3364 
3365 static void __clear_buddies_next(struct sched_entity *se)
3366 {
3367 	for_each_sched_entity(se) {
3368 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3369 		if (cfs_rq->next != se)
3370 			break;
3371 
3372 		cfs_rq->next = NULL;
3373 	}
3374 }
3375 
3376 static void __clear_buddies_skip(struct sched_entity *se)
3377 {
3378 	for_each_sched_entity(se) {
3379 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3380 		if (cfs_rq->skip != se)
3381 			break;
3382 
3383 		cfs_rq->skip = NULL;
3384 	}
3385 }
3386 
3387 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3388 {
3389 	if (cfs_rq->last == se)
3390 		__clear_buddies_last(se);
3391 
3392 	if (cfs_rq->next == se)
3393 		__clear_buddies_next(se);
3394 
3395 	if (cfs_rq->skip == se)
3396 		__clear_buddies_skip(se);
3397 }
3398 
3399 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3400 
3401 static void
3402 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3403 {
3404 	/*
3405 	 * Update run-time statistics of the 'current'.
3406 	 */
3407 	update_curr(cfs_rq);
3408 	dequeue_entity_load_avg(cfs_rq, se);
3409 
3410 	if (schedstat_enabled())
3411 		update_stats_dequeue(cfs_rq, se, flags);
3412 
3413 	clear_buddies(cfs_rq, se);
3414 
3415 	if (se != cfs_rq->curr)
3416 		__dequeue_entity(cfs_rq, se);
3417 	se->on_rq = 0;
3418 	account_entity_dequeue(cfs_rq, se);
3419 
3420 	/*
3421 	 * Normalize the entity after updating the min_vruntime because the
3422 	 * update can refer to the ->curr item and we need to reflect this
3423 	 * movement in our normalized position.
3424 	 */
3425 	if (!(flags & DEQUEUE_SLEEP))
3426 		se->vruntime -= cfs_rq->min_vruntime;
3427 
3428 	/* return excess runtime on last dequeue */
3429 	return_cfs_rq_runtime(cfs_rq);
3430 
3431 	update_min_vruntime(cfs_rq);
3432 	update_cfs_shares(cfs_rq);
3433 }
3434 
3435 /*
3436  * Preempt the current task with a newly woken task if needed:
3437  */
3438 static void
3439 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3440 {
3441 	unsigned long ideal_runtime, delta_exec;
3442 	struct sched_entity *se;
3443 	s64 delta;
3444 
3445 	ideal_runtime = sched_slice(cfs_rq, curr);
3446 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
3447 	if (delta_exec > ideal_runtime) {
3448 		resched_curr(rq_of(cfs_rq));
3449 		/*
3450 		 * The current task ran long enough, ensure it doesn't get
3451 		 * re-elected due to buddy favours.
3452 		 */
3453 		clear_buddies(cfs_rq, curr);
3454 		return;
3455 	}
3456 
3457 	/*
3458 	 * Ensure that a task that missed wakeup preemption by a
3459 	 * narrow margin doesn't have to wait for a full slice.
3460 	 * This also mitigates buddy induced latencies under load.
3461 	 */
3462 	if (delta_exec < sysctl_sched_min_granularity)
3463 		return;
3464 
3465 	se = __pick_first_entity(cfs_rq);
3466 	delta = curr->vruntime - se->vruntime;
3467 
3468 	if (delta < 0)
3469 		return;
3470 
3471 	if (delta > ideal_runtime)
3472 		resched_curr(rq_of(cfs_rq));
3473 }
3474 
3475 static void
3476 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3477 {
3478 	/* 'current' is not kept within the tree. */
3479 	if (se->on_rq) {
3480 		/*
3481 		 * Any task has to be enqueued before it get to execute on
3482 		 * a CPU. So account for the time it spent waiting on the
3483 		 * runqueue.
3484 		 */
3485 		if (schedstat_enabled())
3486 			update_stats_wait_end(cfs_rq, se);
3487 		__dequeue_entity(cfs_rq, se);
3488 		update_load_avg(se, 1);
3489 	}
3490 
3491 	update_stats_curr_start(cfs_rq, se);
3492 	cfs_rq->curr = se;
3493 #ifdef CONFIG_SCHEDSTATS
3494 	/*
3495 	 * Track our maximum slice length, if the CPU's load is at
3496 	 * least twice that of our own weight (i.e. dont track it
3497 	 * when there are only lesser-weight tasks around):
3498 	 */
3499 	if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3500 		se->statistics.slice_max = max(se->statistics.slice_max,
3501 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
3502 	}
3503 #endif
3504 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
3505 }
3506 
3507 static int
3508 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3509 
3510 /*
3511  * Pick the next process, keeping these things in mind, in this order:
3512  * 1) keep things fair between processes/task groups
3513  * 2) pick the "next" process, since someone really wants that to run
3514  * 3) pick the "last" process, for cache locality
3515  * 4) do not run the "skip" process, if something else is available
3516  */
3517 static struct sched_entity *
3518 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3519 {
3520 	struct sched_entity *left = __pick_first_entity(cfs_rq);
3521 	struct sched_entity *se;
3522 
3523 	/*
3524 	 * If curr is set we have to see if its left of the leftmost entity
3525 	 * still in the tree, provided there was anything in the tree at all.
3526 	 */
3527 	if (!left || (curr && entity_before(curr, left)))
3528 		left = curr;
3529 
3530 	se = left; /* ideally we run the leftmost entity */
3531 
3532 	/*
3533 	 * Avoid running the skip buddy, if running something else can
3534 	 * be done without getting too unfair.
3535 	 */
3536 	if (cfs_rq->skip == se) {
3537 		struct sched_entity *second;
3538 
3539 		if (se == curr) {
3540 			second = __pick_first_entity(cfs_rq);
3541 		} else {
3542 			second = __pick_next_entity(se);
3543 			if (!second || (curr && entity_before(curr, second)))
3544 				second = curr;
3545 		}
3546 
3547 		if (second && wakeup_preempt_entity(second, left) < 1)
3548 			se = second;
3549 	}
3550 
3551 	/*
3552 	 * Prefer last buddy, try to return the CPU to a preempted task.
3553 	 */
3554 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3555 		se = cfs_rq->last;
3556 
3557 	/*
3558 	 * Someone really wants this to run. If it's not unfair, run it.
3559 	 */
3560 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3561 		se = cfs_rq->next;
3562 
3563 	clear_buddies(cfs_rq, se);
3564 
3565 	return se;
3566 }
3567 
3568 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3569 
3570 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3571 {
3572 	/*
3573 	 * If still on the runqueue then deactivate_task()
3574 	 * was not called and update_curr() has to be done:
3575 	 */
3576 	if (prev->on_rq)
3577 		update_curr(cfs_rq);
3578 
3579 	/* throttle cfs_rqs exceeding runtime */
3580 	check_cfs_rq_runtime(cfs_rq);
3581 
3582 	if (schedstat_enabled()) {
3583 		check_spread(cfs_rq, prev);
3584 		if (prev->on_rq)
3585 			update_stats_wait_start(cfs_rq, prev);
3586 	}
3587 
3588 	if (prev->on_rq) {
3589 		/* Put 'current' back into the tree. */
3590 		__enqueue_entity(cfs_rq, prev);
3591 		/* in !on_rq case, update occurred at dequeue */
3592 		update_load_avg(prev, 0);
3593 	}
3594 	cfs_rq->curr = NULL;
3595 }
3596 
3597 static void
3598 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3599 {
3600 	/*
3601 	 * Update run-time statistics of the 'current'.
3602 	 */
3603 	update_curr(cfs_rq);
3604 
3605 	/*
3606 	 * Ensure that runnable average is periodically updated.
3607 	 */
3608 	update_load_avg(curr, 1);
3609 	update_cfs_shares(cfs_rq);
3610 
3611 #ifdef CONFIG_SCHED_HRTICK
3612 	/*
3613 	 * queued ticks are scheduled to match the slice, so don't bother
3614 	 * validating it and just reschedule.
3615 	 */
3616 	if (queued) {
3617 		resched_curr(rq_of(cfs_rq));
3618 		return;
3619 	}
3620 	/*
3621 	 * don't let the period tick interfere with the hrtick preemption
3622 	 */
3623 	if (!sched_feat(DOUBLE_TICK) &&
3624 			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3625 		return;
3626 #endif
3627 
3628 	if (cfs_rq->nr_running > 1)
3629 		check_preempt_tick(cfs_rq, curr);
3630 }
3631 
3632 
3633 /**************************************************
3634  * CFS bandwidth control machinery
3635  */
3636 
3637 #ifdef CONFIG_CFS_BANDWIDTH
3638 
3639 #ifdef HAVE_JUMP_LABEL
3640 static struct static_key __cfs_bandwidth_used;
3641 
3642 static inline bool cfs_bandwidth_used(void)
3643 {
3644 	return static_key_false(&__cfs_bandwidth_used);
3645 }
3646 
3647 void cfs_bandwidth_usage_inc(void)
3648 {
3649 	static_key_slow_inc(&__cfs_bandwidth_used);
3650 }
3651 
3652 void cfs_bandwidth_usage_dec(void)
3653 {
3654 	static_key_slow_dec(&__cfs_bandwidth_used);
3655 }
3656 #else /* HAVE_JUMP_LABEL */
3657 static bool cfs_bandwidth_used(void)
3658 {
3659 	return true;
3660 }
3661 
3662 void cfs_bandwidth_usage_inc(void) {}
3663 void cfs_bandwidth_usage_dec(void) {}
3664 #endif /* HAVE_JUMP_LABEL */
3665 
3666 /*
3667  * default period for cfs group bandwidth.
3668  * default: 0.1s, units: nanoseconds
3669  */
3670 static inline u64 default_cfs_period(void)
3671 {
3672 	return 100000000ULL;
3673 }
3674 
3675 static inline u64 sched_cfs_bandwidth_slice(void)
3676 {
3677 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3678 }
3679 
3680 /*
3681  * Replenish runtime according to assigned quota and update expiration time.
3682  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
3683  * additional synchronization around rq->lock.
3684  *
3685  * requires cfs_b->lock
3686  */
3687 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
3688 {
3689 	u64 now;
3690 
3691 	if (cfs_b->quota == RUNTIME_INF)
3692 		return;
3693 
3694 	now = sched_clock_cpu(smp_processor_id());
3695 	cfs_b->runtime = cfs_b->quota;
3696 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3697 }
3698 
3699 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3700 {
3701 	return &tg->cfs_bandwidth;
3702 }
3703 
3704 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
3705 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3706 {
3707 	if (unlikely(cfs_rq->throttle_count))
3708 		return cfs_rq->throttled_clock_task;
3709 
3710 	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3711 }
3712 
3713 /* returns 0 on failure to allocate runtime */
3714 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3715 {
3716 	struct task_group *tg = cfs_rq->tg;
3717 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
3718 	u64 amount = 0, min_amount, expires;
3719 
3720 	/* note: this is a positive sum as runtime_remaining <= 0 */
3721 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3722 
3723 	raw_spin_lock(&cfs_b->lock);
3724 	if (cfs_b->quota == RUNTIME_INF)
3725 		amount = min_amount;
3726 	else {
3727 		start_cfs_bandwidth(cfs_b);
3728 
3729 		if (cfs_b->runtime > 0) {
3730 			amount = min(cfs_b->runtime, min_amount);
3731 			cfs_b->runtime -= amount;
3732 			cfs_b->idle = 0;
3733 		}
3734 	}
3735 	expires = cfs_b->runtime_expires;
3736 	raw_spin_unlock(&cfs_b->lock);
3737 
3738 	cfs_rq->runtime_remaining += amount;
3739 	/*
3740 	 * we may have advanced our local expiration to account for allowed
3741 	 * spread between our sched_clock and the one on which runtime was
3742 	 * issued.
3743 	 */
3744 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3745 		cfs_rq->runtime_expires = expires;
3746 
3747 	return cfs_rq->runtime_remaining > 0;
3748 }
3749 
3750 /*
3751  * Note: This depends on the synchronization provided by sched_clock and the
3752  * fact that rq->clock snapshots this value.
3753  */
3754 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3755 {
3756 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3757 
3758 	/* if the deadline is ahead of our clock, nothing to do */
3759 	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
3760 		return;
3761 
3762 	if (cfs_rq->runtime_remaining < 0)
3763 		return;
3764 
3765 	/*
3766 	 * If the local deadline has passed we have to consider the
3767 	 * possibility that our sched_clock is 'fast' and the global deadline
3768 	 * has not truly expired.
3769 	 *
3770 	 * Fortunately we can check determine whether this the case by checking
3771 	 * whether the global deadline has advanced. It is valid to compare
3772 	 * cfs_b->runtime_expires without any locks since we only care about
3773 	 * exact equality, so a partial write will still work.
3774 	 */
3775 
3776 	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
3777 		/* extend local deadline, drift is bounded above by 2 ticks */
3778 		cfs_rq->runtime_expires += TICK_NSEC;
3779 	} else {
3780 		/* global deadline is ahead, expiration has passed */
3781 		cfs_rq->runtime_remaining = 0;
3782 	}
3783 }
3784 
3785 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3786 {
3787 	/* dock delta_exec before expiring quota (as it could span periods) */
3788 	cfs_rq->runtime_remaining -= delta_exec;
3789 	expire_cfs_rq_runtime(cfs_rq);
3790 
3791 	if (likely(cfs_rq->runtime_remaining > 0))
3792 		return;
3793 
3794 	/*
3795 	 * if we're unable to extend our runtime we resched so that the active
3796 	 * hierarchy can be throttled
3797 	 */
3798 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3799 		resched_curr(rq_of(cfs_rq));
3800 }
3801 
3802 static __always_inline
3803 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3804 {
3805 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3806 		return;
3807 
3808 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
3809 }
3810 
3811 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3812 {
3813 	return cfs_bandwidth_used() && cfs_rq->throttled;
3814 }
3815 
3816 /* check whether cfs_rq, or any parent, is throttled */
3817 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3818 {
3819 	return cfs_bandwidth_used() && cfs_rq->throttle_count;
3820 }
3821 
3822 /*
3823  * Ensure that neither of the group entities corresponding to src_cpu or
3824  * dest_cpu are members of a throttled hierarchy when performing group
3825  * load-balance operations.
3826  */
3827 static inline int throttled_lb_pair(struct task_group *tg,
3828 				    int src_cpu, int dest_cpu)
3829 {
3830 	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3831 
3832 	src_cfs_rq = tg->cfs_rq[src_cpu];
3833 	dest_cfs_rq = tg->cfs_rq[dest_cpu];
3834 
3835 	return throttled_hierarchy(src_cfs_rq) ||
3836 	       throttled_hierarchy(dest_cfs_rq);
3837 }
3838 
3839 /* updated child weight may affect parent so we have to do this bottom up */
3840 static int tg_unthrottle_up(struct task_group *tg, void *data)
3841 {
3842 	struct rq *rq = data;
3843 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3844 
3845 	cfs_rq->throttle_count--;
3846 #ifdef CONFIG_SMP
3847 	if (!cfs_rq->throttle_count) {
3848 		/* adjust cfs_rq_clock_task() */
3849 		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3850 					     cfs_rq->throttled_clock_task;
3851 	}
3852 #endif
3853 
3854 	return 0;
3855 }
3856 
3857 static int tg_throttle_down(struct task_group *tg, void *data)
3858 {
3859 	struct rq *rq = data;
3860 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3861 
3862 	/* group is entering throttled state, stop time */
3863 	if (!cfs_rq->throttle_count)
3864 		cfs_rq->throttled_clock_task = rq_clock_task(rq);
3865 	cfs_rq->throttle_count++;
3866 
3867 	return 0;
3868 }
3869 
3870 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3871 {
3872 	struct rq *rq = rq_of(cfs_rq);
3873 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3874 	struct sched_entity *se;
3875 	long task_delta, dequeue = 1;
3876 	bool empty;
3877 
3878 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3879 
3880 	/* freeze hierarchy runnable averages while throttled */
3881 	rcu_read_lock();
3882 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3883 	rcu_read_unlock();
3884 
3885 	task_delta = cfs_rq->h_nr_running;
3886 	for_each_sched_entity(se) {
3887 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3888 		/* throttled entity or throttle-on-deactivate */
3889 		if (!se->on_rq)
3890 			break;
3891 
3892 		if (dequeue)
3893 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3894 		qcfs_rq->h_nr_running -= task_delta;
3895 
3896 		if (qcfs_rq->load.weight)
3897 			dequeue = 0;
3898 	}
3899 
3900 	if (!se)
3901 		sub_nr_running(rq, task_delta);
3902 
3903 	cfs_rq->throttled = 1;
3904 	cfs_rq->throttled_clock = rq_clock(rq);
3905 	raw_spin_lock(&cfs_b->lock);
3906 	empty = list_empty(&cfs_b->throttled_cfs_rq);
3907 
3908 	/*
3909 	 * Add to the _head_ of the list, so that an already-started
3910 	 * distribute_cfs_runtime will not see us
3911 	 */
3912 	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3913 
3914 	/*
3915 	 * If we're the first throttled task, make sure the bandwidth
3916 	 * timer is running.
3917 	 */
3918 	if (empty)
3919 		start_cfs_bandwidth(cfs_b);
3920 
3921 	raw_spin_unlock(&cfs_b->lock);
3922 }
3923 
3924 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3925 {
3926 	struct rq *rq = rq_of(cfs_rq);
3927 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3928 	struct sched_entity *se;
3929 	int enqueue = 1;
3930 	long task_delta;
3931 
3932 	se = cfs_rq->tg->se[cpu_of(rq)];
3933 
3934 	cfs_rq->throttled = 0;
3935 
3936 	update_rq_clock(rq);
3937 
3938 	raw_spin_lock(&cfs_b->lock);
3939 	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
3940 	list_del_rcu(&cfs_rq->throttled_list);
3941 	raw_spin_unlock(&cfs_b->lock);
3942 
3943 	/* update hierarchical throttle state */
3944 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3945 
3946 	if (!cfs_rq->load.weight)
3947 		return;
3948 
3949 	task_delta = cfs_rq->h_nr_running;
3950 	for_each_sched_entity(se) {
3951 		if (se->on_rq)
3952 			enqueue = 0;
3953 
3954 		cfs_rq = cfs_rq_of(se);
3955 		if (enqueue)
3956 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3957 		cfs_rq->h_nr_running += task_delta;
3958 
3959 		if (cfs_rq_throttled(cfs_rq))
3960 			break;
3961 	}
3962 
3963 	if (!se)
3964 		add_nr_running(rq, task_delta);
3965 
3966 	/* determine whether we need to wake up potentially idle cpu */
3967 	if (rq->curr == rq->idle && rq->cfs.nr_running)
3968 		resched_curr(rq);
3969 }
3970 
3971 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3972 		u64 remaining, u64 expires)
3973 {
3974 	struct cfs_rq *cfs_rq;
3975 	u64 runtime;
3976 	u64 starting_runtime = remaining;
3977 
3978 	rcu_read_lock();
3979 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3980 				throttled_list) {
3981 		struct rq *rq = rq_of(cfs_rq);
3982 
3983 		raw_spin_lock(&rq->lock);
3984 		if (!cfs_rq_throttled(cfs_rq))
3985 			goto next;
3986 
3987 		runtime = -cfs_rq->runtime_remaining + 1;
3988 		if (runtime > remaining)
3989 			runtime = remaining;
3990 		remaining -= runtime;
3991 
3992 		cfs_rq->runtime_remaining += runtime;
3993 		cfs_rq->runtime_expires = expires;
3994 
3995 		/* we check whether we're throttled above */
3996 		if (cfs_rq->runtime_remaining > 0)
3997 			unthrottle_cfs_rq(cfs_rq);
3998 
3999 next:
4000 		raw_spin_unlock(&rq->lock);
4001 
4002 		if (!remaining)
4003 			break;
4004 	}
4005 	rcu_read_unlock();
4006 
4007 	return starting_runtime - remaining;
4008 }
4009 
4010 /*
4011  * Responsible for refilling a task_group's bandwidth and unthrottling its
4012  * cfs_rqs as appropriate. If there has been no activity within the last
4013  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
4014  * used to track this state.
4015  */
4016 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
4017 {
4018 	u64 runtime, runtime_expires;
4019 	int throttled;
4020 
4021 	/* no need to continue the timer with no bandwidth constraint */
4022 	if (cfs_b->quota == RUNTIME_INF)
4023 		goto out_deactivate;
4024 
4025 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4026 	cfs_b->nr_periods += overrun;
4027 
4028 	/*
4029 	 * idle depends on !throttled (for the case of a large deficit), and if
4030 	 * we're going inactive then everything else can be deferred
4031 	 */
4032 	if (cfs_b->idle && !throttled)
4033 		goto out_deactivate;
4034 
4035 	__refill_cfs_bandwidth_runtime(cfs_b);
4036 
4037 	if (!throttled) {
4038 		/* mark as potentially idle for the upcoming period */
4039 		cfs_b->idle = 1;
4040 		return 0;
4041 	}
4042 
4043 	/* account preceding periods in which throttling occurred */
4044 	cfs_b->nr_throttled += overrun;
4045 
4046 	runtime_expires = cfs_b->runtime_expires;
4047 
4048 	/*
4049 	 * This check is repeated as we are holding onto the new bandwidth while
4050 	 * we unthrottle. This can potentially race with an unthrottled group
4051 	 * trying to acquire new bandwidth from the global pool. This can result
4052 	 * in us over-using our runtime if it is all used during this loop, but
4053 	 * only by limited amounts in that extreme case.
4054 	 */
4055 	while (throttled && cfs_b->runtime > 0) {
4056 		runtime = cfs_b->runtime;
4057 		raw_spin_unlock(&cfs_b->lock);
4058 		/* we can't nest cfs_b->lock while distributing bandwidth */
4059 		runtime = distribute_cfs_runtime(cfs_b, runtime,
4060 						 runtime_expires);
4061 		raw_spin_lock(&cfs_b->lock);
4062 
4063 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4064 
4065 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
4066 	}
4067 
4068 	/*
4069 	 * While we are ensured activity in the period following an
4070 	 * unthrottle, this also covers the case in which the new bandwidth is
4071 	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
4072 	 * timer to remain active while there are any throttled entities.)
4073 	 */
4074 	cfs_b->idle = 0;
4075 
4076 	return 0;
4077 
4078 out_deactivate:
4079 	return 1;
4080 }
4081 
4082 /* a cfs_rq won't donate quota below this amount */
4083 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4084 /* minimum remaining period time to redistribute slack quota */
4085 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4086 /* how long we wait to gather additional slack before distributing */
4087 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4088 
4089 /*
4090  * Are we near the end of the current quota period?
4091  *
4092  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
4093  * hrtimer base being cleared by hrtimer_start. In the case of
4094  * migrate_hrtimers, base is never cleared, so we are fine.
4095  */
4096 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4097 {
4098 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
4099 	u64 remaining;
4100 
4101 	/* if the call-back is running a quota refresh is already occurring */
4102 	if (hrtimer_callback_running(refresh_timer))
4103 		return 1;
4104 
4105 	/* is a quota refresh about to occur? */
4106 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4107 	if (remaining < min_expire)
4108 		return 1;
4109 
4110 	return 0;
4111 }
4112 
4113 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4114 {
4115 	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4116 
4117 	/* if there's a quota refresh soon don't bother with slack */
4118 	if (runtime_refresh_within(cfs_b, min_left))
4119 		return;
4120 
4121 	hrtimer_start(&cfs_b->slack_timer,
4122 			ns_to_ktime(cfs_bandwidth_slack_period),
4123 			HRTIMER_MODE_REL);
4124 }
4125 
4126 /* we know any runtime found here is valid as update_curr() precedes return */
4127 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4128 {
4129 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4130 	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4131 
4132 	if (slack_runtime <= 0)
4133 		return;
4134 
4135 	raw_spin_lock(&cfs_b->lock);
4136 	if (cfs_b->quota != RUNTIME_INF &&
4137 	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4138 		cfs_b->runtime += slack_runtime;
4139 
4140 		/* we are under rq->lock, defer unthrottling using a timer */
4141 		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4142 		    !list_empty(&cfs_b->throttled_cfs_rq))
4143 			start_cfs_slack_bandwidth(cfs_b);
4144 	}
4145 	raw_spin_unlock(&cfs_b->lock);
4146 
4147 	/* even if it's not valid for return we don't want to try again */
4148 	cfs_rq->runtime_remaining -= slack_runtime;
4149 }
4150 
4151 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4152 {
4153 	if (!cfs_bandwidth_used())
4154 		return;
4155 
4156 	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
4157 		return;
4158 
4159 	__return_cfs_rq_runtime(cfs_rq);
4160 }
4161 
4162 /*
4163  * This is done with a timer (instead of inline with bandwidth return) since
4164  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
4165  */
4166 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4167 {
4168 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4169 	u64 expires;
4170 
4171 	/* confirm we're still not at a refresh boundary */
4172 	raw_spin_lock(&cfs_b->lock);
4173 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4174 		raw_spin_unlock(&cfs_b->lock);
4175 		return;
4176 	}
4177 
4178 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4179 		runtime = cfs_b->runtime;
4180 
4181 	expires = cfs_b->runtime_expires;
4182 	raw_spin_unlock(&cfs_b->lock);
4183 
4184 	if (!runtime)
4185 		return;
4186 
4187 	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4188 
4189 	raw_spin_lock(&cfs_b->lock);
4190 	if (expires == cfs_b->runtime_expires)
4191 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
4192 	raw_spin_unlock(&cfs_b->lock);
4193 }
4194 
4195 /*
4196  * When a group wakes up we want to make sure that its quota is not already
4197  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4198  * runtime as update_curr() throttling can not not trigger until it's on-rq.
4199  */
4200 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4201 {
4202 	if (!cfs_bandwidth_used())
4203 		return;
4204 
4205 	/* Synchronize hierarchical throttle counter: */
4206 	if (unlikely(!cfs_rq->throttle_uptodate)) {
4207 		struct rq *rq = rq_of(cfs_rq);
4208 		struct cfs_rq *pcfs_rq;
4209 		struct task_group *tg;
4210 
4211 		cfs_rq->throttle_uptodate = 1;
4212 
4213 		/* Get closest up-to-date node, because leaves go first: */
4214 		for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
4215 			pcfs_rq = tg->cfs_rq[cpu_of(rq)];
4216 			if (pcfs_rq->throttle_uptodate)
4217 				break;
4218 		}
4219 		if (tg) {
4220 			cfs_rq->throttle_count = pcfs_rq->throttle_count;
4221 			cfs_rq->throttled_clock_task = rq_clock_task(rq);
4222 		}
4223 	}
4224 
4225 	/* an active group must be handled by the update_curr()->put() path */
4226 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4227 		return;
4228 
4229 	/* ensure the group is not already throttled */
4230 	if (cfs_rq_throttled(cfs_rq))
4231 		return;
4232 
4233 	/* update runtime allocation */
4234 	account_cfs_rq_runtime(cfs_rq, 0);
4235 	if (cfs_rq->runtime_remaining <= 0)
4236 		throttle_cfs_rq(cfs_rq);
4237 }
4238 
4239 /* conditionally throttle active cfs_rq's from put_prev_entity() */
4240 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4241 {
4242 	if (!cfs_bandwidth_used())
4243 		return false;
4244 
4245 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4246 		return false;
4247 
4248 	/*
4249 	 * it's possible for a throttled entity to be forced into a running
4250 	 * state (e.g. set_curr_task), in this case we're finished.
4251 	 */
4252 	if (cfs_rq_throttled(cfs_rq))
4253 		return true;
4254 
4255 	throttle_cfs_rq(cfs_rq);
4256 	return true;
4257 }
4258 
4259 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4260 {
4261 	struct cfs_bandwidth *cfs_b =
4262 		container_of(timer, struct cfs_bandwidth, slack_timer);
4263 
4264 	do_sched_cfs_slack_timer(cfs_b);
4265 
4266 	return HRTIMER_NORESTART;
4267 }
4268 
4269 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4270 {
4271 	struct cfs_bandwidth *cfs_b =
4272 		container_of(timer, struct cfs_bandwidth, period_timer);
4273 	int overrun;
4274 	int idle = 0;
4275 
4276 	raw_spin_lock(&cfs_b->lock);
4277 	for (;;) {
4278 		overrun = hrtimer_forward_now(timer, cfs_b->period);
4279 		if (!overrun)
4280 			break;
4281 
4282 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
4283 	}
4284 	if (idle)
4285 		cfs_b->period_active = 0;
4286 	raw_spin_unlock(&cfs_b->lock);
4287 
4288 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4289 }
4290 
4291 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4292 {
4293 	raw_spin_lock_init(&cfs_b->lock);
4294 	cfs_b->runtime = 0;
4295 	cfs_b->quota = RUNTIME_INF;
4296 	cfs_b->period = ns_to_ktime(default_cfs_period());
4297 
4298 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4299 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
4300 	cfs_b->period_timer.function = sched_cfs_period_timer;
4301 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4302 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
4303 }
4304 
4305 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4306 {
4307 	cfs_rq->runtime_enabled = 0;
4308 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
4309 }
4310 
4311 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4312 {
4313 	lockdep_assert_held(&cfs_b->lock);
4314 
4315 	if (!cfs_b->period_active) {
4316 		cfs_b->period_active = 1;
4317 		hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4318 		hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4319 	}
4320 }
4321 
4322 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4323 {
4324 	/* init_cfs_bandwidth() was not called */
4325 	if (!cfs_b->throttled_cfs_rq.next)
4326 		return;
4327 
4328 	hrtimer_cancel(&cfs_b->period_timer);
4329 	hrtimer_cancel(&cfs_b->slack_timer);
4330 }
4331 
4332 static void __maybe_unused update_runtime_enabled(struct rq *rq)
4333 {
4334 	struct cfs_rq *cfs_rq;
4335 
4336 	for_each_leaf_cfs_rq(rq, cfs_rq) {
4337 		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
4338 
4339 		raw_spin_lock(&cfs_b->lock);
4340 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4341 		raw_spin_unlock(&cfs_b->lock);
4342 	}
4343 }
4344 
4345 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4346 {
4347 	struct cfs_rq *cfs_rq;
4348 
4349 	for_each_leaf_cfs_rq(rq, cfs_rq) {
4350 		if (!cfs_rq->runtime_enabled)
4351 			continue;
4352 
4353 		/*
4354 		 * clock_task is not advancing so we just need to make sure
4355 		 * there's some valid quota amount
4356 		 */
4357 		cfs_rq->runtime_remaining = 1;
4358 		/*
4359 		 * Offline rq is schedulable till cpu is completely disabled
4360 		 * in take_cpu_down(), so we prevent new cfs throttling here.
4361 		 */
4362 		cfs_rq->runtime_enabled = 0;
4363 
4364 		if (cfs_rq_throttled(cfs_rq))
4365 			unthrottle_cfs_rq(cfs_rq);
4366 	}
4367 }
4368 
4369 #else /* CONFIG_CFS_BANDWIDTH */
4370 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4371 {
4372 	return rq_clock_task(rq_of(cfs_rq));
4373 }
4374 
4375 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4376 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4377 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4378 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4379 
4380 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4381 {
4382 	return 0;
4383 }
4384 
4385 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4386 {
4387 	return 0;
4388 }
4389 
4390 static inline int throttled_lb_pair(struct task_group *tg,
4391 				    int src_cpu, int dest_cpu)
4392 {
4393 	return 0;
4394 }
4395 
4396 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4397 
4398 #ifdef CONFIG_FAIR_GROUP_SCHED
4399 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4400 #endif
4401 
4402 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4403 {
4404 	return NULL;
4405 }
4406 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4407 static inline void update_runtime_enabled(struct rq *rq) {}
4408 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
4409 
4410 #endif /* CONFIG_CFS_BANDWIDTH */
4411 
4412 /**************************************************
4413  * CFS operations on tasks:
4414  */
4415 
4416 #ifdef CONFIG_SCHED_HRTICK
4417 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4418 {
4419 	struct sched_entity *se = &p->se;
4420 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4421 
4422 	WARN_ON(task_rq(p) != rq);
4423 
4424 	if (cfs_rq->nr_running > 1) {
4425 		u64 slice = sched_slice(cfs_rq, se);
4426 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4427 		s64 delta = slice - ran;
4428 
4429 		if (delta < 0) {
4430 			if (rq->curr == p)
4431 				resched_curr(rq);
4432 			return;
4433 		}
4434 		hrtick_start(rq, delta);
4435 	}
4436 }
4437 
4438 /*
4439  * called from enqueue/dequeue and updates the hrtick when the
4440  * current task is from our class and nr_running is low enough
4441  * to matter.
4442  */
4443 static void hrtick_update(struct rq *rq)
4444 {
4445 	struct task_struct *curr = rq->curr;
4446 
4447 	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
4448 		return;
4449 
4450 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4451 		hrtick_start_fair(rq, curr);
4452 }
4453 #else /* !CONFIG_SCHED_HRTICK */
4454 static inline void
4455 hrtick_start_fair(struct rq *rq, struct task_struct *p)
4456 {
4457 }
4458 
4459 static inline void hrtick_update(struct rq *rq)
4460 {
4461 }
4462 #endif
4463 
4464 /*
4465  * The enqueue_task method is called before nr_running is
4466  * increased. Here we update the fair scheduling stats and
4467  * then put the task into the rbtree:
4468  */
4469 static void
4470 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4471 {
4472 	struct cfs_rq *cfs_rq;
4473 	struct sched_entity *se = &p->se;
4474 
4475 	for_each_sched_entity(se) {
4476 		if (se->on_rq)
4477 			break;
4478 		cfs_rq = cfs_rq_of(se);
4479 		enqueue_entity(cfs_rq, se, flags);
4480 
4481 		/*
4482 		 * end evaluation on encountering a throttled cfs_rq
4483 		 *
4484 		 * note: in the case of encountering a throttled cfs_rq we will
4485 		 * post the final h_nr_running increment below.
4486 		*/
4487 		if (cfs_rq_throttled(cfs_rq))
4488 			break;
4489 		cfs_rq->h_nr_running++;
4490 
4491 		flags = ENQUEUE_WAKEUP;
4492 	}
4493 
4494 	for_each_sched_entity(se) {
4495 		cfs_rq = cfs_rq_of(se);
4496 		cfs_rq->h_nr_running++;
4497 
4498 		if (cfs_rq_throttled(cfs_rq))
4499 			break;
4500 
4501 		update_load_avg(se, 1);
4502 		update_cfs_shares(cfs_rq);
4503 	}
4504 
4505 	if (!se)
4506 		add_nr_running(rq, 1);
4507 
4508 	hrtick_update(rq);
4509 }
4510 
4511 static void set_next_buddy(struct sched_entity *se);
4512 
4513 /*
4514  * The dequeue_task method is called before nr_running is
4515  * decreased. We remove the task from the rbtree and
4516  * update the fair scheduling stats:
4517  */
4518 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4519 {
4520 	struct cfs_rq *cfs_rq;
4521 	struct sched_entity *se = &p->se;
4522 	int task_sleep = flags & DEQUEUE_SLEEP;
4523 
4524 	for_each_sched_entity(se) {
4525 		cfs_rq = cfs_rq_of(se);
4526 		dequeue_entity(cfs_rq, se, flags);
4527 
4528 		/*
4529 		 * end evaluation on encountering a throttled cfs_rq
4530 		 *
4531 		 * note: in the case of encountering a throttled cfs_rq we will
4532 		 * post the final h_nr_running decrement below.
4533 		*/
4534 		if (cfs_rq_throttled(cfs_rq))
4535 			break;
4536 		cfs_rq->h_nr_running--;
4537 
4538 		/* Don't dequeue parent if it has other entities besides us */
4539 		if (cfs_rq->load.weight) {
4540 			/* Avoid re-evaluating load for this entity: */
4541 			se = parent_entity(se);
4542 			/*
4543 			 * Bias pick_next to pick a task from this cfs_rq, as
4544 			 * p is sleeping when it is within its sched_slice.
4545 			 */
4546 			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
4547 				set_next_buddy(se);
4548 			break;
4549 		}
4550 		flags |= DEQUEUE_SLEEP;
4551 	}
4552 
4553 	for_each_sched_entity(se) {
4554 		cfs_rq = cfs_rq_of(se);
4555 		cfs_rq->h_nr_running--;
4556 
4557 		if (cfs_rq_throttled(cfs_rq))
4558 			break;
4559 
4560 		update_load_avg(se, 1);
4561 		update_cfs_shares(cfs_rq);
4562 	}
4563 
4564 	if (!se)
4565 		sub_nr_running(rq, 1);
4566 
4567 	hrtick_update(rq);
4568 }
4569 
4570 #ifdef CONFIG_SMP
4571 #ifdef CONFIG_NO_HZ_COMMON
4572 /*
4573  * per rq 'load' arrray crap; XXX kill this.
4574  */
4575 
4576 /*
4577  * The exact cpuload calculated at every tick would be:
4578  *
4579  *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
4580  *
4581  * If a cpu misses updates for n ticks (as it was idle) and update gets
4582  * called on the n+1-th tick when cpu may be busy, then we have:
4583  *
4584  *   load_n   = (1 - 1/2^i)^n * load_0
4585  *   load_n+1 = (1 - 1/2^i)   * load_n + (1/2^i) * cur_load
4586  *
4587  * decay_load_missed() below does efficient calculation of
4588  *
4589  *   load' = (1 - 1/2^i)^n * load
4590  *
4591  * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
4592  * This allows us to precompute the above in said factors, thereby allowing the
4593  * reduction of an arbitrary n in O(log_2 n) steps. (See also
4594  * fixed_power_int())
4595  *
4596  * The calculation is approximated on a 128 point scale.
4597  */
4598 #define DEGRADE_SHIFT		7
4599 
4600 static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4601 static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4602 	{   0,   0,  0,  0,  0,  0, 0, 0 },
4603 	{  64,  32,  8,  0,  0,  0, 0, 0 },
4604 	{  96,  72, 40, 12,  1,  0, 0, 0 },
4605 	{ 112,  98, 75, 43, 15,  1, 0, 0 },
4606 	{ 120, 112, 98, 76, 45, 16, 2, 0 }
4607 };
4608 
4609 /*
4610  * Update cpu_load for any missed ticks, due to tickless idle. The backlog
4611  * would be when CPU is idle and so we just decay the old load without
4612  * adding any new load.
4613  */
4614 static unsigned long
4615 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4616 {
4617 	int j = 0;
4618 
4619 	if (!missed_updates)
4620 		return load;
4621 
4622 	if (missed_updates >= degrade_zero_ticks[idx])
4623 		return 0;
4624 
4625 	if (idx == 1)
4626 		return load >> missed_updates;
4627 
4628 	while (missed_updates) {
4629 		if (missed_updates % 2)
4630 			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
4631 
4632 		missed_updates >>= 1;
4633 		j++;
4634 	}
4635 	return load;
4636 }
4637 #endif /* CONFIG_NO_HZ_COMMON */
4638 
4639 /**
4640  * __cpu_load_update - update the rq->cpu_load[] statistics
4641  * @this_rq: The rq to update statistics for
4642  * @this_load: The current load
4643  * @pending_updates: The number of missed updates
4644  *
4645  * Update rq->cpu_load[] statistics. This function is usually called every
4646  * scheduler tick (TICK_NSEC).
4647  *
4648  * This function computes a decaying average:
4649  *
4650  *   load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
4651  *
4652  * Because of NOHZ it might not get called on every tick which gives need for
4653  * the @pending_updates argument.
4654  *
4655  *   load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
4656  *             = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
4657  *             = A * (A * load[i]_n-2 + B) + B
4658  *             = A * (A * (A * load[i]_n-3 + B) + B) + B
4659  *             = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
4660  *             = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
4661  *             = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
4662  *             = (1 - 1/2^i)^n * (load[i]_0 - load) + load
4663  *
4664  * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
4665  * any change in load would have resulted in the tick being turned back on.
4666  *
4667  * For regular NOHZ, this reduces to:
4668  *
4669  *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
4670  *
4671  * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
4672  * term.
4673  */
4674 static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
4675 			    unsigned long pending_updates)
4676 {
4677 	unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
4678 	int i, scale;
4679 
4680 	this_rq->nr_load_updates++;
4681 
4682 	/* Update our load: */
4683 	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
4684 	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
4685 		unsigned long old_load, new_load;
4686 
4687 		/* scale is effectively 1 << i now, and >> i divides by scale */
4688 
4689 		old_load = this_rq->cpu_load[i];
4690 #ifdef CONFIG_NO_HZ_COMMON
4691 		old_load = decay_load_missed(old_load, pending_updates - 1, i);
4692 		if (tickless_load) {
4693 			old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
4694 			/*
4695 			 * old_load can never be a negative value because a
4696 			 * decayed tickless_load cannot be greater than the
4697 			 * original tickless_load.
4698 			 */
4699 			old_load += tickless_load;
4700 		}
4701 #endif
4702 		new_load = this_load;
4703 		/*
4704 		 * Round up the averaging division if load is increasing. This
4705 		 * prevents us from getting stuck on 9 if the load is 10, for
4706 		 * example.
4707 		 */
4708 		if (new_load > old_load)
4709 			new_load += scale - 1;
4710 
4711 		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
4712 	}
4713 
4714 	sched_avg_update(this_rq);
4715 }
4716 
4717 /* Used instead of source_load when we know the type == 0 */
4718 static unsigned long weighted_cpuload(const int cpu)
4719 {
4720 	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
4721 }
4722 
4723 #ifdef CONFIG_NO_HZ_COMMON
4724 /*
4725  * There is no sane way to deal with nohz on smp when using jiffies because the
4726  * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
4727  * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
4728  *
4729  * Therefore we need to avoid the delta approach from the regular tick when
4730  * possible since that would seriously skew the load calculation. This is why we
4731  * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
4732  * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
4733  * loop exit, nohz_idle_balance, nohz full exit...)
4734  *
4735  * This means we might still be one tick off for nohz periods.
4736  */
4737 
4738 static void cpu_load_update_nohz(struct rq *this_rq,
4739 				 unsigned long curr_jiffies,
4740 				 unsigned long load)
4741 {
4742 	unsigned long pending_updates;
4743 
4744 	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4745 	if (pending_updates) {
4746 		this_rq->last_load_update_tick = curr_jiffies;
4747 		/*
4748 		 * In the regular NOHZ case, we were idle, this means load 0.
4749 		 * In the NOHZ_FULL case, we were non-idle, we should consider
4750 		 * its weighted load.
4751 		 */
4752 		cpu_load_update(this_rq, load, pending_updates);
4753 	}
4754 }
4755 
4756 /*
4757  * Called from nohz_idle_balance() to update the load ratings before doing the
4758  * idle balance.
4759  */
4760 static void cpu_load_update_idle(struct rq *this_rq)
4761 {
4762 	/*
4763 	 * bail if there's load or we're actually up-to-date.
4764 	 */
4765 	if (weighted_cpuload(cpu_of(this_rq)))
4766 		return;
4767 
4768 	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
4769 }
4770 
4771 /*
4772  * Record CPU load on nohz entry so we know the tickless load to account
4773  * on nohz exit. cpu_load[0] happens then to be updated more frequently
4774  * than other cpu_load[idx] but it should be fine as cpu_load readers
4775  * shouldn't rely into synchronized cpu_load[*] updates.
4776  */
4777 void cpu_load_update_nohz_start(void)
4778 {
4779 	struct rq *this_rq = this_rq();
4780 
4781 	/*
4782 	 * This is all lockless but should be fine. If weighted_cpuload changes
4783 	 * concurrently we'll exit nohz. And cpu_load write can race with
4784 	 * cpu_load_update_idle() but both updater would be writing the same.
4785 	 */
4786 	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
4787 }
4788 
4789 /*
4790  * Account the tickless load in the end of a nohz frame.
4791  */
4792 void cpu_load_update_nohz_stop(void)
4793 {
4794 	unsigned long curr_jiffies = READ_ONCE(jiffies);
4795 	struct rq *this_rq = this_rq();
4796 	unsigned long load;
4797 
4798 	if (curr_jiffies == this_rq->last_load_update_tick)
4799 		return;
4800 
4801 	load = weighted_cpuload(cpu_of(this_rq));
4802 	raw_spin_lock(&this_rq->lock);
4803 	update_rq_clock(this_rq);
4804 	cpu_load_update_nohz(this_rq, curr_jiffies, load);
4805 	raw_spin_unlock(&this_rq->lock);
4806 }
4807 #else /* !CONFIG_NO_HZ_COMMON */
4808 static inline void cpu_load_update_nohz(struct rq *this_rq,
4809 					unsigned long curr_jiffies,
4810 					unsigned long load) { }
4811 #endif /* CONFIG_NO_HZ_COMMON */
4812 
4813 static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
4814 {
4815 #ifdef CONFIG_NO_HZ_COMMON
4816 	/* See the mess around cpu_load_update_nohz(). */
4817 	this_rq->last_load_update_tick = READ_ONCE(jiffies);
4818 #endif
4819 	cpu_load_update(this_rq, load, 1);
4820 }
4821 
4822 /*
4823  * Called from scheduler_tick()
4824  */
4825 void cpu_load_update_active(struct rq *this_rq)
4826 {
4827 	unsigned long load = weighted_cpuload(cpu_of(this_rq));
4828 
4829 	if (tick_nohz_tick_stopped())
4830 		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
4831 	else
4832 		cpu_load_update_periodic(this_rq, load);
4833 }
4834 
4835 /*
4836  * Return a low guess at the load of a migration-source cpu weighted
4837  * according to the scheduling class and "nice" value.
4838  *
4839  * We want to under-estimate the load of migration sources, to
4840  * balance conservatively.
4841  */
4842 static unsigned long source_load(int cpu, int type)
4843 {
4844 	struct rq *rq = cpu_rq(cpu);
4845 	unsigned long total = weighted_cpuload(cpu);
4846 
4847 	if (type == 0 || !sched_feat(LB_BIAS))
4848 		return total;
4849 
4850 	return min(rq->cpu_load[type-1], total);
4851 }
4852 
4853 /*
4854  * Return a high guess at the load of a migration-target cpu weighted
4855  * according to the scheduling class and "nice" value.
4856  */
4857 static unsigned long target_load(int cpu, int type)
4858 {
4859 	struct rq *rq = cpu_rq(cpu);
4860 	unsigned long total = weighted_cpuload(cpu);
4861 
4862 	if (type == 0 || !sched_feat(LB_BIAS))
4863 		return total;
4864 
4865 	return max(rq->cpu_load[type-1], total);
4866 }
4867 
4868 static unsigned long capacity_of(int cpu)
4869 {
4870 	return cpu_rq(cpu)->cpu_capacity;
4871 }
4872 
4873 static unsigned long capacity_orig_of(int cpu)
4874 {
4875 	return cpu_rq(cpu)->cpu_capacity_orig;
4876 }
4877 
4878 static unsigned long cpu_avg_load_per_task(int cpu)
4879 {
4880 	struct rq *rq = cpu_rq(cpu);
4881 	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4882 	unsigned long load_avg = weighted_cpuload(cpu);
4883 
4884 	if (nr_running)
4885 		return load_avg / nr_running;
4886 
4887 	return 0;
4888 }
4889 
4890 #ifdef CONFIG_FAIR_GROUP_SCHED
4891 /*
4892  * effective_load() calculates the load change as seen from the root_task_group
4893  *
4894  * Adding load to a group doesn't make a group heavier, but can cause movement
4895  * of group shares between cpus. Assuming the shares were perfectly aligned one
4896  * can calculate the shift in shares.
4897  *
4898  * Calculate the effective load difference if @wl is added (subtracted) to @tg
4899  * on this @cpu and results in a total addition (subtraction) of @wg to the
4900  * total group weight.
4901  *
4902  * Given a runqueue weight distribution (rw_i) we can compute a shares
4903  * distribution (s_i) using:
4904  *
4905  *   s_i = rw_i / \Sum rw_j						(1)
4906  *
4907  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4908  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4909  * shares distribution (s_i):
4910  *
4911  *   rw_i = {   2,   4,   1,   0 }
4912  *   s_i  = { 2/7, 4/7, 1/7,   0 }
4913  *
4914  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
4915  * task used to run on and the CPU the waker is running on), we need to
4916  * compute the effect of waking a task on either CPU and, in case of a sync
4917  * wakeup, compute the effect of the current task going to sleep.
4918  *
4919  * So for a change of @wl to the local @cpu with an overall group weight change
4920  * of @wl we can compute the new shares distribution (s'_i) using:
4921  *
4922  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
4923  *
4924  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
4925  * differences in waking a task to CPU 0. The additional task changes the
4926  * weight and shares distributions like:
4927  *
4928  *   rw'_i = {   3,   4,   1,   0 }
4929  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
4930  *
4931  * We can then compute the difference in effective weight by using:
4932  *
4933  *   dw_i = S * (s'_i - s_i)						(3)
4934  *
4935  * Where 'S' is the group weight as seen by its parent.
4936  *
4937  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4938  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4939  * 4/7) times the weight of the group.
4940  */
4941 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4942 {
4943 	struct sched_entity *se = tg->se[cpu];
4944 
4945 	if (!tg->parent)	/* the trivial, non-cgroup case */
4946 		return wl;
4947 
4948 	for_each_sched_entity(se) {
4949 		long w, W;
4950 
4951 		tg = se->my_q->tg;
4952 
4953 		/*
4954 		 * W = @wg + \Sum rw_j
4955 		 */
4956 		W = wg + calc_tg_weight(tg, se->my_q);
4957 
4958 		/*
4959 		 * w = rw_i + @wl
4960 		 */
4961 		w = cfs_rq_load_avg(se->my_q) + wl;
4962 
4963 		/*
4964 		 * wl = S * s'_i; see (2)
4965 		 */
4966 		if (W > 0 && w < W)
4967 			wl = (w * (long)tg->shares) / W;
4968 		else
4969 			wl = tg->shares;
4970 
4971 		/*
4972 		 * Per the above, wl is the new se->load.weight value; since
4973 		 * those are clipped to [MIN_SHARES, ...) do so now. See
4974 		 * calc_cfs_shares().
4975 		 */
4976 		if (wl < MIN_SHARES)
4977 			wl = MIN_SHARES;
4978 
4979 		/*
4980 		 * wl = dw_i = S * (s'_i - s_i); see (3)
4981 		 */
4982 		wl -= se->avg.load_avg;
4983 
4984 		/*
4985 		 * Recursively apply this logic to all parent groups to compute
4986 		 * the final effective load change on the root group. Since
4987 		 * only the @tg group gets extra weight, all parent groups can
4988 		 * only redistribute existing shares. @wl is the shift in shares
4989 		 * resulting from this level per the above.
4990 		 */
4991 		wg = 0;
4992 	}
4993 
4994 	return wl;
4995 }
4996 #else
4997 
4998 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4999 {
5000 	return wl;
5001 }
5002 
5003 #endif
5004 
5005 static void record_wakee(struct task_struct *p)
5006 {
5007 	/*
5008 	 * Only decay a single time; tasks that have less then 1 wakeup per
5009 	 * jiffy will not have built up many flips.
5010 	 */
5011 	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5012 		current->wakee_flips >>= 1;
5013 		current->wakee_flip_decay_ts = jiffies;
5014 	}
5015 
5016 	if (current->last_wakee != p) {
5017 		current->last_wakee = p;
5018 		current->wakee_flips++;
5019 	}
5020 }
5021 
5022 /*
5023  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
5024  *
5025  * A waker of many should wake a different task than the one last awakened
5026  * at a frequency roughly N times higher than one of its wakees.
5027  *
5028  * In order to determine whether we should let the load spread vs consolidating
5029  * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
5030  * partner, and a factor of lls_size higher frequency in the other.
5031  *
5032  * With both conditions met, we can be relatively sure that the relationship is
5033  * non-monogamous, with partner count exceeding socket size.
5034  *
5035  * Waker/wakee being client/server, worker/dispatcher, interrupt source or
5036  * whatever is irrelevant, spread criteria is apparent partner count exceeds
5037  * socket size.
5038  */
5039 static int wake_wide(struct task_struct *p)
5040 {
5041 	unsigned int master = current->wakee_flips;
5042 	unsigned int slave = p->wakee_flips;
5043 	int factor = this_cpu_read(sd_llc_size);
5044 
5045 	if (master < slave)
5046 		swap(master, slave);
5047 	if (slave < factor || master < slave * factor)
5048 		return 0;
5049 	return 1;
5050 }
5051 
5052 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
5053 {
5054 	s64 this_load, load;
5055 	s64 this_eff_load, prev_eff_load;
5056 	int idx, this_cpu, prev_cpu;
5057 	struct task_group *tg;
5058 	unsigned long weight;
5059 	int balanced;
5060 
5061 	idx	  = sd->wake_idx;
5062 	this_cpu  = smp_processor_id();
5063 	prev_cpu  = task_cpu(p);
5064 	load	  = source_load(prev_cpu, idx);
5065 	this_load = target_load(this_cpu, idx);
5066 
5067 	/*
5068 	 * If sync wakeup then subtract the (maximum possible)
5069 	 * effect of the currently running task from the load
5070 	 * of the current CPU:
5071 	 */
5072 	if (sync) {
5073 		tg = task_group(current);
5074 		weight = current->se.avg.load_avg;
5075 
5076 		this_load += effective_load(tg, this_cpu, -weight, -weight);
5077 		load += effective_load(tg, prev_cpu, 0, -weight);
5078 	}
5079 
5080 	tg = task_group(p);
5081 	weight = p->se.avg.load_avg;
5082 
5083 	/*
5084 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
5085 	 * due to the sync cause above having dropped this_load to 0, we'll
5086 	 * always have an imbalance, but there's really nothing you can do
5087 	 * about that, so that's good too.
5088 	 *
5089 	 * Otherwise check if either cpus are near enough in load to allow this
5090 	 * task to be woken on this_cpu.
5091 	 */
5092 	this_eff_load = 100;
5093 	this_eff_load *= capacity_of(prev_cpu);
5094 
5095 	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
5096 	prev_eff_load *= capacity_of(this_cpu);
5097 
5098 	if (this_load > 0) {
5099 		this_eff_load *= this_load +
5100 			effective_load(tg, this_cpu, weight, weight);
5101 
5102 		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
5103 	}
5104 
5105 	balanced = this_eff_load <= prev_eff_load;
5106 
5107 	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
5108 
5109 	if (!balanced)
5110 		return 0;
5111 
5112 	schedstat_inc(sd, ttwu_move_affine);
5113 	schedstat_inc(p, se.statistics.nr_wakeups_affine);
5114 
5115 	return 1;
5116 }
5117 
5118 /*
5119  * find_idlest_group finds and returns the least busy CPU group within the
5120  * domain.
5121  */
5122 static struct sched_group *
5123 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5124 		  int this_cpu, int sd_flag)
5125 {
5126 	struct sched_group *idlest = NULL, *group = sd->groups;
5127 	unsigned long min_load = ULONG_MAX, this_load = 0;
5128 	int load_idx = sd->forkexec_idx;
5129 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
5130 
5131 	if (sd_flag & SD_BALANCE_WAKE)
5132 		load_idx = sd->wake_idx;
5133 
5134 	do {
5135 		unsigned long load, avg_load;
5136 		int local_group;
5137 		int i;
5138 
5139 		/* Skip over this group if it has no CPUs allowed */
5140 		if (!cpumask_intersects(sched_group_cpus(group),
5141 					tsk_cpus_allowed(p)))
5142 			continue;
5143 
5144 		local_group = cpumask_test_cpu(this_cpu,
5145 					       sched_group_cpus(group));
5146 
5147 		/* Tally up the load of all CPUs in the group */
5148 		avg_load = 0;
5149 
5150 		for_each_cpu(i, sched_group_cpus(group)) {
5151 			/* Bias balancing toward cpus of our domain */
5152 			if (local_group)
5153 				load = source_load(i, load_idx);
5154 			else
5155 				load = target_load(i, load_idx);
5156 
5157 			avg_load += load;
5158 		}
5159 
5160 		/* Adjust by relative CPU capacity of the group */
5161 		avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
5162 
5163 		if (local_group) {
5164 			this_load = avg_load;
5165 		} else if (avg_load < min_load) {
5166 			min_load = avg_load;
5167 			idlest = group;
5168 		}
5169 	} while (group = group->next, group != sd->groups);
5170 
5171 	if (!idlest || 100*this_load < imbalance*min_load)
5172 		return NULL;
5173 	return idlest;
5174 }
5175 
5176 /*
5177  * find_idlest_cpu - find the idlest cpu among the cpus in group.
5178  */
5179 static int
5180 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5181 {
5182 	unsigned long load, min_load = ULONG_MAX;
5183 	unsigned int min_exit_latency = UINT_MAX;
5184 	u64 latest_idle_timestamp = 0;
5185 	int least_loaded_cpu = this_cpu;
5186 	int shallowest_idle_cpu = -1;
5187 	int i;
5188 
5189 	/* Traverse only the allowed CPUs */
5190 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
5191 		if (idle_cpu(i)) {
5192 			struct rq *rq = cpu_rq(i);
5193 			struct cpuidle_state *idle = idle_get_state(rq);
5194 			if (idle && idle->exit_latency < min_exit_latency) {
5195 				/*
5196 				 * We give priority to a CPU whose idle state
5197 				 * has the smallest exit latency irrespective
5198 				 * of any idle timestamp.
5199 				 */
5200 				min_exit_latency = idle->exit_latency;
5201 				latest_idle_timestamp = rq->idle_stamp;
5202 				shallowest_idle_cpu = i;
5203 			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
5204 				   rq->idle_stamp > latest_idle_timestamp) {
5205 				/*
5206 				 * If equal or no active idle state, then
5207 				 * the most recently idled CPU might have
5208 				 * a warmer cache.
5209 				 */
5210 				latest_idle_timestamp = rq->idle_stamp;
5211 				shallowest_idle_cpu = i;
5212 			}
5213 		} else if (shallowest_idle_cpu == -1) {
5214 			load = weighted_cpuload(i);
5215 			if (load < min_load || (load == min_load && i == this_cpu)) {
5216 				min_load = load;
5217 				least_loaded_cpu = i;
5218 			}
5219 		}
5220 	}
5221 
5222 	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
5223 }
5224 
5225 /*
5226  * Try and locate an idle CPU in the sched_domain.
5227  */
5228 static int select_idle_sibling(struct task_struct *p, int target)
5229 {
5230 	struct sched_domain *sd;
5231 	struct sched_group *sg;
5232 	int i = task_cpu(p);
5233 
5234 	if (idle_cpu(target))
5235 		return target;
5236 
5237 	/*
5238 	 * If the prevous cpu is cache affine and idle, don't be stupid.
5239 	 */
5240 	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
5241 		return i;
5242 
5243 	/*
5244 	 * Otherwise, iterate the domains and find an eligible idle cpu.
5245 	 *
5246 	 * A completely idle sched group at higher domains is more
5247 	 * desirable than an idle group at a lower level, because lower
5248 	 * domains have smaller groups and usually share hardware
5249 	 * resources which causes tasks to contend on them, e.g. x86
5250 	 * hyperthread siblings in the lowest domain (SMT) can contend
5251 	 * on the shared cpu pipeline.
5252 	 *
5253 	 * However, while we prefer idle groups at higher domains
5254 	 * finding an idle cpu at the lowest domain is still better than
5255 	 * returning 'target', which we've already established, isn't
5256 	 * idle.
5257 	 */
5258 	sd = rcu_dereference(per_cpu(sd_llc, target));
5259 	for_each_lower_domain(sd) {
5260 		sg = sd->groups;
5261 		do {
5262 			if (!cpumask_intersects(sched_group_cpus(sg),
5263 						tsk_cpus_allowed(p)))
5264 				goto next;
5265 
5266 			/* Ensure the entire group is idle */
5267 			for_each_cpu(i, sched_group_cpus(sg)) {
5268 				if (i == target || !idle_cpu(i))
5269 					goto next;
5270 			}
5271 
5272 			/*
5273 			 * It doesn't matter which cpu we pick, the
5274 			 * whole group is idle.
5275 			 */
5276 			target = cpumask_first_and(sched_group_cpus(sg),
5277 					tsk_cpus_allowed(p));
5278 			goto done;
5279 next:
5280 			sg = sg->next;
5281 		} while (sg != sd->groups);
5282 	}
5283 done:
5284 	return target;
5285 }
5286 
5287 /*
5288  * cpu_util returns the amount of capacity of a CPU that is used by CFS
5289  * tasks. The unit of the return value must be the one of capacity so we can
5290  * compare the utilization with the capacity of the CPU that is available for
5291  * CFS task (ie cpu_capacity).
5292  *
5293  * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
5294  * recent utilization of currently non-runnable tasks on a CPU. It represents
5295  * the amount of utilization of a CPU in the range [0..capacity_orig] where
5296  * capacity_orig is the cpu_capacity available at the highest frequency
5297  * (arch_scale_freq_capacity()).
5298  * The utilization of a CPU converges towards a sum equal to or less than the
5299  * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
5300  * the running time on this CPU scaled by capacity_curr.
5301  *
5302  * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
5303  * higher than capacity_orig because of unfortunate rounding in
5304  * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
5305  * the average stabilizes with the new running time. We need to check that the
5306  * utilization stays within the range of [0..capacity_orig] and cap it if
5307  * necessary. Without utilization capping, a group could be seen as overloaded
5308  * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
5309  * available capacity. We allow utilization to overshoot capacity_curr (but not
5310  * capacity_orig) as it useful for predicting the capacity required after task
5311  * migrations (scheduler-driven DVFS).
5312  */
5313 static int cpu_util(int cpu)
5314 {
5315 	unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
5316 	unsigned long capacity = capacity_orig_of(cpu);
5317 
5318 	return (util >= capacity) ? capacity : util;
5319 }
5320 
5321 /*
5322  * select_task_rq_fair: Select target runqueue for the waking task in domains
5323  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
5324  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
5325  *
5326  * Balances load by selecting the idlest cpu in the idlest group, or under
5327  * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
5328  *
5329  * Returns the target cpu number.
5330  *
5331  * preempt must be disabled.
5332  */
5333 static int
5334 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
5335 {
5336 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
5337 	int cpu = smp_processor_id();
5338 	int new_cpu = prev_cpu;
5339 	int want_affine = 0;
5340 	int sync = wake_flags & WF_SYNC;
5341 
5342 	if (sd_flag & SD_BALANCE_WAKE) {
5343 		record_wakee(p);
5344 		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
5345 	}
5346 
5347 	rcu_read_lock();
5348 	for_each_domain(cpu, tmp) {
5349 		if (!(tmp->flags & SD_LOAD_BALANCE))
5350 			break;
5351 
5352 		/*
5353 		 * If both cpu and prev_cpu are part of this domain,
5354 		 * cpu is a valid SD_WAKE_AFFINE target.
5355 		 */
5356 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
5357 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
5358 			affine_sd = tmp;
5359 			break;
5360 		}
5361 
5362 		if (tmp->flags & sd_flag)
5363 			sd = tmp;
5364 		else if (!want_affine)
5365 			break;
5366 	}
5367 
5368 	if (affine_sd) {
5369 		sd = NULL; /* Prefer wake_affine over balance flags */
5370 		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
5371 			new_cpu = cpu;
5372 	}
5373 
5374 	if (!sd) {
5375 		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
5376 			new_cpu = select_idle_sibling(p, new_cpu);
5377 
5378 	} else while (sd) {
5379 		struct sched_group *group;
5380 		int weight;
5381 
5382 		if (!(sd->flags & sd_flag)) {
5383 			sd = sd->child;
5384 			continue;
5385 		}
5386 
5387 		group = find_idlest_group(sd, p, cpu, sd_flag);
5388 		if (!group) {
5389 			sd = sd->child;
5390 			continue;
5391 		}
5392 
5393 		new_cpu = find_idlest_cpu(group, p, cpu);
5394 		if (new_cpu == -1 || new_cpu == cpu) {
5395 			/* Now try balancing at a lower domain level of cpu */
5396 			sd = sd->child;
5397 			continue;
5398 		}
5399 
5400 		/* Now try balancing at a lower domain level of new_cpu */
5401 		cpu = new_cpu;
5402 		weight = sd->span_weight;
5403 		sd = NULL;
5404 		for_each_domain(cpu, tmp) {
5405 			if (weight <= tmp->span_weight)
5406 				break;
5407 			if (tmp->flags & sd_flag)
5408 				sd = tmp;
5409 		}
5410 		/* while loop will break here if sd == NULL */
5411 	}
5412 	rcu_read_unlock();
5413 
5414 	return new_cpu;
5415 }
5416 
5417 /*
5418  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
5419  * cfs_rq_of(p) references at time of call are still valid and identify the
5420  * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
5421  */
5422 static void migrate_task_rq_fair(struct task_struct *p)
5423 {
5424 	/*
5425 	 * As blocked tasks retain absolute vruntime the migration needs to
5426 	 * deal with this by subtracting the old and adding the new
5427 	 * min_vruntime -- the latter is done by enqueue_entity() when placing
5428 	 * the task on the new runqueue.
5429 	 */
5430 	if (p->state == TASK_WAKING) {
5431 		struct sched_entity *se = &p->se;
5432 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
5433 		u64 min_vruntime;
5434 
5435 #ifndef CONFIG_64BIT
5436 		u64 min_vruntime_copy;
5437 
5438 		do {
5439 			min_vruntime_copy = cfs_rq->min_vruntime_copy;
5440 			smp_rmb();
5441 			min_vruntime = cfs_rq->min_vruntime;
5442 		} while (min_vruntime != min_vruntime_copy);
5443 #else
5444 		min_vruntime = cfs_rq->min_vruntime;
5445 #endif
5446 
5447 		se->vruntime -= min_vruntime;
5448 	}
5449 
5450 	/*
5451 	 * We are supposed to update the task to "current" time, then its up to date
5452 	 * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
5453 	 * what current time is, so simply throw away the out-of-date time. This
5454 	 * will result in the wakee task is less decayed, but giving the wakee more
5455 	 * load sounds not bad.
5456 	 */
5457 	remove_entity_load_avg(&p->se);
5458 
5459 	/* Tell new CPU we are migrated */
5460 	p->se.avg.last_update_time = 0;
5461 
5462 	/* We have migrated, no longer consider this task hot */
5463 	p->se.exec_start = 0;
5464 }
5465 
5466 static void task_dead_fair(struct task_struct *p)
5467 {
5468 	remove_entity_load_avg(&p->se);
5469 }
5470 #endif /* CONFIG_SMP */
5471 
5472 static unsigned long
5473 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
5474 {
5475 	unsigned long gran = sysctl_sched_wakeup_granularity;
5476 
5477 	/*
5478 	 * Since its curr running now, convert the gran from real-time
5479 	 * to virtual-time in his units.
5480 	 *
5481 	 * By using 'se' instead of 'curr' we penalize light tasks, so
5482 	 * they get preempted easier. That is, if 'se' < 'curr' then
5483 	 * the resulting gran will be larger, therefore penalizing the
5484 	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
5485 	 * be smaller, again penalizing the lighter task.
5486 	 *
5487 	 * This is especially important for buddies when the leftmost
5488 	 * task is higher priority than the buddy.
5489 	 */
5490 	return calc_delta_fair(gran, se);
5491 }
5492 
5493 /*
5494  * Should 'se' preempt 'curr'.
5495  *
5496  *             |s1
5497  *        |s2
5498  *   |s3
5499  *         g
5500  *      |<--->|c
5501  *
5502  *  w(c, s1) = -1
5503  *  w(c, s2) =  0
5504  *  w(c, s3) =  1
5505  *
5506  */
5507 static int
5508 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
5509 {
5510 	s64 gran, vdiff = curr->vruntime - se->vruntime;
5511 
5512 	if (vdiff <= 0)
5513 		return -1;
5514 
5515 	gran = wakeup_gran(curr, se);
5516 	if (vdiff > gran)
5517 		return 1;
5518 
5519 	return 0;
5520 }
5521 
5522 static void set_last_buddy(struct sched_entity *se)
5523 {
5524 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5525 		return;
5526 
5527 	for_each_sched_entity(se)
5528 		cfs_rq_of(se)->last = se;
5529 }
5530 
5531 static void set_next_buddy(struct sched_entity *se)
5532 {
5533 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5534 		return;
5535 
5536 	for_each_sched_entity(se)
5537 		cfs_rq_of(se)->next = se;
5538 }
5539 
5540 static void set_skip_buddy(struct sched_entity *se)
5541 {
5542 	for_each_sched_entity(se)
5543 		cfs_rq_of(se)->skip = se;
5544 }
5545 
5546 /*
5547  * Preempt the current task with a newly woken task if needed:
5548  */
5549 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
5550 {
5551 	struct task_struct *curr = rq->curr;
5552 	struct sched_entity *se = &curr->se, *pse = &p->se;
5553 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5554 	int scale = cfs_rq->nr_running >= sched_nr_latency;
5555 	int next_buddy_marked = 0;
5556 
5557 	if (unlikely(se == pse))
5558 		return;
5559 
5560 	/*
5561 	 * This is possible from callers such as attach_tasks(), in which we
5562 	 * unconditionally check_prempt_curr() after an enqueue (which may have
5563 	 * lead to a throttle).  This both saves work and prevents false
5564 	 * next-buddy nomination below.
5565 	 */
5566 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
5567 		return;
5568 
5569 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
5570 		set_next_buddy(pse);
5571 		next_buddy_marked = 1;
5572 	}
5573 
5574 	/*
5575 	 * We can come here with TIF_NEED_RESCHED already set from new task
5576 	 * wake up path.
5577 	 *
5578 	 * Note: this also catches the edge-case of curr being in a throttled
5579 	 * group (e.g. via set_curr_task), since update_curr() (in the
5580 	 * enqueue of curr) will have resulted in resched being set.  This
5581 	 * prevents us from potentially nominating it as a false LAST_BUDDY
5582 	 * below.
5583 	 */
5584 	if (test_tsk_need_resched(curr))
5585 		return;
5586 
5587 	/* Idle tasks are by definition preempted by non-idle tasks. */
5588 	if (unlikely(curr->policy == SCHED_IDLE) &&
5589 	    likely(p->policy != SCHED_IDLE))
5590 		goto preempt;
5591 
5592 	/*
5593 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
5594 	 * is driven by the tick):
5595 	 */
5596 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
5597 		return;
5598 
5599 	find_matching_se(&se, &pse);
5600 	update_curr(cfs_rq_of(se));
5601 	BUG_ON(!pse);
5602 	if (wakeup_preempt_entity(se, pse) == 1) {
5603 		/*
5604 		 * Bias pick_next to pick the sched entity that is
5605 		 * triggering this preemption.
5606 		 */
5607 		if (!next_buddy_marked)
5608 			set_next_buddy(pse);
5609 		goto preempt;
5610 	}
5611 
5612 	return;
5613 
5614 preempt:
5615 	resched_curr(rq);
5616 	/*
5617 	 * Only set the backward buddy when the current task is still
5618 	 * on the rq. This can happen when a wakeup gets interleaved
5619 	 * with schedule on the ->pre_schedule() or idle_balance()
5620 	 * point, either of which can * drop the rq lock.
5621 	 *
5622 	 * Also, during early boot the idle thread is in the fair class,
5623 	 * for obvious reasons its a bad idea to schedule back to it.
5624 	 */
5625 	if (unlikely(!se->on_rq || curr == rq->idle))
5626 		return;
5627 
5628 	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
5629 		set_last_buddy(se);
5630 }
5631 
5632 static struct task_struct *
5633 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
5634 {
5635 	struct cfs_rq *cfs_rq = &rq->cfs;
5636 	struct sched_entity *se;
5637 	struct task_struct *p;
5638 	int new_tasks;
5639 
5640 again:
5641 #ifdef CONFIG_FAIR_GROUP_SCHED
5642 	if (!cfs_rq->nr_running)
5643 		goto idle;
5644 
5645 	if (prev->sched_class != &fair_sched_class)
5646 		goto simple;
5647 
5648 	/*
5649 	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
5650 	 * likely that a next task is from the same cgroup as the current.
5651 	 *
5652 	 * Therefore attempt to avoid putting and setting the entire cgroup
5653 	 * hierarchy, only change the part that actually changes.
5654 	 */
5655 
5656 	do {
5657 		struct sched_entity *curr = cfs_rq->curr;
5658 
5659 		/*
5660 		 * Since we got here without doing put_prev_entity() we also
5661 		 * have to consider cfs_rq->curr. If it is still a runnable
5662 		 * entity, update_curr() will update its vruntime, otherwise
5663 		 * forget we've ever seen it.
5664 		 */
5665 		if (curr) {
5666 			if (curr->on_rq)
5667 				update_curr(cfs_rq);
5668 			else
5669 				curr = NULL;
5670 
5671 			/*
5672 			 * This call to check_cfs_rq_runtime() will do the
5673 			 * throttle and dequeue its entity in the parent(s).
5674 			 * Therefore the 'simple' nr_running test will indeed
5675 			 * be correct.
5676 			 */
5677 			if (unlikely(check_cfs_rq_runtime(cfs_rq)))
5678 				goto simple;
5679 		}
5680 
5681 		se = pick_next_entity(cfs_rq, curr);
5682 		cfs_rq = group_cfs_rq(se);
5683 	} while (cfs_rq);
5684 
5685 	p = task_of(se);
5686 
5687 	/*
5688 	 * Since we haven't yet done put_prev_entity and if the selected task
5689 	 * is a different task than we started out with, try and touch the
5690 	 * least amount of cfs_rqs.
5691 	 */
5692 	if (prev != p) {
5693 		struct sched_entity *pse = &prev->se;
5694 
5695 		while (!(cfs_rq = is_same_group(se, pse))) {
5696 			int se_depth = se->depth;
5697 			int pse_depth = pse->depth;
5698 
5699 			if (se_depth <= pse_depth) {
5700 				put_prev_entity(cfs_rq_of(pse), pse);
5701 				pse = parent_entity(pse);
5702 			}
5703 			if (se_depth >= pse_depth) {
5704 				set_next_entity(cfs_rq_of(se), se);
5705 				se = parent_entity(se);
5706 			}
5707 		}
5708 
5709 		put_prev_entity(cfs_rq, pse);
5710 		set_next_entity(cfs_rq, se);
5711 	}
5712 
5713 	if (hrtick_enabled(rq))
5714 		hrtick_start_fair(rq, p);
5715 
5716 	return p;
5717 simple:
5718 	cfs_rq = &rq->cfs;
5719 #endif
5720 
5721 	if (!cfs_rq->nr_running)
5722 		goto idle;
5723 
5724 	put_prev_task(rq, prev);
5725 
5726 	do {
5727 		se = pick_next_entity(cfs_rq, NULL);
5728 		set_next_entity(cfs_rq, se);
5729 		cfs_rq = group_cfs_rq(se);
5730 	} while (cfs_rq);
5731 
5732 	p = task_of(se);
5733 
5734 	if (hrtick_enabled(rq))
5735 		hrtick_start_fair(rq, p);
5736 
5737 	return p;
5738 
5739 idle:
5740 	/*
5741 	 * This is OK, because current is on_cpu, which avoids it being picked
5742 	 * for load-balance and preemption/IRQs are still disabled avoiding
5743 	 * further scheduler activity on it and we're being very careful to
5744 	 * re-start the picking loop.
5745 	 */
5746 	lockdep_unpin_lock(&rq->lock, cookie);
5747 	new_tasks = idle_balance(rq);
5748 	lockdep_repin_lock(&rq->lock, cookie);
5749 	/*
5750 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
5751 	 * possible for any higher priority task to appear. In that case we
5752 	 * must re-start the pick_next_entity() loop.
5753 	 */
5754 	if (new_tasks < 0)
5755 		return RETRY_TASK;
5756 
5757 	if (new_tasks > 0)
5758 		goto again;
5759 
5760 	return NULL;
5761 }
5762 
5763 /*
5764  * Account for a descheduled task:
5765  */
5766 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
5767 {
5768 	struct sched_entity *se = &prev->se;
5769 	struct cfs_rq *cfs_rq;
5770 
5771 	for_each_sched_entity(se) {
5772 		cfs_rq = cfs_rq_of(se);
5773 		put_prev_entity(cfs_rq, se);
5774 	}
5775 }
5776 
5777 /*
5778  * sched_yield() is very simple
5779  *
5780  * The magic of dealing with the ->skip buddy is in pick_next_entity.
5781  */
5782 static void yield_task_fair(struct rq *rq)
5783 {
5784 	struct task_struct *curr = rq->curr;
5785 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5786 	struct sched_entity *se = &curr->se;
5787 
5788 	/*
5789 	 * Are we the only task in the tree?
5790 	 */
5791 	if (unlikely(rq->nr_running == 1))
5792 		return;
5793 
5794 	clear_buddies(cfs_rq, se);
5795 
5796 	if (curr->policy != SCHED_BATCH) {
5797 		update_rq_clock(rq);
5798 		/*
5799 		 * Update run-time statistics of the 'current'.
5800 		 */
5801 		update_curr(cfs_rq);
5802 		/*
5803 		 * Tell update_rq_clock() that we've just updated,
5804 		 * so we don't do microscopic update in schedule()
5805 		 * and double the fastpath cost.
5806 		 */
5807 		rq_clock_skip_update(rq, true);
5808 	}
5809 
5810 	set_skip_buddy(se);
5811 }
5812 
5813 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
5814 {
5815 	struct sched_entity *se = &p->se;
5816 
5817 	/* throttled hierarchies are not runnable */
5818 	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
5819 		return false;
5820 
5821 	/* Tell the scheduler that we'd really like pse to run next. */
5822 	set_next_buddy(se);
5823 
5824 	yield_task_fair(rq);
5825 
5826 	return true;
5827 }
5828 
5829 #ifdef CONFIG_SMP
5830 /**************************************************
5831  * Fair scheduling class load-balancing methods.
5832  *
5833  * BASICS
5834  *
5835  * The purpose of load-balancing is to achieve the same basic fairness the
5836  * per-cpu scheduler provides, namely provide a proportional amount of compute
5837  * time to each task. This is expressed in the following equation:
5838  *
5839  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
5840  *
5841  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
5842  * W_i,0 is defined as:
5843  *
5844  *   W_i,0 = \Sum_j w_i,j                                             (2)
5845  *
5846  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
5847  * is derived from the nice value as per sched_prio_to_weight[].
5848  *
5849  * The weight average is an exponential decay average of the instantaneous
5850  * weight:
5851  *
5852  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
5853  *
5854  * C_i is the compute capacity of cpu i, typically it is the
5855  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
5856  * can also include other factors [XXX].
5857  *
5858  * To achieve this balance we define a measure of imbalance which follows
5859  * directly from (1):
5860  *
5861  *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
5862  *
5863  * We them move tasks around to minimize the imbalance. In the continuous
5864  * function space it is obvious this converges, in the discrete case we get
5865  * a few fun cases generally called infeasible weight scenarios.
5866  *
5867  * [XXX expand on:
5868  *     - infeasible weights;
5869  *     - local vs global optima in the discrete case. ]
5870  *
5871  *
5872  * SCHED DOMAINS
5873  *
5874  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
5875  * for all i,j solution, we create a tree of cpus that follows the hardware
5876  * topology where each level pairs two lower groups (or better). This results
5877  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
5878  * tree to only the first of the previous level and we decrease the frequency
5879  * of load-balance at each level inv. proportional to the number of cpus in
5880  * the groups.
5881  *
5882  * This yields:
5883  *
5884  *     log_2 n     1     n
5885  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
5886  *     i = 0      2^i   2^i
5887  *                               `- size of each group
5888  *         |         |     `- number of cpus doing load-balance
5889  *         |         `- freq
5890  *         `- sum over all levels
5891  *
5892  * Coupled with a limit on how many tasks we can migrate every balance pass,
5893  * this makes (5) the runtime complexity of the balancer.
5894  *
5895  * An important property here is that each CPU is still (indirectly) connected
5896  * to every other cpu in at most O(log n) steps:
5897  *
5898  * The adjacency matrix of the resulting graph is given by:
5899  *
5900  *             log_2 n
5901  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
5902  *             k = 0
5903  *
5904  * And you'll find that:
5905  *
5906  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
5907  *
5908  * Showing there's indeed a path between every cpu in at most O(log n) steps.
5909  * The task movement gives a factor of O(m), giving a convergence complexity
5910  * of:
5911  *
5912  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
5913  *
5914  *
5915  * WORK CONSERVING
5916  *
5917  * In order to avoid CPUs going idle while there's still work to do, new idle
5918  * balancing is more aggressive and has the newly idle cpu iterate up the domain
5919  * tree itself instead of relying on other CPUs to bring it work.
5920  *
5921  * This adds some complexity to both (5) and (8) but it reduces the total idle
5922  * time.
5923  *
5924  * [XXX more?]
5925  *
5926  *
5927  * CGROUPS
5928  *
5929  * Cgroups make a horror show out of (2), instead of a simple sum we get:
5930  *
5931  *                                s_k,i
5932  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
5933  *                                 S_k
5934  *
5935  * Where
5936  *
5937  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
5938  *
5939  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
5940  *
5941  * The big problem is S_k, its a global sum needed to compute a local (W_i)
5942  * property.
5943  *
5944  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5945  *      rewrite all of this once again.]
5946  */
5947 
5948 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5949 
5950 enum fbq_type { regular, remote, all };
5951 
5952 #define LBF_ALL_PINNED	0x01
5953 #define LBF_NEED_BREAK	0x02
5954 #define LBF_DST_PINNED  0x04
5955 #define LBF_SOME_PINNED	0x08
5956 
5957 struct lb_env {
5958 	struct sched_domain	*sd;
5959 
5960 	struct rq		*src_rq;
5961 	int			src_cpu;
5962 
5963 	int			dst_cpu;
5964 	struct rq		*dst_rq;
5965 
5966 	struct cpumask		*dst_grpmask;
5967 	int			new_dst_cpu;
5968 	enum cpu_idle_type	idle;
5969 	long			imbalance;
5970 	/* The set of CPUs under consideration for load-balancing */
5971 	struct cpumask		*cpus;
5972 
5973 	unsigned int		flags;
5974 
5975 	unsigned int		loop;
5976 	unsigned int		loop_break;
5977 	unsigned int		loop_max;
5978 
5979 	enum fbq_type		fbq_type;
5980 	struct list_head	tasks;
5981 };
5982 
5983 /*
5984  * Is this task likely cache-hot:
5985  */
5986 static int task_hot(struct task_struct *p, struct lb_env *env)
5987 {
5988 	s64 delta;
5989 
5990 	lockdep_assert_held(&env->src_rq->lock);
5991 
5992 	if (p->sched_class != &fair_sched_class)
5993 		return 0;
5994 
5995 	if (unlikely(p->policy == SCHED_IDLE))
5996 		return 0;
5997 
5998 	/*
5999 	 * Buddy candidates are cache hot:
6000 	 */
6001 	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
6002 			(&p->se == cfs_rq_of(&p->se)->next ||
6003 			 &p->se == cfs_rq_of(&p->se)->last))
6004 		return 1;
6005 
6006 	if (sysctl_sched_migration_cost == -1)
6007 		return 1;
6008 	if (sysctl_sched_migration_cost == 0)
6009 		return 0;
6010 
6011 	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
6012 
6013 	return delta < (s64)sysctl_sched_migration_cost;
6014 }
6015 
6016 #ifdef CONFIG_NUMA_BALANCING
6017 /*
6018  * Returns 1, if task migration degrades locality
6019  * Returns 0, if task migration improves locality i.e migration preferred.
6020  * Returns -1, if task migration is not affected by locality.
6021  */
6022 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
6023 {
6024 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
6025 	unsigned long src_faults, dst_faults;
6026 	int src_nid, dst_nid;
6027 
6028 	if (!static_branch_likely(&sched_numa_balancing))
6029 		return -1;
6030 
6031 	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
6032 		return -1;
6033 
6034 	src_nid = cpu_to_node(env->src_cpu);
6035 	dst_nid = cpu_to_node(env->dst_cpu);
6036 
6037 	if (src_nid == dst_nid)
6038 		return -1;
6039 
6040 	/* Migrating away from the preferred node is always bad. */
6041 	if (src_nid == p->numa_preferred_nid) {
6042 		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
6043 			return 1;
6044 		else
6045 			return -1;
6046 	}
6047 
6048 	/* Encourage migration to the preferred node. */
6049 	if (dst_nid == p->numa_preferred_nid)
6050 		return 0;
6051 
6052 	if (numa_group) {
6053 		src_faults = group_faults(p, src_nid);
6054 		dst_faults = group_faults(p, dst_nid);
6055 	} else {
6056 		src_faults = task_faults(p, src_nid);
6057 		dst_faults = task_faults(p, dst_nid);
6058 	}
6059 
6060 	return dst_faults < src_faults;
6061 }
6062 
6063 #else
6064 static inline int migrate_degrades_locality(struct task_struct *p,
6065 					     struct lb_env *env)
6066 {
6067 	return -1;
6068 }
6069 #endif
6070 
6071 /*
6072  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
6073  */
6074 static
6075 int can_migrate_task(struct task_struct *p, struct lb_env *env)
6076 {
6077 	int tsk_cache_hot;
6078 
6079 	lockdep_assert_held(&env->src_rq->lock);
6080 
6081 	/*
6082 	 * We do not migrate tasks that are:
6083 	 * 1) throttled_lb_pair, or
6084 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
6085 	 * 3) running (obviously), or
6086 	 * 4) are cache-hot on their current CPU.
6087 	 */
6088 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
6089 		return 0;
6090 
6091 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
6092 		int cpu;
6093 
6094 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
6095 
6096 		env->flags |= LBF_SOME_PINNED;
6097 
6098 		/*
6099 		 * Remember if this task can be migrated to any other cpu in
6100 		 * our sched_group. We may want to revisit it if we couldn't
6101 		 * meet load balance goals by pulling other tasks on src_cpu.
6102 		 *
6103 		 * Also avoid computing new_dst_cpu if we have already computed
6104 		 * one in current iteration.
6105 		 */
6106 		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
6107 			return 0;
6108 
6109 		/* Prevent to re-select dst_cpu via env's cpus */
6110 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
6111 			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
6112 				env->flags |= LBF_DST_PINNED;
6113 				env->new_dst_cpu = cpu;
6114 				break;
6115 			}
6116 		}
6117 
6118 		return 0;
6119 	}
6120 
6121 	/* Record that we found atleast one task that could run on dst_cpu */
6122 	env->flags &= ~LBF_ALL_PINNED;
6123 
6124 	if (task_running(env->src_rq, p)) {
6125 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
6126 		return 0;
6127 	}
6128 
6129 	/*
6130 	 * Aggressive migration if:
6131 	 * 1) destination numa is preferred
6132 	 * 2) task is cache cold, or
6133 	 * 3) too many balance attempts have failed.
6134 	 */
6135 	tsk_cache_hot = migrate_degrades_locality(p, env);
6136 	if (tsk_cache_hot == -1)
6137 		tsk_cache_hot = task_hot(p, env);
6138 
6139 	if (tsk_cache_hot <= 0 ||
6140 	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
6141 		if (tsk_cache_hot == 1) {
6142 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
6143 			schedstat_inc(p, se.statistics.nr_forced_migrations);
6144 		}
6145 		return 1;
6146 	}
6147 
6148 	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
6149 	return 0;
6150 }
6151 
6152 /*
6153  * detach_task() -- detach the task for the migration specified in env
6154  */
6155 static void detach_task(struct task_struct *p, struct lb_env *env)
6156 {
6157 	lockdep_assert_held(&env->src_rq->lock);
6158 
6159 	p->on_rq = TASK_ON_RQ_MIGRATING;
6160 	deactivate_task(env->src_rq, p, 0);
6161 	set_task_cpu(p, env->dst_cpu);
6162 }
6163 
6164 /*
6165  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
6166  * part of active balancing operations within "domain".
6167  *
6168  * Returns a task if successful and NULL otherwise.
6169  */
6170 static struct task_struct *detach_one_task(struct lb_env *env)
6171 {
6172 	struct task_struct *p, *n;
6173 
6174 	lockdep_assert_held(&env->src_rq->lock);
6175 
6176 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
6177 		if (!can_migrate_task(p, env))
6178 			continue;
6179 
6180 		detach_task(p, env);
6181 
6182 		/*
6183 		 * Right now, this is only the second place where
6184 		 * lb_gained[env->idle] is updated (other is detach_tasks)
6185 		 * so we can safely collect stats here rather than
6186 		 * inside detach_tasks().
6187 		 */
6188 		schedstat_inc(env->sd, lb_gained[env->idle]);
6189 		return p;
6190 	}
6191 	return NULL;
6192 }
6193 
6194 static const unsigned int sched_nr_migrate_break = 32;
6195 
6196 /*
6197  * detach_tasks() -- tries to detach up to imbalance weighted load from
6198  * busiest_rq, as part of a balancing operation within domain "sd".
6199  *
6200  * Returns number of detached tasks if successful and 0 otherwise.
6201  */
6202 static int detach_tasks(struct lb_env *env)
6203 {
6204 	struct list_head *tasks = &env->src_rq->cfs_tasks;
6205 	struct task_struct *p;
6206 	unsigned long load;
6207 	int detached = 0;
6208 
6209 	lockdep_assert_held(&env->src_rq->lock);
6210 
6211 	if (env->imbalance <= 0)
6212 		return 0;
6213 
6214 	while (!list_empty(tasks)) {
6215 		/*
6216 		 * We don't want to steal all, otherwise we may be treated likewise,
6217 		 * which could at worst lead to a livelock crash.
6218 		 */
6219 		if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
6220 			break;
6221 
6222 		p = list_first_entry(tasks, struct task_struct, se.group_node);
6223 
6224 		env->loop++;
6225 		/* We've more or less seen every task there is, call it quits */
6226 		if (env->loop > env->loop_max)
6227 			break;
6228 
6229 		/* take a breather every nr_migrate tasks */
6230 		if (env->loop > env->loop_break) {
6231 			env->loop_break += sched_nr_migrate_break;
6232 			env->flags |= LBF_NEED_BREAK;
6233 			break;
6234 		}
6235 
6236 		if (!can_migrate_task(p, env))
6237 			goto next;
6238 
6239 		load = task_h_load(p);
6240 
6241 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
6242 			goto next;
6243 
6244 		if ((load / 2) > env->imbalance)
6245 			goto next;
6246 
6247 		detach_task(p, env);
6248 		list_add(&p->se.group_node, &env->tasks);
6249 
6250 		detached++;
6251 		env->imbalance -= load;
6252 
6253 #ifdef CONFIG_PREEMPT
6254 		/*
6255 		 * NEWIDLE balancing is a source of latency, so preemptible
6256 		 * kernels will stop after the first task is detached to minimize
6257 		 * the critical section.
6258 		 */
6259 		if (env->idle == CPU_NEWLY_IDLE)
6260 			break;
6261 #endif
6262 
6263 		/*
6264 		 * We only want to steal up to the prescribed amount of
6265 		 * weighted load.
6266 		 */
6267 		if (env->imbalance <= 0)
6268 			break;
6269 
6270 		continue;
6271 next:
6272 		list_move_tail(&p->se.group_node, tasks);
6273 	}
6274 
6275 	/*
6276 	 * Right now, this is one of only two places we collect this stat
6277 	 * so we can safely collect detach_one_task() stats here rather
6278 	 * than inside detach_one_task().
6279 	 */
6280 	schedstat_add(env->sd, lb_gained[env->idle], detached);
6281 
6282 	return detached;
6283 }
6284 
6285 /*
6286  * attach_task() -- attach the task detached by detach_task() to its new rq.
6287  */
6288 static void attach_task(struct rq *rq, struct task_struct *p)
6289 {
6290 	lockdep_assert_held(&rq->lock);
6291 
6292 	BUG_ON(task_rq(p) != rq);
6293 	activate_task(rq, p, 0);
6294 	p->on_rq = TASK_ON_RQ_QUEUED;
6295 	check_preempt_curr(rq, p, 0);
6296 }
6297 
6298 /*
6299  * attach_one_task() -- attaches the task returned from detach_one_task() to
6300  * its new rq.
6301  */
6302 static void attach_one_task(struct rq *rq, struct task_struct *p)
6303 {
6304 	raw_spin_lock(&rq->lock);
6305 	attach_task(rq, p);
6306 	raw_spin_unlock(&rq->lock);
6307 }
6308 
6309 /*
6310  * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
6311  * new rq.
6312  */
6313 static void attach_tasks(struct lb_env *env)
6314 {
6315 	struct list_head *tasks = &env->tasks;
6316 	struct task_struct *p;
6317 
6318 	raw_spin_lock(&env->dst_rq->lock);
6319 
6320 	while (!list_empty(tasks)) {
6321 		p = list_first_entry(tasks, struct task_struct, se.group_node);
6322 		list_del_init(&p->se.group_node);
6323 
6324 		attach_task(env->dst_rq, p);
6325 	}
6326 
6327 	raw_spin_unlock(&env->dst_rq->lock);
6328 }
6329 
6330 #ifdef CONFIG_FAIR_GROUP_SCHED
6331 static void update_blocked_averages(int cpu)
6332 {
6333 	struct rq *rq = cpu_rq(cpu);
6334 	struct cfs_rq *cfs_rq;
6335 	unsigned long flags;
6336 
6337 	raw_spin_lock_irqsave(&rq->lock, flags);
6338 	update_rq_clock(rq);
6339 
6340 	/*
6341 	 * Iterates the task_group tree in a bottom up fashion, see
6342 	 * list_add_leaf_cfs_rq() for details.
6343 	 */
6344 	for_each_leaf_cfs_rq(rq, cfs_rq) {
6345 		/* throttled entities do not contribute to load */
6346 		if (throttled_hierarchy(cfs_rq))
6347 			continue;
6348 
6349 		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
6350 			update_tg_load_avg(cfs_rq, 0);
6351 	}
6352 	raw_spin_unlock_irqrestore(&rq->lock, flags);
6353 }
6354 
6355 /*
6356  * Compute the hierarchical load factor for cfs_rq and all its ascendants.
6357  * This needs to be done in a top-down fashion because the load of a child
6358  * group is a fraction of its parents load.
6359  */
6360 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
6361 {
6362 	struct rq *rq = rq_of(cfs_rq);
6363 	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
6364 	unsigned long now = jiffies;
6365 	unsigned long load;
6366 
6367 	if (cfs_rq->last_h_load_update == now)
6368 		return;
6369 
6370 	cfs_rq->h_load_next = NULL;
6371 	for_each_sched_entity(se) {
6372 		cfs_rq = cfs_rq_of(se);
6373 		cfs_rq->h_load_next = se;
6374 		if (cfs_rq->last_h_load_update == now)
6375 			break;
6376 	}
6377 
6378 	if (!se) {
6379 		cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
6380 		cfs_rq->last_h_load_update = now;
6381 	}
6382 
6383 	while ((se = cfs_rq->h_load_next) != NULL) {
6384 		load = cfs_rq->h_load;
6385 		load = div64_ul(load * se->avg.load_avg,
6386 			cfs_rq_load_avg(cfs_rq) + 1);
6387 		cfs_rq = group_cfs_rq(se);
6388 		cfs_rq->h_load = load;
6389 		cfs_rq->last_h_load_update = now;
6390 	}
6391 }
6392 
6393 static unsigned long task_h_load(struct task_struct *p)
6394 {
6395 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
6396 
6397 	update_cfs_rq_h_load(cfs_rq);
6398 	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
6399 			cfs_rq_load_avg(cfs_rq) + 1);
6400 }
6401 #else
6402 static inline void update_blocked_averages(int cpu)
6403 {
6404 	struct rq *rq = cpu_rq(cpu);
6405 	struct cfs_rq *cfs_rq = &rq->cfs;
6406 	unsigned long flags;
6407 
6408 	raw_spin_lock_irqsave(&rq->lock, flags);
6409 	update_rq_clock(rq);
6410 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
6411 	raw_spin_unlock_irqrestore(&rq->lock, flags);
6412 }
6413 
6414 static unsigned long task_h_load(struct task_struct *p)
6415 {
6416 	return p->se.avg.load_avg;
6417 }
6418 #endif
6419 
6420 /********** Helpers for find_busiest_group ************************/
6421 
6422 enum group_type {
6423 	group_other = 0,
6424 	group_imbalanced,
6425 	group_overloaded,
6426 };
6427 
6428 /*
6429  * sg_lb_stats - stats of a sched_group required for load_balancing
6430  */
6431 struct sg_lb_stats {
6432 	unsigned long avg_load; /*Avg load across the CPUs of the group */
6433 	unsigned long group_load; /* Total load over the CPUs of the group */
6434 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
6435 	unsigned long load_per_task;
6436 	unsigned long group_capacity;
6437 	unsigned long group_util; /* Total utilization of the group */
6438 	unsigned int sum_nr_running; /* Nr tasks running in the group */
6439 	unsigned int idle_cpus;
6440 	unsigned int group_weight;
6441 	enum group_type group_type;
6442 	int group_no_capacity;
6443 #ifdef CONFIG_NUMA_BALANCING
6444 	unsigned int nr_numa_running;
6445 	unsigned int nr_preferred_running;
6446 #endif
6447 };
6448 
6449 /*
6450  * sd_lb_stats - Structure to store the statistics of a sched_domain
6451  *		 during load balancing.
6452  */
6453 struct sd_lb_stats {
6454 	struct sched_group *busiest;	/* Busiest group in this sd */
6455 	struct sched_group *local;	/* Local group in this sd */
6456 	unsigned long total_load;	/* Total load of all groups in sd */
6457 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
6458 	unsigned long avg_load;	/* Average load across all groups in sd */
6459 
6460 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
6461 	struct sg_lb_stats local_stat;	/* Statistics of the local group */
6462 };
6463 
6464 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
6465 {
6466 	/*
6467 	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
6468 	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
6469 	 * We must however clear busiest_stat::avg_load because
6470 	 * update_sd_pick_busiest() reads this before assignment.
6471 	 */
6472 	*sds = (struct sd_lb_stats){
6473 		.busiest = NULL,
6474 		.local = NULL,
6475 		.total_load = 0UL,
6476 		.total_capacity = 0UL,
6477 		.busiest_stat = {
6478 			.avg_load = 0UL,
6479 			.sum_nr_running = 0,
6480 			.group_type = group_other,
6481 		},
6482 	};
6483 }
6484 
6485 /**
6486  * get_sd_load_idx - Obtain the load index for a given sched domain.
6487  * @sd: The sched_domain whose load_idx is to be obtained.
6488  * @idle: The idle status of the CPU for whose sd load_idx is obtained.
6489  *
6490  * Return: The load index.
6491  */
6492 static inline int get_sd_load_idx(struct sched_domain *sd,
6493 					enum cpu_idle_type idle)
6494 {
6495 	int load_idx;
6496 
6497 	switch (idle) {
6498 	case CPU_NOT_IDLE:
6499 		load_idx = sd->busy_idx;
6500 		break;
6501 
6502 	case CPU_NEWLY_IDLE:
6503 		load_idx = sd->newidle_idx;
6504 		break;
6505 	default:
6506 		load_idx = sd->idle_idx;
6507 		break;
6508 	}
6509 
6510 	return load_idx;
6511 }
6512 
6513 static unsigned long scale_rt_capacity(int cpu)
6514 {
6515 	struct rq *rq = cpu_rq(cpu);
6516 	u64 total, used, age_stamp, avg;
6517 	s64 delta;
6518 
6519 	/*
6520 	 * Since we're reading these variables without serialization make sure
6521 	 * we read them once before doing sanity checks on them.
6522 	 */
6523 	age_stamp = READ_ONCE(rq->age_stamp);
6524 	avg = READ_ONCE(rq->rt_avg);
6525 	delta = __rq_clock_broken(rq) - age_stamp;
6526 
6527 	if (unlikely(delta < 0))
6528 		delta = 0;
6529 
6530 	total = sched_avg_period() + delta;
6531 
6532 	used = div_u64(avg, total);
6533 
6534 	if (likely(used < SCHED_CAPACITY_SCALE))
6535 		return SCHED_CAPACITY_SCALE - used;
6536 
6537 	return 1;
6538 }
6539 
6540 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
6541 {
6542 	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
6543 	struct sched_group *sdg = sd->groups;
6544 
6545 	cpu_rq(cpu)->cpu_capacity_orig = capacity;
6546 
6547 	capacity *= scale_rt_capacity(cpu);
6548 	capacity >>= SCHED_CAPACITY_SHIFT;
6549 
6550 	if (!capacity)
6551 		capacity = 1;
6552 
6553 	cpu_rq(cpu)->cpu_capacity = capacity;
6554 	sdg->sgc->capacity = capacity;
6555 }
6556 
6557 void update_group_capacity(struct sched_domain *sd, int cpu)
6558 {
6559 	struct sched_domain *child = sd->child;
6560 	struct sched_group *group, *sdg = sd->groups;
6561 	unsigned long capacity;
6562 	unsigned long interval;
6563 
6564 	interval = msecs_to_jiffies(sd->balance_interval);
6565 	interval = clamp(interval, 1UL, max_load_balance_interval);
6566 	sdg->sgc->next_update = jiffies + interval;
6567 
6568 	if (!child) {
6569 		update_cpu_capacity(sd, cpu);
6570 		return;
6571 	}
6572 
6573 	capacity = 0;
6574 
6575 	if (child->flags & SD_OVERLAP) {
6576 		/*
6577 		 * SD_OVERLAP domains cannot assume that child groups
6578 		 * span the current group.
6579 		 */
6580 
6581 		for_each_cpu(cpu, sched_group_cpus(sdg)) {
6582 			struct sched_group_capacity *sgc;
6583 			struct rq *rq = cpu_rq(cpu);
6584 
6585 			/*
6586 			 * build_sched_domains() -> init_sched_groups_capacity()
6587 			 * gets here before we've attached the domains to the
6588 			 * runqueues.
6589 			 *
6590 			 * Use capacity_of(), which is set irrespective of domains
6591 			 * in update_cpu_capacity().
6592 			 *
6593 			 * This avoids capacity from being 0 and
6594 			 * causing divide-by-zero issues on boot.
6595 			 */
6596 			if (unlikely(!rq->sd)) {
6597 				capacity += capacity_of(cpu);
6598 				continue;
6599 			}
6600 
6601 			sgc = rq->sd->groups->sgc;
6602 			capacity += sgc->capacity;
6603 		}
6604 	} else  {
6605 		/*
6606 		 * !SD_OVERLAP domains can assume that child groups
6607 		 * span the current group.
6608 		 */
6609 
6610 		group = child->groups;
6611 		do {
6612 			capacity += group->sgc->capacity;
6613 			group = group->next;
6614 		} while (group != child->groups);
6615 	}
6616 
6617 	sdg->sgc->capacity = capacity;
6618 }
6619 
6620 /*
6621  * Check whether the capacity of the rq has been noticeably reduced by side
6622  * activity. The imbalance_pct is used for the threshold.
6623  * Return true is the capacity is reduced
6624  */
6625 static inline int
6626 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
6627 {
6628 	return ((rq->cpu_capacity * sd->imbalance_pct) <
6629 				(rq->cpu_capacity_orig * 100));
6630 }
6631 
6632 /*
6633  * Group imbalance indicates (and tries to solve) the problem where balancing
6634  * groups is inadequate due to tsk_cpus_allowed() constraints.
6635  *
6636  * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
6637  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
6638  * Something like:
6639  *
6640  * 	{ 0 1 2 3 } { 4 5 6 7 }
6641  * 	        *     * * *
6642  *
6643  * If we were to balance group-wise we'd place two tasks in the first group and
6644  * two tasks in the second group. Clearly this is undesired as it will overload
6645  * cpu 3 and leave one of the cpus in the second group unused.
6646  *
6647  * The current solution to this issue is detecting the skew in the first group
6648  * by noticing the lower domain failed to reach balance and had difficulty
6649  * moving tasks due to affinity constraints.
6650  *
6651  * When this is so detected; this group becomes a candidate for busiest; see
6652  * update_sd_pick_busiest(). And calculate_imbalance() and
6653  * find_busiest_group() avoid some of the usual balance conditions to allow it
6654  * to create an effective group imbalance.
6655  *
6656  * This is a somewhat tricky proposition since the next run might not find the
6657  * group imbalance and decide the groups need to be balanced again. A most
6658  * subtle and fragile situation.
6659  */
6660 
6661 static inline int sg_imbalanced(struct sched_group *group)
6662 {
6663 	return group->sgc->imbalance;
6664 }
6665 
6666 /*
6667  * group_has_capacity returns true if the group has spare capacity that could
6668  * be used by some tasks.
6669  * We consider that a group has spare capacity if the  * number of task is
6670  * smaller than the number of CPUs or if the utilization is lower than the
6671  * available capacity for CFS tasks.
6672  * For the latter, we use a threshold to stabilize the state, to take into
6673  * account the variance of the tasks' load and to return true if the available
6674  * capacity in meaningful for the load balancer.
6675  * As an example, an available capacity of 1% can appear but it doesn't make
6676  * any benefit for the load balance.
6677  */
6678 static inline bool
6679 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
6680 {
6681 	if (sgs->sum_nr_running < sgs->group_weight)
6682 		return true;
6683 
6684 	if ((sgs->group_capacity * 100) >
6685 			(sgs->group_util * env->sd->imbalance_pct))
6686 		return true;
6687 
6688 	return false;
6689 }
6690 
6691 /*
6692  *  group_is_overloaded returns true if the group has more tasks than it can
6693  *  handle.
6694  *  group_is_overloaded is not equals to !group_has_capacity because a group
6695  *  with the exact right number of tasks, has no more spare capacity but is not
6696  *  overloaded so both group_has_capacity and group_is_overloaded return
6697  *  false.
6698  */
6699 static inline bool
6700 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6701 {
6702 	if (sgs->sum_nr_running <= sgs->group_weight)
6703 		return false;
6704 
6705 	if ((sgs->group_capacity * 100) <
6706 			(sgs->group_util * env->sd->imbalance_pct))
6707 		return true;
6708 
6709 	return false;
6710 }
6711 
6712 static inline enum
6713 group_type group_classify(struct sched_group *group,
6714 			  struct sg_lb_stats *sgs)
6715 {
6716 	if (sgs->group_no_capacity)
6717 		return group_overloaded;
6718 
6719 	if (sg_imbalanced(group))
6720 		return group_imbalanced;
6721 
6722 	return group_other;
6723 }
6724 
6725 /**
6726  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
6727  * @env: The load balancing environment.
6728  * @group: sched_group whose statistics are to be updated.
6729  * @load_idx: Load index of sched_domain of this_cpu for load calc.
6730  * @local_group: Does group contain this_cpu.
6731  * @sgs: variable to hold the statistics for this group.
6732  * @overload: Indicate more than one runnable task for any CPU.
6733  */
6734 static inline void update_sg_lb_stats(struct lb_env *env,
6735 			struct sched_group *group, int load_idx,
6736 			int local_group, struct sg_lb_stats *sgs,
6737 			bool *overload)
6738 {
6739 	unsigned long load;
6740 	int i, nr_running;
6741 
6742 	memset(sgs, 0, sizeof(*sgs));
6743 
6744 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6745 		struct rq *rq = cpu_rq(i);
6746 
6747 		/* Bias balancing toward cpus of our domain */
6748 		if (local_group)
6749 			load = target_load(i, load_idx);
6750 		else
6751 			load = source_load(i, load_idx);
6752 
6753 		sgs->group_load += load;
6754 		sgs->group_util += cpu_util(i);
6755 		sgs->sum_nr_running += rq->cfs.h_nr_running;
6756 
6757 		nr_running = rq->nr_running;
6758 		if (nr_running > 1)
6759 			*overload = true;
6760 
6761 #ifdef CONFIG_NUMA_BALANCING
6762 		sgs->nr_numa_running += rq->nr_numa_running;
6763 		sgs->nr_preferred_running += rq->nr_preferred_running;
6764 #endif
6765 		sgs->sum_weighted_load += weighted_cpuload(i);
6766 		/*
6767 		 * No need to call idle_cpu() if nr_running is not 0
6768 		 */
6769 		if (!nr_running && idle_cpu(i))
6770 			sgs->idle_cpus++;
6771 	}
6772 
6773 	/* Adjust by relative CPU capacity of the group */
6774 	sgs->group_capacity = group->sgc->capacity;
6775 	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
6776 
6777 	if (sgs->sum_nr_running)
6778 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
6779 
6780 	sgs->group_weight = group->group_weight;
6781 
6782 	sgs->group_no_capacity = group_is_overloaded(env, sgs);
6783 	sgs->group_type = group_classify(group, sgs);
6784 }
6785 
6786 /**
6787  * update_sd_pick_busiest - return 1 on busiest group
6788  * @env: The load balancing environment.
6789  * @sds: sched_domain statistics
6790  * @sg: sched_group candidate to be checked for being the busiest
6791  * @sgs: sched_group statistics
6792  *
6793  * Determine if @sg is a busier group than the previously selected
6794  * busiest group.
6795  *
6796  * Return: %true if @sg is a busier group than the previously selected
6797  * busiest group. %false otherwise.
6798  */
6799 static bool update_sd_pick_busiest(struct lb_env *env,
6800 				   struct sd_lb_stats *sds,
6801 				   struct sched_group *sg,
6802 				   struct sg_lb_stats *sgs)
6803 {
6804 	struct sg_lb_stats *busiest = &sds->busiest_stat;
6805 
6806 	if (sgs->group_type > busiest->group_type)
6807 		return true;
6808 
6809 	if (sgs->group_type < busiest->group_type)
6810 		return false;
6811 
6812 	if (sgs->avg_load <= busiest->avg_load)
6813 		return false;
6814 
6815 	/* This is the busiest node in its class. */
6816 	if (!(env->sd->flags & SD_ASYM_PACKING))
6817 		return true;
6818 
6819 	/* No ASYM_PACKING if target cpu is already busy */
6820 	if (env->idle == CPU_NOT_IDLE)
6821 		return true;
6822 	/*
6823 	 * ASYM_PACKING needs to move all the work to the lowest
6824 	 * numbered CPUs in the group, therefore mark all groups
6825 	 * higher than ourself as busy.
6826 	 */
6827 	if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
6828 		if (!sds->busiest)
6829 			return true;
6830 
6831 		/* Prefer to move from highest possible cpu's work */
6832 		if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
6833 			return true;
6834 	}
6835 
6836 	return false;
6837 }
6838 
6839 #ifdef CONFIG_NUMA_BALANCING
6840 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6841 {
6842 	if (sgs->sum_nr_running > sgs->nr_numa_running)
6843 		return regular;
6844 	if (sgs->sum_nr_running > sgs->nr_preferred_running)
6845 		return remote;
6846 	return all;
6847 }
6848 
6849 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6850 {
6851 	if (rq->nr_running > rq->nr_numa_running)
6852 		return regular;
6853 	if (rq->nr_running > rq->nr_preferred_running)
6854 		return remote;
6855 	return all;
6856 }
6857 #else
6858 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6859 {
6860 	return all;
6861 }
6862 
6863 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6864 {
6865 	return regular;
6866 }
6867 #endif /* CONFIG_NUMA_BALANCING */
6868 
6869 /**
6870  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
6871  * @env: The load balancing environment.
6872  * @sds: variable to hold the statistics for this sched_domain.
6873  */
6874 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
6875 {
6876 	struct sched_domain *child = env->sd->child;
6877 	struct sched_group *sg = env->sd->groups;
6878 	struct sg_lb_stats tmp_sgs;
6879 	int load_idx, prefer_sibling = 0;
6880 	bool overload = false;
6881 
6882 	if (child && child->flags & SD_PREFER_SIBLING)
6883 		prefer_sibling = 1;
6884 
6885 	load_idx = get_sd_load_idx(env->sd, env->idle);
6886 
6887 	do {
6888 		struct sg_lb_stats *sgs = &tmp_sgs;
6889 		int local_group;
6890 
6891 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
6892 		if (local_group) {
6893 			sds->local = sg;
6894 			sgs = &sds->local_stat;
6895 
6896 			if (env->idle != CPU_NEWLY_IDLE ||
6897 			    time_after_eq(jiffies, sg->sgc->next_update))
6898 				update_group_capacity(env->sd, env->dst_cpu);
6899 		}
6900 
6901 		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
6902 						&overload);
6903 
6904 		if (local_group)
6905 			goto next_group;
6906 
6907 		/*
6908 		 * In case the child domain prefers tasks go to siblings
6909 		 * first, lower the sg capacity so that we'll try
6910 		 * and move all the excess tasks away. We lower the capacity
6911 		 * of a group only if the local group has the capacity to fit
6912 		 * these excess tasks. The extra check prevents the case where
6913 		 * you always pull from the heaviest group when it is already
6914 		 * under-utilized (possible with a large weight task outweighs
6915 		 * the tasks on the system).
6916 		 */
6917 		if (prefer_sibling && sds->local &&
6918 		    group_has_capacity(env, &sds->local_stat) &&
6919 		    (sgs->sum_nr_running > 1)) {
6920 			sgs->group_no_capacity = 1;
6921 			sgs->group_type = group_classify(sg, sgs);
6922 		}
6923 
6924 		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
6925 			sds->busiest = sg;
6926 			sds->busiest_stat = *sgs;
6927 		}
6928 
6929 next_group:
6930 		/* Now, start updating sd_lb_stats */
6931 		sds->total_load += sgs->group_load;
6932 		sds->total_capacity += sgs->group_capacity;
6933 
6934 		sg = sg->next;
6935 	} while (sg != env->sd->groups);
6936 
6937 	if (env->sd->flags & SD_NUMA)
6938 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
6939 
6940 	if (!env->sd->parent) {
6941 		/* update overload indicator if we are at root domain */
6942 		if (env->dst_rq->rd->overload != overload)
6943 			env->dst_rq->rd->overload = overload;
6944 	}
6945 
6946 }
6947 
6948 /**
6949  * check_asym_packing - Check to see if the group is packed into the
6950  *			sched doman.
6951  *
6952  * This is primarily intended to used at the sibling level.  Some
6953  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
6954  * case of POWER7, it can move to lower SMT modes only when higher
6955  * threads are idle.  When in lower SMT modes, the threads will
6956  * perform better since they share less core resources.  Hence when we
6957  * have idle threads, we want them to be the higher ones.
6958  *
6959  * This packing function is run on idle threads.  It checks to see if
6960  * the busiest CPU in this domain (core in the P7 case) has a higher
6961  * CPU number than the packing function is being run on.  Here we are
6962  * assuming lower CPU number will be equivalent to lower a SMT thread
6963  * number.
6964  *
6965  * Return: 1 when packing is required and a task should be moved to
6966  * this CPU.  The amount of the imbalance is returned in *imbalance.
6967  *
6968  * @env: The load balancing environment.
6969  * @sds: Statistics of the sched_domain which is to be packed
6970  */
6971 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
6972 {
6973 	int busiest_cpu;
6974 
6975 	if (!(env->sd->flags & SD_ASYM_PACKING))
6976 		return 0;
6977 
6978 	if (env->idle == CPU_NOT_IDLE)
6979 		return 0;
6980 
6981 	if (!sds->busiest)
6982 		return 0;
6983 
6984 	busiest_cpu = group_first_cpu(sds->busiest);
6985 	if (env->dst_cpu > busiest_cpu)
6986 		return 0;
6987 
6988 	env->imbalance = DIV_ROUND_CLOSEST(
6989 		sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
6990 		SCHED_CAPACITY_SCALE);
6991 
6992 	return 1;
6993 }
6994 
6995 /**
6996  * fix_small_imbalance - Calculate the minor imbalance that exists
6997  *			amongst the groups of a sched_domain, during
6998  *			load balancing.
6999  * @env: The load balancing environment.
7000  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
7001  */
7002 static inline
7003 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
7004 {
7005 	unsigned long tmp, capa_now = 0, capa_move = 0;
7006 	unsigned int imbn = 2;
7007 	unsigned long scaled_busy_load_per_task;
7008 	struct sg_lb_stats *local, *busiest;
7009 
7010 	local = &sds->local_stat;
7011 	busiest = &sds->busiest_stat;
7012 
7013 	if (!local->sum_nr_running)
7014 		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
7015 	else if (busiest->load_per_task > local->load_per_task)
7016 		imbn = 1;
7017 
7018 	scaled_busy_load_per_task =
7019 		(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
7020 		busiest->group_capacity;
7021 
7022 	if (busiest->avg_load + scaled_busy_load_per_task >=
7023 	    local->avg_load + (scaled_busy_load_per_task * imbn)) {
7024 		env->imbalance = busiest->load_per_task;
7025 		return;
7026 	}
7027 
7028 	/*
7029 	 * OK, we don't have enough imbalance to justify moving tasks,
7030 	 * however we may be able to increase total CPU capacity used by
7031 	 * moving them.
7032 	 */
7033 
7034 	capa_now += busiest->group_capacity *
7035 			min(busiest->load_per_task, busiest->avg_load);
7036 	capa_now += local->group_capacity *
7037 			min(local->load_per_task, local->avg_load);
7038 	capa_now /= SCHED_CAPACITY_SCALE;
7039 
7040 	/* Amount of load we'd subtract */
7041 	if (busiest->avg_load > scaled_busy_load_per_task) {
7042 		capa_move += busiest->group_capacity *
7043 			    min(busiest->load_per_task,
7044 				busiest->avg_load - scaled_busy_load_per_task);
7045 	}
7046 
7047 	/* Amount of load we'd add */
7048 	if (busiest->avg_load * busiest->group_capacity <
7049 	    busiest->load_per_task * SCHED_CAPACITY_SCALE) {
7050 		tmp = (busiest->avg_load * busiest->group_capacity) /
7051 		      local->group_capacity;
7052 	} else {
7053 		tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
7054 		      local->group_capacity;
7055 	}
7056 	capa_move += local->group_capacity *
7057 		    min(local->load_per_task, local->avg_load + tmp);
7058 	capa_move /= SCHED_CAPACITY_SCALE;
7059 
7060 	/* Move if we gain throughput */
7061 	if (capa_move > capa_now)
7062 		env->imbalance = busiest->load_per_task;
7063 }
7064 
7065 /**
7066  * calculate_imbalance - Calculate the amount of imbalance present within the
7067  *			 groups of a given sched_domain during load balance.
7068  * @env: load balance environment
7069  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
7070  */
7071 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
7072 {
7073 	unsigned long max_pull, load_above_capacity = ~0UL;
7074 	struct sg_lb_stats *local, *busiest;
7075 
7076 	local = &sds->local_stat;
7077 	busiest = &sds->busiest_stat;
7078 
7079 	if (busiest->group_type == group_imbalanced) {
7080 		/*
7081 		 * In the group_imb case we cannot rely on group-wide averages
7082 		 * to ensure cpu-load equilibrium, look at wider averages. XXX
7083 		 */
7084 		busiest->load_per_task =
7085 			min(busiest->load_per_task, sds->avg_load);
7086 	}
7087 
7088 	/*
7089 	 * Avg load of busiest sg can be less and avg load of local sg can
7090 	 * be greater than avg load across all sgs of sd because avg load
7091 	 * factors in sg capacity and sgs with smaller group_type are
7092 	 * skipped when updating the busiest sg:
7093 	 */
7094 	if (busiest->avg_load <= sds->avg_load ||
7095 	    local->avg_load >= sds->avg_load) {
7096 		env->imbalance = 0;
7097 		return fix_small_imbalance(env, sds);
7098 	}
7099 
7100 	/*
7101 	 * If there aren't any idle cpus, avoid creating some.
7102 	 */
7103 	if (busiest->group_type == group_overloaded &&
7104 	    local->group_type   == group_overloaded) {
7105 		load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
7106 		if (load_above_capacity > busiest->group_capacity) {
7107 			load_above_capacity -= busiest->group_capacity;
7108 			load_above_capacity *= NICE_0_LOAD;
7109 			load_above_capacity /= busiest->group_capacity;
7110 		} else
7111 			load_above_capacity = ~0UL;
7112 	}
7113 
7114 	/*
7115 	 * We're trying to get all the cpus to the average_load, so we don't
7116 	 * want to push ourselves above the average load, nor do we wish to
7117 	 * reduce the max loaded cpu below the average load. At the same time,
7118 	 * we also don't want to reduce the group load below the group
7119 	 * capacity. Thus we look for the minimum possible imbalance.
7120 	 */
7121 	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
7122 
7123 	/* How much load to actually move to equalise the imbalance */
7124 	env->imbalance = min(
7125 		max_pull * busiest->group_capacity,
7126 		(sds->avg_load - local->avg_load) * local->group_capacity
7127 	) / SCHED_CAPACITY_SCALE;
7128 
7129 	/*
7130 	 * if *imbalance is less than the average load per runnable task
7131 	 * there is no guarantee that any tasks will be moved so we'll have
7132 	 * a think about bumping its value to force at least one task to be
7133 	 * moved
7134 	 */
7135 	if (env->imbalance < busiest->load_per_task)
7136 		return fix_small_imbalance(env, sds);
7137 }
7138 
7139 /******* find_busiest_group() helpers end here *********************/
7140 
7141 /**
7142  * find_busiest_group - Returns the busiest group within the sched_domain
7143  * if there is an imbalance.
7144  *
7145  * Also calculates the amount of weighted load which should be moved
7146  * to restore balance.
7147  *
7148  * @env: The load balancing environment.
7149  *
7150  * Return:	- The busiest group if imbalance exists.
7151  */
7152 static struct sched_group *find_busiest_group(struct lb_env *env)
7153 {
7154 	struct sg_lb_stats *local, *busiest;
7155 	struct sd_lb_stats sds;
7156 
7157 	init_sd_lb_stats(&sds);
7158 
7159 	/*
7160 	 * Compute the various statistics relavent for load balancing at
7161 	 * this level.
7162 	 */
7163 	update_sd_lb_stats(env, &sds);
7164 	local = &sds.local_stat;
7165 	busiest = &sds.busiest_stat;
7166 
7167 	/* ASYM feature bypasses nice load balance check */
7168 	if (check_asym_packing(env, &sds))
7169 		return sds.busiest;
7170 
7171 	/* There is no busy sibling group to pull tasks from */
7172 	if (!sds.busiest || busiest->sum_nr_running == 0)
7173 		goto out_balanced;
7174 
7175 	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
7176 						/ sds.total_capacity;
7177 
7178 	/*
7179 	 * If the busiest group is imbalanced the below checks don't
7180 	 * work because they assume all things are equal, which typically
7181 	 * isn't true due to cpus_allowed constraints and the like.
7182 	 */
7183 	if (busiest->group_type == group_imbalanced)
7184 		goto force_balance;
7185 
7186 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
7187 	if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
7188 	    busiest->group_no_capacity)
7189 		goto force_balance;
7190 
7191 	/*
7192 	 * If the local group is busier than the selected busiest group
7193 	 * don't try and pull any tasks.
7194 	 */
7195 	if (local->avg_load >= busiest->avg_load)
7196 		goto out_balanced;
7197 
7198 	/*
7199 	 * Don't pull any tasks if this group is already above the domain
7200 	 * average load.
7201 	 */
7202 	if (local->avg_load >= sds.avg_load)
7203 		goto out_balanced;
7204 
7205 	if (env->idle == CPU_IDLE) {
7206 		/*
7207 		 * This cpu is idle. If the busiest group is not overloaded
7208 		 * and there is no imbalance between this and busiest group
7209 		 * wrt idle cpus, it is balanced. The imbalance becomes
7210 		 * significant if the diff is greater than 1 otherwise we
7211 		 * might end up to just move the imbalance on another group
7212 		 */
7213 		if ((busiest->group_type != group_overloaded) &&
7214 				(local->idle_cpus <= (busiest->idle_cpus + 1)))
7215 			goto out_balanced;
7216 	} else {
7217 		/*
7218 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
7219 		 * imbalance_pct to be conservative.
7220 		 */
7221 		if (100 * busiest->avg_load <=
7222 				env->sd->imbalance_pct * local->avg_load)
7223 			goto out_balanced;
7224 	}
7225 
7226 force_balance:
7227 	/* Looks like there is an imbalance. Compute it */
7228 	calculate_imbalance(env, &sds);
7229 	return sds.busiest;
7230 
7231 out_balanced:
7232 	env->imbalance = 0;
7233 	return NULL;
7234 }
7235 
7236 /*
7237  * find_busiest_queue - find the busiest runqueue among the cpus in group.
7238  */
7239 static struct rq *find_busiest_queue(struct lb_env *env,
7240 				     struct sched_group *group)
7241 {
7242 	struct rq *busiest = NULL, *rq;
7243 	unsigned long busiest_load = 0, busiest_capacity = 1;
7244 	int i;
7245 
7246 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
7247 		unsigned long capacity, wl;
7248 		enum fbq_type rt;
7249 
7250 		rq = cpu_rq(i);
7251 		rt = fbq_classify_rq(rq);
7252 
7253 		/*
7254 		 * We classify groups/runqueues into three groups:
7255 		 *  - regular: there are !numa tasks
7256 		 *  - remote:  there are numa tasks that run on the 'wrong' node
7257 		 *  - all:     there is no distinction
7258 		 *
7259 		 * In order to avoid migrating ideally placed numa tasks,
7260 		 * ignore those when there's better options.
7261 		 *
7262 		 * If we ignore the actual busiest queue to migrate another
7263 		 * task, the next balance pass can still reduce the busiest
7264 		 * queue by moving tasks around inside the node.
7265 		 *
7266 		 * If we cannot move enough load due to this classification
7267 		 * the next pass will adjust the group classification and
7268 		 * allow migration of more tasks.
7269 		 *
7270 		 * Both cases only affect the total convergence complexity.
7271 		 */
7272 		if (rt > env->fbq_type)
7273 			continue;
7274 
7275 		capacity = capacity_of(i);
7276 
7277 		wl = weighted_cpuload(i);
7278 
7279 		/*
7280 		 * When comparing with imbalance, use weighted_cpuload()
7281 		 * which is not scaled with the cpu capacity.
7282 		 */
7283 
7284 		if (rq->nr_running == 1 && wl > env->imbalance &&
7285 		    !check_cpu_capacity(rq, env->sd))
7286 			continue;
7287 
7288 		/*
7289 		 * For the load comparisons with the other cpu's, consider
7290 		 * the weighted_cpuload() scaled with the cpu capacity, so
7291 		 * that the load can be moved away from the cpu that is
7292 		 * potentially running at a lower capacity.
7293 		 *
7294 		 * Thus we're looking for max(wl_i / capacity_i), crosswise
7295 		 * multiplication to rid ourselves of the division works out
7296 		 * to: wl_i * capacity_j > wl_j * capacity_i;  where j is
7297 		 * our previous maximum.
7298 		 */
7299 		if (wl * busiest_capacity > busiest_load * capacity) {
7300 			busiest_load = wl;
7301 			busiest_capacity = capacity;
7302 			busiest = rq;
7303 		}
7304 	}
7305 
7306 	return busiest;
7307 }
7308 
7309 /*
7310  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
7311  * so long as it is large enough.
7312  */
7313 #define MAX_PINNED_INTERVAL	512
7314 
7315 /* Working cpumask for load_balance and load_balance_newidle. */
7316 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
7317 
7318 static int need_active_balance(struct lb_env *env)
7319 {
7320 	struct sched_domain *sd = env->sd;
7321 
7322 	if (env->idle == CPU_NEWLY_IDLE) {
7323 
7324 		/*
7325 		 * ASYM_PACKING needs to force migrate tasks from busy but
7326 		 * higher numbered CPUs in order to pack all tasks in the
7327 		 * lowest numbered CPUs.
7328 		 */
7329 		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
7330 			return 1;
7331 	}
7332 
7333 	/*
7334 	 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
7335 	 * It's worth migrating the task if the src_cpu's capacity is reduced
7336 	 * because of other sched_class or IRQs if more capacity stays
7337 	 * available on dst_cpu.
7338 	 */
7339 	if ((env->idle != CPU_NOT_IDLE) &&
7340 	    (env->src_rq->cfs.h_nr_running == 1)) {
7341 		if ((check_cpu_capacity(env->src_rq, sd)) &&
7342 		    (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
7343 			return 1;
7344 	}
7345 
7346 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
7347 }
7348 
7349 static int active_load_balance_cpu_stop(void *data);
7350 
7351 static int should_we_balance(struct lb_env *env)
7352 {
7353 	struct sched_group *sg = env->sd->groups;
7354 	struct cpumask *sg_cpus, *sg_mask;
7355 	int cpu, balance_cpu = -1;
7356 
7357 	/*
7358 	 * In the newly idle case, we will allow all the cpu's
7359 	 * to do the newly idle load balance.
7360 	 */
7361 	if (env->idle == CPU_NEWLY_IDLE)
7362 		return 1;
7363 
7364 	sg_cpus = sched_group_cpus(sg);
7365 	sg_mask = sched_group_mask(sg);
7366 	/* Try to find first idle cpu */
7367 	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
7368 		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
7369 			continue;
7370 
7371 		balance_cpu = cpu;
7372 		break;
7373 	}
7374 
7375 	if (balance_cpu == -1)
7376 		balance_cpu = group_balance_cpu(sg);
7377 
7378 	/*
7379 	 * First idle cpu or the first cpu(busiest) in this sched group
7380 	 * is eligible for doing load balancing at this and above domains.
7381 	 */
7382 	return balance_cpu == env->dst_cpu;
7383 }
7384 
7385 /*
7386  * Check this_cpu to ensure it is balanced within domain. Attempt to move
7387  * tasks if there is an imbalance.
7388  */
7389 static int load_balance(int this_cpu, struct rq *this_rq,
7390 			struct sched_domain *sd, enum cpu_idle_type idle,
7391 			int *continue_balancing)
7392 {
7393 	int ld_moved, cur_ld_moved, active_balance = 0;
7394 	struct sched_domain *sd_parent = sd->parent;
7395 	struct sched_group *group;
7396 	struct rq *busiest;
7397 	unsigned long flags;
7398 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
7399 
7400 	struct lb_env env = {
7401 		.sd		= sd,
7402 		.dst_cpu	= this_cpu,
7403 		.dst_rq		= this_rq,
7404 		.dst_grpmask    = sched_group_cpus(sd->groups),
7405 		.idle		= idle,
7406 		.loop_break	= sched_nr_migrate_break,
7407 		.cpus		= cpus,
7408 		.fbq_type	= all,
7409 		.tasks		= LIST_HEAD_INIT(env.tasks),
7410 	};
7411 
7412 	/*
7413 	 * For NEWLY_IDLE load_balancing, we don't need to consider
7414 	 * other cpus in our group
7415 	 */
7416 	if (idle == CPU_NEWLY_IDLE)
7417 		env.dst_grpmask = NULL;
7418 
7419 	cpumask_copy(cpus, cpu_active_mask);
7420 
7421 	schedstat_inc(sd, lb_count[idle]);
7422 
7423 redo:
7424 	if (!should_we_balance(&env)) {
7425 		*continue_balancing = 0;
7426 		goto out_balanced;
7427 	}
7428 
7429 	group = find_busiest_group(&env);
7430 	if (!group) {
7431 		schedstat_inc(sd, lb_nobusyg[idle]);
7432 		goto out_balanced;
7433 	}
7434 
7435 	busiest = find_busiest_queue(&env, group);
7436 	if (!busiest) {
7437 		schedstat_inc(sd, lb_nobusyq[idle]);
7438 		goto out_balanced;
7439 	}
7440 
7441 	BUG_ON(busiest == env.dst_rq);
7442 
7443 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
7444 
7445 	env.src_cpu = busiest->cpu;
7446 	env.src_rq = busiest;
7447 
7448 	ld_moved = 0;
7449 	if (busiest->nr_running > 1) {
7450 		/*
7451 		 * Attempt to move tasks. If find_busiest_group has found
7452 		 * an imbalance but busiest->nr_running <= 1, the group is
7453 		 * still unbalanced. ld_moved simply stays zero, so it is
7454 		 * correctly treated as an imbalance.
7455 		 */
7456 		env.flags |= LBF_ALL_PINNED;
7457 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
7458 
7459 more_balance:
7460 		raw_spin_lock_irqsave(&busiest->lock, flags);
7461 
7462 		/*
7463 		 * cur_ld_moved - load moved in current iteration
7464 		 * ld_moved     - cumulative load moved across iterations
7465 		 */
7466 		cur_ld_moved = detach_tasks(&env);
7467 
7468 		/*
7469 		 * We've detached some tasks from busiest_rq. Every
7470 		 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
7471 		 * unlock busiest->lock, and we are able to be sure
7472 		 * that nobody can manipulate the tasks in parallel.
7473 		 * See task_rq_lock() family for the details.
7474 		 */
7475 
7476 		raw_spin_unlock(&busiest->lock);
7477 
7478 		if (cur_ld_moved) {
7479 			attach_tasks(&env);
7480 			ld_moved += cur_ld_moved;
7481 		}
7482 
7483 		local_irq_restore(flags);
7484 
7485 		if (env.flags & LBF_NEED_BREAK) {
7486 			env.flags &= ~LBF_NEED_BREAK;
7487 			goto more_balance;
7488 		}
7489 
7490 		/*
7491 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
7492 		 * us and move them to an alternate dst_cpu in our sched_group
7493 		 * where they can run. The upper limit on how many times we
7494 		 * iterate on same src_cpu is dependent on number of cpus in our
7495 		 * sched_group.
7496 		 *
7497 		 * This changes load balance semantics a bit on who can move
7498 		 * load to a given_cpu. In addition to the given_cpu itself
7499 		 * (or a ilb_cpu acting on its behalf where given_cpu is
7500 		 * nohz-idle), we now have balance_cpu in a position to move
7501 		 * load to given_cpu. In rare situations, this may cause
7502 		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
7503 		 * _independently_ and at _same_ time to move some load to
7504 		 * given_cpu) causing exceess load to be moved to given_cpu.
7505 		 * This however should not happen so much in practice and
7506 		 * moreover subsequent load balance cycles should correct the
7507 		 * excess load moved.
7508 		 */
7509 		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
7510 
7511 			/* Prevent to re-select dst_cpu via env's cpus */
7512 			cpumask_clear_cpu(env.dst_cpu, env.cpus);
7513 
7514 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
7515 			env.dst_cpu	 = env.new_dst_cpu;
7516 			env.flags	&= ~LBF_DST_PINNED;
7517 			env.loop	 = 0;
7518 			env.loop_break	 = sched_nr_migrate_break;
7519 
7520 			/*
7521 			 * Go back to "more_balance" rather than "redo" since we
7522 			 * need to continue with same src_cpu.
7523 			 */
7524 			goto more_balance;
7525 		}
7526 
7527 		/*
7528 		 * We failed to reach balance because of affinity.
7529 		 */
7530 		if (sd_parent) {
7531 			int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7532 
7533 			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
7534 				*group_imbalance = 1;
7535 		}
7536 
7537 		/* All tasks on this runqueue were pinned by CPU affinity */
7538 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
7539 			cpumask_clear_cpu(cpu_of(busiest), cpus);
7540 			if (!cpumask_empty(cpus)) {
7541 				env.loop = 0;
7542 				env.loop_break = sched_nr_migrate_break;
7543 				goto redo;
7544 			}
7545 			goto out_all_pinned;
7546 		}
7547 	}
7548 
7549 	if (!ld_moved) {
7550 		schedstat_inc(sd, lb_failed[idle]);
7551 		/*
7552 		 * Increment the failure counter only on periodic balance.
7553 		 * We do not want newidle balance, which can be very
7554 		 * frequent, pollute the failure counter causing
7555 		 * excessive cache_hot migrations and active balances.
7556 		 */
7557 		if (idle != CPU_NEWLY_IDLE)
7558 			sd->nr_balance_failed++;
7559 
7560 		if (need_active_balance(&env)) {
7561 			raw_spin_lock_irqsave(&busiest->lock, flags);
7562 
7563 			/* don't kick the active_load_balance_cpu_stop,
7564 			 * if the curr task on busiest cpu can't be
7565 			 * moved to this_cpu
7566 			 */
7567 			if (!cpumask_test_cpu(this_cpu,
7568 					tsk_cpus_allowed(busiest->curr))) {
7569 				raw_spin_unlock_irqrestore(&busiest->lock,
7570 							    flags);
7571 				env.flags |= LBF_ALL_PINNED;
7572 				goto out_one_pinned;
7573 			}
7574 
7575 			/*
7576 			 * ->active_balance synchronizes accesses to
7577 			 * ->active_balance_work.  Once set, it's cleared
7578 			 * only after active load balance is finished.
7579 			 */
7580 			if (!busiest->active_balance) {
7581 				busiest->active_balance = 1;
7582 				busiest->push_cpu = this_cpu;
7583 				active_balance = 1;
7584 			}
7585 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
7586 
7587 			if (active_balance) {
7588 				stop_one_cpu_nowait(cpu_of(busiest),
7589 					active_load_balance_cpu_stop, busiest,
7590 					&busiest->active_balance_work);
7591 			}
7592 
7593 			/* We've kicked active balancing, force task migration. */
7594 			sd->nr_balance_failed = sd->cache_nice_tries+1;
7595 		}
7596 	} else
7597 		sd->nr_balance_failed = 0;
7598 
7599 	if (likely(!active_balance)) {
7600 		/* We were unbalanced, so reset the balancing interval */
7601 		sd->balance_interval = sd->min_interval;
7602 	} else {
7603 		/*
7604 		 * If we've begun active balancing, start to back off. This
7605 		 * case may not be covered by the all_pinned logic if there
7606 		 * is only 1 task on the busy runqueue (because we don't call
7607 		 * detach_tasks).
7608 		 */
7609 		if (sd->balance_interval < sd->max_interval)
7610 			sd->balance_interval *= 2;
7611 	}
7612 
7613 	goto out;
7614 
7615 out_balanced:
7616 	/*
7617 	 * We reach balance although we may have faced some affinity
7618 	 * constraints. Clear the imbalance flag if it was set.
7619 	 */
7620 	if (sd_parent) {
7621 		int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7622 
7623 		if (*group_imbalance)
7624 			*group_imbalance = 0;
7625 	}
7626 
7627 out_all_pinned:
7628 	/*
7629 	 * We reach balance because all tasks are pinned at this level so
7630 	 * we can't migrate them. Let the imbalance flag set so parent level
7631 	 * can try to migrate them.
7632 	 */
7633 	schedstat_inc(sd, lb_balanced[idle]);
7634 
7635 	sd->nr_balance_failed = 0;
7636 
7637 out_one_pinned:
7638 	/* tune up the balancing interval */
7639 	if (((env.flags & LBF_ALL_PINNED) &&
7640 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
7641 			(sd->balance_interval < sd->max_interval))
7642 		sd->balance_interval *= 2;
7643 
7644 	ld_moved = 0;
7645 out:
7646 	return ld_moved;
7647 }
7648 
7649 static inline unsigned long
7650 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
7651 {
7652 	unsigned long interval = sd->balance_interval;
7653 
7654 	if (cpu_busy)
7655 		interval *= sd->busy_factor;
7656 
7657 	/* scale ms to jiffies */
7658 	interval = msecs_to_jiffies(interval);
7659 	interval = clamp(interval, 1UL, max_load_balance_interval);
7660 
7661 	return interval;
7662 }
7663 
7664 static inline void
7665 update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
7666 {
7667 	unsigned long interval, next;
7668 
7669 	interval = get_sd_balance_interval(sd, cpu_busy);
7670 	next = sd->last_balance + interval;
7671 
7672 	if (time_after(*next_balance, next))
7673 		*next_balance = next;
7674 }
7675 
7676 /*
7677  * idle_balance is called by schedule() if this_cpu is about to become
7678  * idle. Attempts to pull tasks from other CPUs.
7679  */
7680 static int idle_balance(struct rq *this_rq)
7681 {
7682 	unsigned long next_balance = jiffies + HZ;
7683 	int this_cpu = this_rq->cpu;
7684 	struct sched_domain *sd;
7685 	int pulled_task = 0;
7686 	u64 curr_cost = 0;
7687 
7688 	/*
7689 	 * We must set idle_stamp _before_ calling idle_balance(), such that we
7690 	 * measure the duration of idle_balance() as idle time.
7691 	 */
7692 	this_rq->idle_stamp = rq_clock(this_rq);
7693 
7694 	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
7695 	    !this_rq->rd->overload) {
7696 		rcu_read_lock();
7697 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
7698 		if (sd)
7699 			update_next_balance(sd, 0, &next_balance);
7700 		rcu_read_unlock();
7701 
7702 		goto out;
7703 	}
7704 
7705 	raw_spin_unlock(&this_rq->lock);
7706 
7707 	update_blocked_averages(this_cpu);
7708 	rcu_read_lock();
7709 	for_each_domain(this_cpu, sd) {
7710 		int continue_balancing = 1;
7711 		u64 t0, domain_cost;
7712 
7713 		if (!(sd->flags & SD_LOAD_BALANCE))
7714 			continue;
7715 
7716 		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
7717 			update_next_balance(sd, 0, &next_balance);
7718 			break;
7719 		}
7720 
7721 		if (sd->flags & SD_BALANCE_NEWIDLE) {
7722 			t0 = sched_clock_cpu(this_cpu);
7723 
7724 			pulled_task = load_balance(this_cpu, this_rq,
7725 						   sd, CPU_NEWLY_IDLE,
7726 						   &continue_balancing);
7727 
7728 			domain_cost = sched_clock_cpu(this_cpu) - t0;
7729 			if (domain_cost > sd->max_newidle_lb_cost)
7730 				sd->max_newidle_lb_cost = domain_cost;
7731 
7732 			curr_cost += domain_cost;
7733 		}
7734 
7735 		update_next_balance(sd, 0, &next_balance);
7736 
7737 		/*
7738 		 * Stop searching for tasks to pull if there are
7739 		 * now runnable tasks on this rq.
7740 		 */
7741 		if (pulled_task || this_rq->nr_running > 0)
7742 			break;
7743 	}
7744 	rcu_read_unlock();
7745 
7746 	raw_spin_lock(&this_rq->lock);
7747 
7748 	if (curr_cost > this_rq->max_idle_balance_cost)
7749 		this_rq->max_idle_balance_cost = curr_cost;
7750 
7751 	/*
7752 	 * While browsing the domains, we released the rq lock, a task could
7753 	 * have been enqueued in the meantime. Since we're not going idle,
7754 	 * pretend we pulled a task.
7755 	 */
7756 	if (this_rq->cfs.h_nr_running && !pulled_task)
7757 		pulled_task = 1;
7758 
7759 out:
7760 	/* Move the next balance forward */
7761 	if (time_after(this_rq->next_balance, next_balance))
7762 		this_rq->next_balance = next_balance;
7763 
7764 	/* Is there a task of a high priority class? */
7765 	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
7766 		pulled_task = -1;
7767 
7768 	if (pulled_task)
7769 		this_rq->idle_stamp = 0;
7770 
7771 	return pulled_task;
7772 }
7773 
7774 /*
7775  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
7776  * running tasks off the busiest CPU onto idle CPUs. It requires at
7777  * least 1 task to be running on each physical CPU where possible, and
7778  * avoids physical / logical imbalances.
7779  */
7780 static int active_load_balance_cpu_stop(void *data)
7781 {
7782 	struct rq *busiest_rq = data;
7783 	int busiest_cpu = cpu_of(busiest_rq);
7784 	int target_cpu = busiest_rq->push_cpu;
7785 	struct rq *target_rq = cpu_rq(target_cpu);
7786 	struct sched_domain *sd;
7787 	struct task_struct *p = NULL;
7788 
7789 	raw_spin_lock_irq(&busiest_rq->lock);
7790 
7791 	/* make sure the requested cpu hasn't gone down in the meantime */
7792 	if (unlikely(busiest_cpu != smp_processor_id() ||
7793 		     !busiest_rq->active_balance))
7794 		goto out_unlock;
7795 
7796 	/* Is there any task to move? */
7797 	if (busiest_rq->nr_running <= 1)
7798 		goto out_unlock;
7799 
7800 	/*
7801 	 * This condition is "impossible", if it occurs
7802 	 * we need to fix it. Originally reported by
7803 	 * Bjorn Helgaas on a 128-cpu setup.
7804 	 */
7805 	BUG_ON(busiest_rq == target_rq);
7806 
7807 	/* Search for an sd spanning us and the target CPU. */
7808 	rcu_read_lock();
7809 	for_each_domain(target_cpu, sd) {
7810 		if ((sd->flags & SD_LOAD_BALANCE) &&
7811 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
7812 				break;
7813 	}
7814 
7815 	if (likely(sd)) {
7816 		struct lb_env env = {
7817 			.sd		= sd,
7818 			.dst_cpu	= target_cpu,
7819 			.dst_rq		= target_rq,
7820 			.src_cpu	= busiest_rq->cpu,
7821 			.src_rq		= busiest_rq,
7822 			.idle		= CPU_IDLE,
7823 		};
7824 
7825 		schedstat_inc(sd, alb_count);
7826 
7827 		p = detach_one_task(&env);
7828 		if (p) {
7829 			schedstat_inc(sd, alb_pushed);
7830 			/* Active balancing done, reset the failure counter. */
7831 			sd->nr_balance_failed = 0;
7832 		} else {
7833 			schedstat_inc(sd, alb_failed);
7834 		}
7835 	}
7836 	rcu_read_unlock();
7837 out_unlock:
7838 	busiest_rq->active_balance = 0;
7839 	raw_spin_unlock(&busiest_rq->lock);
7840 
7841 	if (p)
7842 		attach_one_task(target_rq, p);
7843 
7844 	local_irq_enable();
7845 
7846 	return 0;
7847 }
7848 
7849 static inline int on_null_domain(struct rq *rq)
7850 {
7851 	return unlikely(!rcu_dereference_sched(rq->sd));
7852 }
7853 
7854 #ifdef CONFIG_NO_HZ_COMMON
7855 /*
7856  * idle load balancing details
7857  * - When one of the busy CPUs notice that there may be an idle rebalancing
7858  *   needed, they will kick the idle load balancer, which then does idle
7859  *   load balancing for all the idle CPUs.
7860  */
7861 static struct {
7862 	cpumask_var_t idle_cpus_mask;
7863 	atomic_t nr_cpus;
7864 	unsigned long next_balance;     /* in jiffy units */
7865 } nohz ____cacheline_aligned;
7866 
7867 static inline int find_new_ilb(void)
7868 {
7869 	int ilb = cpumask_first(nohz.idle_cpus_mask);
7870 
7871 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
7872 		return ilb;
7873 
7874 	return nr_cpu_ids;
7875 }
7876 
7877 /*
7878  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
7879  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
7880  * CPU (if there is one).
7881  */
7882 static void nohz_balancer_kick(void)
7883 {
7884 	int ilb_cpu;
7885 
7886 	nohz.next_balance++;
7887 
7888 	ilb_cpu = find_new_ilb();
7889 
7890 	if (ilb_cpu >= nr_cpu_ids)
7891 		return;
7892 
7893 	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
7894 		return;
7895 	/*
7896 	 * Use smp_send_reschedule() instead of resched_cpu().
7897 	 * This way we generate a sched IPI on the target cpu which
7898 	 * is idle. And the softirq performing nohz idle load balance
7899 	 * will be run before returning from the IPI.
7900 	 */
7901 	smp_send_reschedule(ilb_cpu);
7902 	return;
7903 }
7904 
7905 void nohz_balance_exit_idle(unsigned int cpu)
7906 {
7907 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
7908 		/*
7909 		 * Completely isolated CPUs don't ever set, so we must test.
7910 		 */
7911 		if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
7912 			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
7913 			atomic_dec(&nohz.nr_cpus);
7914 		}
7915 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7916 	}
7917 }
7918 
7919 static inline void set_cpu_sd_state_busy(void)
7920 {
7921 	struct sched_domain *sd;
7922 	int cpu = smp_processor_id();
7923 
7924 	rcu_read_lock();
7925 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7926 
7927 	if (!sd || !sd->nohz_idle)
7928 		goto unlock;
7929 	sd->nohz_idle = 0;
7930 
7931 	atomic_inc(&sd->groups->sgc->nr_busy_cpus);
7932 unlock:
7933 	rcu_read_unlock();
7934 }
7935 
7936 void set_cpu_sd_state_idle(void)
7937 {
7938 	struct sched_domain *sd;
7939 	int cpu = smp_processor_id();
7940 
7941 	rcu_read_lock();
7942 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7943 
7944 	if (!sd || sd->nohz_idle)
7945 		goto unlock;
7946 	sd->nohz_idle = 1;
7947 
7948 	atomic_dec(&sd->groups->sgc->nr_busy_cpus);
7949 unlock:
7950 	rcu_read_unlock();
7951 }
7952 
7953 /*
7954  * This routine will record that the cpu is going idle with tick stopped.
7955  * This info will be used in performing idle load balancing in the future.
7956  */
7957 void nohz_balance_enter_idle(int cpu)
7958 {
7959 	/*
7960 	 * If this cpu is going down, then nothing needs to be done.
7961 	 */
7962 	if (!cpu_active(cpu))
7963 		return;
7964 
7965 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
7966 		return;
7967 
7968 	/*
7969 	 * If we're a completely isolated CPU, we don't play.
7970 	 */
7971 	if (on_null_domain(cpu_rq(cpu)))
7972 		return;
7973 
7974 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
7975 	atomic_inc(&nohz.nr_cpus);
7976 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7977 }
7978 #endif
7979 
7980 static DEFINE_SPINLOCK(balancing);
7981 
7982 /*
7983  * Scale the max load_balance interval with the number of CPUs in the system.
7984  * This trades load-balance latency on larger machines for less cross talk.
7985  */
7986 void update_max_interval(void)
7987 {
7988 	max_load_balance_interval = HZ*num_online_cpus()/10;
7989 }
7990 
7991 /*
7992  * It checks each scheduling domain to see if it is due to be balanced,
7993  * and initiates a balancing operation if so.
7994  *
7995  * Balancing parameters are set up in init_sched_domains.
7996  */
7997 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7998 {
7999 	int continue_balancing = 1;
8000 	int cpu = rq->cpu;
8001 	unsigned long interval;
8002 	struct sched_domain *sd;
8003 	/* Earliest time when we have to do rebalance again */
8004 	unsigned long next_balance = jiffies + 60*HZ;
8005 	int update_next_balance = 0;
8006 	int need_serialize, need_decay = 0;
8007 	u64 max_cost = 0;
8008 
8009 	update_blocked_averages(cpu);
8010 
8011 	rcu_read_lock();
8012 	for_each_domain(cpu, sd) {
8013 		/*
8014 		 * Decay the newidle max times here because this is a regular
8015 		 * visit to all the domains. Decay ~1% per second.
8016 		 */
8017 		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
8018 			sd->max_newidle_lb_cost =
8019 				(sd->max_newidle_lb_cost * 253) / 256;
8020 			sd->next_decay_max_lb_cost = jiffies + HZ;
8021 			need_decay = 1;
8022 		}
8023 		max_cost += sd->max_newidle_lb_cost;
8024 
8025 		if (!(sd->flags & SD_LOAD_BALANCE))
8026 			continue;
8027 
8028 		/*
8029 		 * Stop the load balance at this level. There is another
8030 		 * CPU in our sched group which is doing load balancing more
8031 		 * actively.
8032 		 */
8033 		if (!continue_balancing) {
8034 			if (need_decay)
8035 				continue;
8036 			break;
8037 		}
8038 
8039 		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
8040 
8041 		need_serialize = sd->flags & SD_SERIALIZE;
8042 		if (need_serialize) {
8043 			if (!spin_trylock(&balancing))
8044 				goto out;
8045 		}
8046 
8047 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
8048 			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
8049 				/*
8050 				 * The LBF_DST_PINNED logic could have changed
8051 				 * env->dst_cpu, so we can't know our idle
8052 				 * state even if we migrated tasks. Update it.
8053 				 */
8054 				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
8055 			}
8056 			sd->last_balance = jiffies;
8057 			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
8058 		}
8059 		if (need_serialize)
8060 			spin_unlock(&balancing);
8061 out:
8062 		if (time_after(next_balance, sd->last_balance + interval)) {
8063 			next_balance = sd->last_balance + interval;
8064 			update_next_balance = 1;
8065 		}
8066 	}
8067 	if (need_decay) {
8068 		/*
8069 		 * Ensure the rq-wide value also decays but keep it at a
8070 		 * reasonable floor to avoid funnies with rq->avg_idle.
8071 		 */
8072 		rq->max_idle_balance_cost =
8073 			max((u64)sysctl_sched_migration_cost, max_cost);
8074 	}
8075 	rcu_read_unlock();
8076 
8077 	/*
8078 	 * next_balance will be updated only when there is a need.
8079 	 * When the cpu is attached to null domain for ex, it will not be
8080 	 * updated.
8081 	 */
8082 	if (likely(update_next_balance)) {
8083 		rq->next_balance = next_balance;
8084 
8085 #ifdef CONFIG_NO_HZ_COMMON
8086 		/*
8087 		 * If this CPU has been elected to perform the nohz idle
8088 		 * balance. Other idle CPUs have already rebalanced with
8089 		 * nohz_idle_balance() and nohz.next_balance has been
8090 		 * updated accordingly. This CPU is now running the idle load
8091 		 * balance for itself and we need to update the
8092 		 * nohz.next_balance accordingly.
8093 		 */
8094 		if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
8095 			nohz.next_balance = rq->next_balance;
8096 #endif
8097 	}
8098 }
8099 
8100 #ifdef CONFIG_NO_HZ_COMMON
8101 /*
8102  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
8103  * rebalancing for all the cpus for whom scheduler ticks are stopped.
8104  */
8105 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
8106 {
8107 	int this_cpu = this_rq->cpu;
8108 	struct rq *rq;
8109 	int balance_cpu;
8110 	/* Earliest time when we have to do rebalance again */
8111 	unsigned long next_balance = jiffies + 60*HZ;
8112 	int update_next_balance = 0;
8113 
8114 	if (idle != CPU_IDLE ||
8115 	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
8116 		goto end;
8117 
8118 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
8119 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
8120 			continue;
8121 
8122 		/*
8123 		 * If this cpu gets work to do, stop the load balancing
8124 		 * work being done for other cpus. Next load
8125 		 * balancing owner will pick it up.
8126 		 */
8127 		if (need_resched())
8128 			break;
8129 
8130 		rq = cpu_rq(balance_cpu);
8131 
8132 		/*
8133 		 * If time for next balance is due,
8134 		 * do the balance.
8135 		 */
8136 		if (time_after_eq(jiffies, rq->next_balance)) {
8137 			raw_spin_lock_irq(&rq->lock);
8138 			update_rq_clock(rq);
8139 			cpu_load_update_idle(rq);
8140 			raw_spin_unlock_irq(&rq->lock);
8141 			rebalance_domains(rq, CPU_IDLE);
8142 		}
8143 
8144 		if (time_after(next_balance, rq->next_balance)) {
8145 			next_balance = rq->next_balance;
8146 			update_next_balance = 1;
8147 		}
8148 	}
8149 
8150 	/*
8151 	 * next_balance will be updated only when there is a need.
8152 	 * When the CPU is attached to null domain for ex, it will not be
8153 	 * updated.
8154 	 */
8155 	if (likely(update_next_balance))
8156 		nohz.next_balance = next_balance;
8157 end:
8158 	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
8159 }
8160 
8161 /*
8162  * Current heuristic for kicking the idle load balancer in the presence
8163  * of an idle cpu in the system.
8164  *   - This rq has more than one task.
8165  *   - This rq has at least one CFS task and the capacity of the CPU is
8166  *     significantly reduced because of RT tasks or IRQs.
8167  *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
8168  *     multiple busy cpu.
8169  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
8170  *     domain span are idle.
8171  */
8172 static inline bool nohz_kick_needed(struct rq *rq)
8173 {
8174 	unsigned long now = jiffies;
8175 	struct sched_domain *sd;
8176 	struct sched_group_capacity *sgc;
8177 	int nr_busy, cpu = rq->cpu;
8178 	bool kick = false;
8179 
8180 	if (unlikely(rq->idle_balance))
8181 		return false;
8182 
8183        /*
8184 	* We may be recently in ticked or tickless idle mode. At the first
8185 	* busy tick after returning from idle, we will update the busy stats.
8186 	*/
8187 	set_cpu_sd_state_busy();
8188 	nohz_balance_exit_idle(cpu);
8189 
8190 	/*
8191 	 * None are in tickless mode and hence no need for NOHZ idle load
8192 	 * balancing.
8193 	 */
8194 	if (likely(!atomic_read(&nohz.nr_cpus)))
8195 		return false;
8196 
8197 	if (time_before(now, nohz.next_balance))
8198 		return false;
8199 
8200 	if (rq->nr_running >= 2)
8201 		return true;
8202 
8203 	rcu_read_lock();
8204 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
8205 	if (sd) {
8206 		sgc = sd->groups->sgc;
8207 		nr_busy = atomic_read(&sgc->nr_busy_cpus);
8208 
8209 		if (nr_busy > 1) {
8210 			kick = true;
8211 			goto unlock;
8212 		}
8213 
8214 	}
8215 
8216 	sd = rcu_dereference(rq->sd);
8217 	if (sd) {
8218 		if ((rq->cfs.h_nr_running >= 1) &&
8219 				check_cpu_capacity(rq, sd)) {
8220 			kick = true;
8221 			goto unlock;
8222 		}
8223 	}
8224 
8225 	sd = rcu_dereference(per_cpu(sd_asym, cpu));
8226 	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
8227 				  sched_domain_span(sd)) < cpu)) {
8228 		kick = true;
8229 		goto unlock;
8230 	}
8231 
8232 unlock:
8233 	rcu_read_unlock();
8234 	return kick;
8235 }
8236 #else
8237 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
8238 #endif
8239 
8240 /*
8241  * run_rebalance_domains is triggered when needed from the scheduler tick.
8242  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
8243  */
8244 static void run_rebalance_domains(struct softirq_action *h)
8245 {
8246 	struct rq *this_rq = this_rq();
8247 	enum cpu_idle_type idle = this_rq->idle_balance ?
8248 						CPU_IDLE : CPU_NOT_IDLE;
8249 
8250 	/*
8251 	 * If this cpu has a pending nohz_balance_kick, then do the
8252 	 * balancing on behalf of the other idle cpus whose ticks are
8253 	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
8254 	 * give the idle cpus a chance to load balance. Else we may
8255 	 * load balance only within the local sched_domain hierarchy
8256 	 * and abort nohz_idle_balance altogether if we pull some load.
8257 	 */
8258 	nohz_idle_balance(this_rq, idle);
8259 	rebalance_domains(this_rq, idle);
8260 }
8261 
8262 /*
8263  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
8264  */
8265 void trigger_load_balance(struct rq *rq)
8266 {
8267 	/* Don't need to rebalance while attached to NULL domain */
8268 	if (unlikely(on_null_domain(rq)))
8269 		return;
8270 
8271 	if (time_after_eq(jiffies, rq->next_balance))
8272 		raise_softirq(SCHED_SOFTIRQ);
8273 #ifdef CONFIG_NO_HZ_COMMON
8274 	if (nohz_kick_needed(rq))
8275 		nohz_balancer_kick();
8276 #endif
8277 }
8278 
8279 static void rq_online_fair(struct rq *rq)
8280 {
8281 	update_sysctl();
8282 
8283 	update_runtime_enabled(rq);
8284 }
8285 
8286 static void rq_offline_fair(struct rq *rq)
8287 {
8288 	update_sysctl();
8289 
8290 	/* Ensure any throttled groups are reachable by pick_next_task */
8291 	unthrottle_offline_cfs_rqs(rq);
8292 }
8293 
8294 #endif /* CONFIG_SMP */
8295 
8296 /*
8297  * scheduler tick hitting a task of our scheduling class:
8298  */
8299 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
8300 {
8301 	struct cfs_rq *cfs_rq;
8302 	struct sched_entity *se = &curr->se;
8303 
8304 	for_each_sched_entity(se) {
8305 		cfs_rq = cfs_rq_of(se);
8306 		entity_tick(cfs_rq, se, queued);
8307 	}
8308 
8309 	if (static_branch_unlikely(&sched_numa_balancing))
8310 		task_tick_numa(rq, curr);
8311 }
8312 
8313 /*
8314  * called on fork with the child task as argument from the parent's context
8315  *  - child not yet on the tasklist
8316  *  - preemption disabled
8317  */
8318 static void task_fork_fair(struct task_struct *p)
8319 {
8320 	struct cfs_rq *cfs_rq;
8321 	struct sched_entity *se = &p->se, *curr;
8322 	int this_cpu = smp_processor_id();
8323 	struct rq *rq = this_rq();
8324 	unsigned long flags;
8325 
8326 	raw_spin_lock_irqsave(&rq->lock, flags);
8327 
8328 	update_rq_clock(rq);
8329 
8330 	cfs_rq = task_cfs_rq(current);
8331 	curr = cfs_rq->curr;
8332 
8333 	/*
8334 	 * Not only the cpu but also the task_group of the parent might have
8335 	 * been changed after parent->se.parent,cfs_rq were copied to
8336 	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
8337 	 * of child point to valid ones.
8338 	 */
8339 	rcu_read_lock();
8340 	__set_task_cpu(p, this_cpu);
8341 	rcu_read_unlock();
8342 
8343 	update_curr(cfs_rq);
8344 
8345 	if (curr)
8346 		se->vruntime = curr->vruntime;
8347 	place_entity(cfs_rq, se, 1);
8348 
8349 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
8350 		/*
8351 		 * Upon rescheduling, sched_class::put_prev_task() will place
8352 		 * 'current' within the tree based on its new key value.
8353 		 */
8354 		swap(curr->vruntime, se->vruntime);
8355 		resched_curr(rq);
8356 	}
8357 
8358 	se->vruntime -= cfs_rq->min_vruntime;
8359 
8360 	raw_spin_unlock_irqrestore(&rq->lock, flags);
8361 }
8362 
8363 /*
8364  * Priority of the task has changed. Check to see if we preempt
8365  * the current task.
8366  */
8367 static void
8368 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
8369 {
8370 	if (!task_on_rq_queued(p))
8371 		return;
8372 
8373 	/*
8374 	 * Reschedule if we are currently running on this runqueue and
8375 	 * our priority decreased, or if we are not currently running on
8376 	 * this runqueue and our priority is higher than the current's
8377 	 */
8378 	if (rq->curr == p) {
8379 		if (p->prio > oldprio)
8380 			resched_curr(rq);
8381 	} else
8382 		check_preempt_curr(rq, p, 0);
8383 }
8384 
8385 static inline bool vruntime_normalized(struct task_struct *p)
8386 {
8387 	struct sched_entity *se = &p->se;
8388 
8389 	/*
8390 	 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
8391 	 * the dequeue_entity(.flags=0) will already have normalized the
8392 	 * vruntime.
8393 	 */
8394 	if (p->on_rq)
8395 		return true;
8396 
8397 	/*
8398 	 * When !on_rq, vruntime of the task has usually NOT been normalized.
8399 	 * But there are some cases where it has already been normalized:
8400 	 *
8401 	 * - A forked child which is waiting for being woken up by
8402 	 *   wake_up_new_task().
8403 	 * - A task which has been woken up by try_to_wake_up() and
8404 	 *   waiting for actually being woken up by sched_ttwu_pending().
8405 	 */
8406 	if (!se->sum_exec_runtime || p->state == TASK_WAKING)
8407 		return true;
8408 
8409 	return false;
8410 }
8411 
8412 static void detach_task_cfs_rq(struct task_struct *p)
8413 {
8414 	struct sched_entity *se = &p->se;
8415 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
8416 
8417 	if (!vruntime_normalized(p)) {
8418 		/*
8419 		 * Fix up our vruntime so that the current sleep doesn't
8420 		 * cause 'unlimited' sleep bonus.
8421 		 */
8422 		place_entity(cfs_rq, se, 0);
8423 		se->vruntime -= cfs_rq->min_vruntime;
8424 	}
8425 
8426 	/* Catch up with the cfs_rq and remove our load when we leave */
8427 	detach_entity_load_avg(cfs_rq, se);
8428 }
8429 
8430 static void attach_task_cfs_rq(struct task_struct *p)
8431 {
8432 	struct sched_entity *se = &p->se;
8433 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
8434 
8435 #ifdef CONFIG_FAIR_GROUP_SCHED
8436 	/*
8437 	 * Since the real-depth could have been changed (only FAIR
8438 	 * class maintain depth value), reset depth properly.
8439 	 */
8440 	se->depth = se->parent ? se->parent->depth + 1 : 0;
8441 #endif
8442 
8443 	/* Synchronize task with its cfs_rq */
8444 	attach_entity_load_avg(cfs_rq, se);
8445 
8446 	if (!vruntime_normalized(p))
8447 		se->vruntime += cfs_rq->min_vruntime;
8448 }
8449 
8450 static void switched_from_fair(struct rq *rq, struct task_struct *p)
8451 {
8452 	detach_task_cfs_rq(p);
8453 }
8454 
8455 static void switched_to_fair(struct rq *rq, struct task_struct *p)
8456 {
8457 	attach_task_cfs_rq(p);
8458 
8459 	if (task_on_rq_queued(p)) {
8460 		/*
8461 		 * We were most likely switched from sched_rt, so
8462 		 * kick off the schedule if running, otherwise just see
8463 		 * if we can still preempt the current task.
8464 		 */
8465 		if (rq->curr == p)
8466 			resched_curr(rq);
8467 		else
8468 			check_preempt_curr(rq, p, 0);
8469 	}
8470 }
8471 
8472 /* Account for a task changing its policy or group.
8473  *
8474  * This routine is mostly called to set cfs_rq->curr field when a task
8475  * migrates between groups/classes.
8476  */
8477 static void set_curr_task_fair(struct rq *rq)
8478 {
8479 	struct sched_entity *se = &rq->curr->se;
8480 
8481 	for_each_sched_entity(se) {
8482 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
8483 
8484 		set_next_entity(cfs_rq, se);
8485 		/* ensure bandwidth has been allocated on our new cfs_rq */
8486 		account_cfs_rq_runtime(cfs_rq, 0);
8487 	}
8488 }
8489 
8490 void init_cfs_rq(struct cfs_rq *cfs_rq)
8491 {
8492 	cfs_rq->tasks_timeline = RB_ROOT;
8493 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8494 #ifndef CONFIG_64BIT
8495 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8496 #endif
8497 #ifdef CONFIG_SMP
8498 	atomic_long_set(&cfs_rq->removed_load_avg, 0);
8499 	atomic_long_set(&cfs_rq->removed_util_avg, 0);
8500 #endif
8501 }
8502 
8503 #ifdef CONFIG_FAIR_GROUP_SCHED
8504 static void task_move_group_fair(struct task_struct *p)
8505 {
8506 	detach_task_cfs_rq(p);
8507 	set_task_rq(p, task_cpu(p));
8508 
8509 #ifdef CONFIG_SMP
8510 	/* Tell se's cfs_rq has been changed -- migrated */
8511 	p->se.avg.last_update_time = 0;
8512 #endif
8513 	attach_task_cfs_rq(p);
8514 }
8515 
8516 void free_fair_sched_group(struct task_group *tg)
8517 {
8518 	int i;
8519 
8520 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8521 
8522 	for_each_possible_cpu(i) {
8523 		if (tg->cfs_rq)
8524 			kfree(tg->cfs_rq[i]);
8525 		if (tg->se)
8526 			kfree(tg->se[i]);
8527 	}
8528 
8529 	kfree(tg->cfs_rq);
8530 	kfree(tg->se);
8531 }
8532 
8533 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8534 {
8535 	struct sched_entity *se;
8536 	struct cfs_rq *cfs_rq;
8537 	struct rq *rq;
8538 	int i;
8539 
8540 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8541 	if (!tg->cfs_rq)
8542 		goto err;
8543 	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8544 	if (!tg->se)
8545 		goto err;
8546 
8547 	tg->shares = NICE_0_LOAD;
8548 
8549 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8550 
8551 	for_each_possible_cpu(i) {
8552 		rq = cpu_rq(i);
8553 
8554 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8555 				      GFP_KERNEL, cpu_to_node(i));
8556 		if (!cfs_rq)
8557 			goto err;
8558 
8559 		se = kzalloc_node(sizeof(struct sched_entity),
8560 				  GFP_KERNEL, cpu_to_node(i));
8561 		if (!se)
8562 			goto err_free_rq;
8563 
8564 		init_cfs_rq(cfs_rq);
8565 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8566 		init_entity_runnable_average(se);
8567 
8568 		raw_spin_lock_irq(&rq->lock);
8569 		post_init_entity_util_avg(se);
8570 		raw_spin_unlock_irq(&rq->lock);
8571 	}
8572 
8573 	return 1;
8574 
8575 err_free_rq:
8576 	kfree(cfs_rq);
8577 err:
8578 	return 0;
8579 }
8580 
8581 void unregister_fair_sched_group(struct task_group *tg)
8582 {
8583 	unsigned long flags;
8584 	struct rq *rq;
8585 	int cpu;
8586 
8587 	for_each_possible_cpu(cpu) {
8588 		if (tg->se[cpu])
8589 			remove_entity_load_avg(tg->se[cpu]);
8590 
8591 		/*
8592 		 * Only empty task groups can be destroyed; so we can speculatively
8593 		 * check on_list without danger of it being re-added.
8594 		 */
8595 		if (!tg->cfs_rq[cpu]->on_list)
8596 			continue;
8597 
8598 		rq = cpu_rq(cpu);
8599 
8600 		raw_spin_lock_irqsave(&rq->lock, flags);
8601 		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8602 		raw_spin_unlock_irqrestore(&rq->lock, flags);
8603 	}
8604 }
8605 
8606 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8607 			struct sched_entity *se, int cpu,
8608 			struct sched_entity *parent)
8609 {
8610 	struct rq *rq = cpu_rq(cpu);
8611 
8612 	cfs_rq->tg = tg;
8613 	cfs_rq->rq = rq;
8614 	init_cfs_rq_runtime(cfs_rq);
8615 
8616 	tg->cfs_rq[cpu] = cfs_rq;
8617 	tg->se[cpu] = se;
8618 
8619 	/* se could be NULL for root_task_group */
8620 	if (!se)
8621 		return;
8622 
8623 	if (!parent) {
8624 		se->cfs_rq = &rq->cfs;
8625 		se->depth = 0;
8626 	} else {
8627 		se->cfs_rq = parent->my_q;
8628 		se->depth = parent->depth + 1;
8629 	}
8630 
8631 	se->my_q = cfs_rq;
8632 	/* guarantee group entities always have weight */
8633 	update_load_set(&se->load, NICE_0_LOAD);
8634 	se->parent = parent;
8635 }
8636 
8637 static DEFINE_MUTEX(shares_mutex);
8638 
8639 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8640 {
8641 	int i;
8642 	unsigned long flags;
8643 
8644 	/*
8645 	 * We can't change the weight of the root cgroup.
8646 	 */
8647 	if (!tg->se[0])
8648 		return -EINVAL;
8649 
8650 	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8651 
8652 	mutex_lock(&shares_mutex);
8653 	if (tg->shares == shares)
8654 		goto done;
8655 
8656 	tg->shares = shares;
8657 	for_each_possible_cpu(i) {
8658 		struct rq *rq = cpu_rq(i);
8659 		struct sched_entity *se;
8660 
8661 		se = tg->se[i];
8662 		/* Propagate contribution to hierarchy */
8663 		raw_spin_lock_irqsave(&rq->lock, flags);
8664 
8665 		/* Possible calls to update_curr() need rq clock */
8666 		update_rq_clock(rq);
8667 		for_each_sched_entity(se)
8668 			update_cfs_shares(group_cfs_rq(se));
8669 		raw_spin_unlock_irqrestore(&rq->lock, flags);
8670 	}
8671 
8672 done:
8673 	mutex_unlock(&shares_mutex);
8674 	return 0;
8675 }
8676 #else /* CONFIG_FAIR_GROUP_SCHED */
8677 
8678 void free_fair_sched_group(struct task_group *tg) { }
8679 
8680 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8681 {
8682 	return 1;
8683 }
8684 
8685 void unregister_fair_sched_group(struct task_group *tg) { }
8686 
8687 #endif /* CONFIG_FAIR_GROUP_SCHED */
8688 
8689 
8690 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
8691 {
8692 	struct sched_entity *se = &task->se;
8693 	unsigned int rr_interval = 0;
8694 
8695 	/*
8696 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
8697 	 * idle runqueue:
8698 	 */
8699 	if (rq->cfs.load.weight)
8700 		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
8701 
8702 	return rr_interval;
8703 }
8704 
8705 /*
8706  * All the scheduling class methods:
8707  */
8708 const struct sched_class fair_sched_class = {
8709 	.next			= &idle_sched_class,
8710 	.enqueue_task		= enqueue_task_fair,
8711 	.dequeue_task		= dequeue_task_fair,
8712 	.yield_task		= yield_task_fair,
8713 	.yield_to_task		= yield_to_task_fair,
8714 
8715 	.check_preempt_curr	= check_preempt_wakeup,
8716 
8717 	.pick_next_task		= pick_next_task_fair,
8718 	.put_prev_task		= put_prev_task_fair,
8719 
8720 #ifdef CONFIG_SMP
8721 	.select_task_rq		= select_task_rq_fair,
8722 	.migrate_task_rq	= migrate_task_rq_fair,
8723 
8724 	.rq_online		= rq_online_fair,
8725 	.rq_offline		= rq_offline_fair,
8726 
8727 	.task_dead		= task_dead_fair,
8728 	.set_cpus_allowed	= set_cpus_allowed_common,
8729 #endif
8730 
8731 	.set_curr_task          = set_curr_task_fair,
8732 	.task_tick		= task_tick_fair,
8733 	.task_fork		= task_fork_fair,
8734 
8735 	.prio_changed		= prio_changed_fair,
8736 	.switched_from		= switched_from_fair,
8737 	.switched_to		= switched_to_fair,
8738 
8739 	.get_rr_interval	= get_rr_interval_fair,
8740 
8741 	.update_curr		= update_curr_fair,
8742 
8743 #ifdef CONFIG_FAIR_GROUP_SCHED
8744 	.task_move_group	= task_move_group_fair,
8745 #endif
8746 };
8747 
8748 #ifdef CONFIG_SCHED_DEBUG
8749 void print_cfs_stats(struct seq_file *m, int cpu)
8750 {
8751 	struct cfs_rq *cfs_rq;
8752 
8753 	rcu_read_lock();
8754 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
8755 		print_cfs_rq(m, cpu, cfs_rq);
8756 	rcu_read_unlock();
8757 }
8758 
8759 #ifdef CONFIG_NUMA_BALANCING
8760 void show_numa_stats(struct task_struct *p, struct seq_file *m)
8761 {
8762 	int node;
8763 	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
8764 
8765 	for_each_online_node(node) {
8766 		if (p->numa_faults) {
8767 			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
8768 			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
8769 		}
8770 		if (p->numa_group) {
8771 			gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
8772 			gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
8773 		}
8774 		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
8775 	}
8776 }
8777 #endif /* CONFIG_NUMA_BALANCING */
8778 #endif /* CONFIG_SCHED_DEBUG */
8779 
8780 __init void init_sched_fair_class(void)
8781 {
8782 #ifdef CONFIG_SMP
8783 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8784 
8785 #ifdef CONFIG_NO_HZ_COMMON
8786 	nohz.next_balance = jiffies;
8787 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8788 #endif
8789 #endif /* SMP */
8790 
8791 }
8792