xref: /linux/kernel/sched/fair.c (revision a671de086874b9d8155369319b2bd989cf55d77c)
1 /*
2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3  *
4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5  *
6  *  Interactivity improvements by Mike Galbraith
7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
8  *
9  *  Various enhancements by Dmitry Adamushko.
10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11  *
12  *  Group scheduling enhancements by Srivatsa Vaddagiri
13  *  Copyright IBM Corporation, 2007
14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15  *
16  *  Scaled math optimizations by Thomas Gleixner
17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18  *
19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21  */
22 
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/slab.h>
27 #include <linux/profile.h>
28 #include <linux/interrupt.h>
29 
30 #include <trace/events/sched.h>
31 
32 #include "sched.h"
33 
34 /*
35  * Targeted preemption latency for CPU-bound tasks:
36  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
37  *
38  * NOTE: this latency value is not the same as the concept of
39  * 'timeslice length' - timeslices in CFS are of variable length
40  * and have no persistent notion like in traditional, time-slice
41  * based scheduling concepts.
42  *
43  * (to see the precise effective timeslice length of your workload,
44  *  run vmstat and monitor the context-switches (cs) field)
45  */
46 unsigned int sysctl_sched_latency = 6000000ULL;
47 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
48 
49 /*
50  * The initial- and re-scaling of tunables is configurable
51  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
52  *
53  * Options are:
54  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
55  * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
56  * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
57  */
58 enum sched_tunable_scaling sysctl_sched_tunable_scaling
59 	= SCHED_TUNABLESCALING_LOG;
60 
61 /*
62  * Minimal preemption granularity for CPU-bound tasks:
63  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
64  */
65 unsigned int sysctl_sched_min_granularity = 750000ULL;
66 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
67 
68 /*
69  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
70  */
71 static unsigned int sched_nr_latency = 8;
72 
73 /*
74  * After fork, child runs first. If set to 0 (default) then
75  * parent will (try to) run first.
76  */
77 unsigned int sysctl_sched_child_runs_first __read_mostly;
78 
79 /*
80  * SCHED_OTHER wake-up granularity.
81  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
82  *
83  * This option delays the preemption effects of decoupled workloads
84  * and reduces their over-scheduling. Synchronous workloads will still
85  * have immediate wakeup/sleep latencies.
86  */
87 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
88 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 
90 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 
92 /*
93  * The exponential sliding  window over which load is averaged for shares
94  * distribution.
95  * (default: 10msec)
96  */
97 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
98 
99 #ifdef CONFIG_CFS_BANDWIDTH
100 /*
101  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
102  * each time a cfs_rq requests quota.
103  *
104  * Note: in the case that the slice exceeds the runtime remaining (either due
105  * to consumption or the quota being specified to be smaller than the slice)
106  * we will always only issue the remaining available time.
107  *
108  * default: 5 msec, units: microseconds
109   */
110 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
111 #endif
112 
113 /*
114  * Increase the granularity value when there are more CPUs,
115  * because with more CPUs the 'effective latency' as visible
116  * to users decreases. But the relationship is not linear,
117  * so pick a second-best guess by going with the log2 of the
118  * number of CPUs.
119  *
120  * This idea comes from the SD scheduler of Con Kolivas:
121  */
122 static int get_update_sysctl_factor(void)
123 {
124 	unsigned int cpus = min_t(int, num_online_cpus(), 8);
125 	unsigned int factor;
126 
127 	switch (sysctl_sched_tunable_scaling) {
128 	case SCHED_TUNABLESCALING_NONE:
129 		factor = 1;
130 		break;
131 	case SCHED_TUNABLESCALING_LINEAR:
132 		factor = cpus;
133 		break;
134 	case SCHED_TUNABLESCALING_LOG:
135 	default:
136 		factor = 1 + ilog2(cpus);
137 		break;
138 	}
139 
140 	return factor;
141 }
142 
143 static void update_sysctl(void)
144 {
145 	unsigned int factor = get_update_sysctl_factor();
146 
147 #define SET_SYSCTL(name) \
148 	(sysctl_##name = (factor) * normalized_sysctl_##name)
149 	SET_SYSCTL(sched_min_granularity);
150 	SET_SYSCTL(sched_latency);
151 	SET_SYSCTL(sched_wakeup_granularity);
152 #undef SET_SYSCTL
153 }
154 
155 void sched_init_granularity(void)
156 {
157 	update_sysctl();
158 }
159 
160 #if BITS_PER_LONG == 32
161 # define WMULT_CONST	(~0UL)
162 #else
163 # define WMULT_CONST	(1UL << 32)
164 #endif
165 
166 #define WMULT_SHIFT	32
167 
168 /*
169  * Shift right and round:
170  */
171 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
172 
173 /*
174  * delta *= weight / lw
175  */
176 static unsigned long
177 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
178 		struct load_weight *lw)
179 {
180 	u64 tmp;
181 
182 	/*
183 	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
184 	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
185 	 * 2^SCHED_LOAD_RESOLUTION.
186 	 */
187 	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
188 		tmp = (u64)delta_exec * scale_load_down(weight);
189 	else
190 		tmp = (u64)delta_exec;
191 
192 	if (!lw->inv_weight) {
193 		unsigned long w = scale_load_down(lw->weight);
194 
195 		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
196 			lw->inv_weight = 1;
197 		else if (unlikely(!w))
198 			lw->inv_weight = WMULT_CONST;
199 		else
200 			lw->inv_weight = WMULT_CONST / w;
201 	}
202 
203 	/*
204 	 * Check whether we'd overflow the 64-bit multiplication:
205 	 */
206 	if (unlikely(tmp > WMULT_CONST))
207 		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
208 			WMULT_SHIFT/2);
209 	else
210 		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
211 
212 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
213 }
214 
215 
216 const struct sched_class fair_sched_class;
217 
218 /**************************************************************
219  * CFS operations on generic schedulable entities:
220  */
221 
222 #ifdef CONFIG_FAIR_GROUP_SCHED
223 
224 /* cpu runqueue to which this cfs_rq is attached */
225 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
226 {
227 	return cfs_rq->rq;
228 }
229 
230 /* An entity is a task if it doesn't "own" a runqueue */
231 #define entity_is_task(se)	(!se->my_q)
232 
233 static inline struct task_struct *task_of(struct sched_entity *se)
234 {
235 #ifdef CONFIG_SCHED_DEBUG
236 	WARN_ON_ONCE(!entity_is_task(se));
237 #endif
238 	return container_of(se, struct task_struct, se);
239 }
240 
241 /* Walk up scheduling entities hierarchy */
242 #define for_each_sched_entity(se) \
243 		for (; se; se = se->parent)
244 
245 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
246 {
247 	return p->se.cfs_rq;
248 }
249 
250 /* runqueue on which this entity is (to be) queued */
251 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
252 {
253 	return se->cfs_rq;
254 }
255 
256 /* runqueue "owned" by this group */
257 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
258 {
259 	return grp->my_q;
260 }
261 
262 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
263 				       int force_update);
264 
265 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
266 {
267 	if (!cfs_rq->on_list) {
268 		/*
269 		 * Ensure we either appear before our parent (if already
270 		 * enqueued) or force our parent to appear after us when it is
271 		 * enqueued.  The fact that we always enqueue bottom-up
272 		 * reduces this to two cases.
273 		 */
274 		if (cfs_rq->tg->parent &&
275 		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
276 			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
277 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
278 		} else {
279 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
280 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
281 		}
282 
283 		cfs_rq->on_list = 1;
284 		/* We should have no load, but we need to update last_decay. */
285 		update_cfs_rq_blocked_load(cfs_rq, 0);
286 	}
287 }
288 
289 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
290 {
291 	if (cfs_rq->on_list) {
292 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
293 		cfs_rq->on_list = 0;
294 	}
295 }
296 
297 /* Iterate thr' all leaf cfs_rq's on a runqueue */
298 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
299 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
300 
301 /* Do the two (enqueued) entities belong to the same group ? */
302 static inline int
303 is_same_group(struct sched_entity *se, struct sched_entity *pse)
304 {
305 	if (se->cfs_rq == pse->cfs_rq)
306 		return 1;
307 
308 	return 0;
309 }
310 
311 static inline struct sched_entity *parent_entity(struct sched_entity *se)
312 {
313 	return se->parent;
314 }
315 
316 /* return depth at which a sched entity is present in the hierarchy */
317 static inline int depth_se(struct sched_entity *se)
318 {
319 	int depth = 0;
320 
321 	for_each_sched_entity(se)
322 		depth++;
323 
324 	return depth;
325 }
326 
327 static void
328 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
329 {
330 	int se_depth, pse_depth;
331 
332 	/*
333 	 * preemption test can be made between sibling entities who are in the
334 	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
335 	 * both tasks until we find their ancestors who are siblings of common
336 	 * parent.
337 	 */
338 
339 	/* First walk up until both entities are at same depth */
340 	se_depth = depth_se(*se);
341 	pse_depth = depth_se(*pse);
342 
343 	while (se_depth > pse_depth) {
344 		se_depth--;
345 		*se = parent_entity(*se);
346 	}
347 
348 	while (pse_depth > se_depth) {
349 		pse_depth--;
350 		*pse = parent_entity(*pse);
351 	}
352 
353 	while (!is_same_group(*se, *pse)) {
354 		*se = parent_entity(*se);
355 		*pse = parent_entity(*pse);
356 	}
357 }
358 
359 #else	/* !CONFIG_FAIR_GROUP_SCHED */
360 
361 static inline struct task_struct *task_of(struct sched_entity *se)
362 {
363 	return container_of(se, struct task_struct, se);
364 }
365 
366 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
367 {
368 	return container_of(cfs_rq, struct rq, cfs);
369 }
370 
371 #define entity_is_task(se)	1
372 
373 #define for_each_sched_entity(se) \
374 		for (; se; se = NULL)
375 
376 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
377 {
378 	return &task_rq(p)->cfs;
379 }
380 
381 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
382 {
383 	struct task_struct *p = task_of(se);
384 	struct rq *rq = task_rq(p);
385 
386 	return &rq->cfs;
387 }
388 
389 /* runqueue "owned" by this group */
390 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
391 {
392 	return NULL;
393 }
394 
395 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
396 {
397 }
398 
399 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
400 {
401 }
402 
403 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
404 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
405 
406 static inline int
407 is_same_group(struct sched_entity *se, struct sched_entity *pse)
408 {
409 	return 1;
410 }
411 
412 static inline struct sched_entity *parent_entity(struct sched_entity *se)
413 {
414 	return NULL;
415 }
416 
417 static inline void
418 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
419 {
420 }
421 
422 #endif	/* CONFIG_FAIR_GROUP_SCHED */
423 
424 static __always_inline
425 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
426 
427 /**************************************************************
428  * Scheduling class tree data structure manipulation methods:
429  */
430 
431 static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
432 {
433 	s64 delta = (s64)(vruntime - min_vruntime);
434 	if (delta > 0)
435 		min_vruntime = vruntime;
436 
437 	return min_vruntime;
438 }
439 
440 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
441 {
442 	s64 delta = (s64)(vruntime - min_vruntime);
443 	if (delta < 0)
444 		min_vruntime = vruntime;
445 
446 	return min_vruntime;
447 }
448 
449 static inline int entity_before(struct sched_entity *a,
450 				struct sched_entity *b)
451 {
452 	return (s64)(a->vruntime - b->vruntime) < 0;
453 }
454 
455 static void update_min_vruntime(struct cfs_rq *cfs_rq)
456 {
457 	u64 vruntime = cfs_rq->min_vruntime;
458 
459 	if (cfs_rq->curr)
460 		vruntime = cfs_rq->curr->vruntime;
461 
462 	if (cfs_rq->rb_leftmost) {
463 		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
464 						   struct sched_entity,
465 						   run_node);
466 
467 		if (!cfs_rq->curr)
468 			vruntime = se->vruntime;
469 		else
470 			vruntime = min_vruntime(vruntime, se->vruntime);
471 	}
472 
473 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
474 #ifndef CONFIG_64BIT
475 	smp_wmb();
476 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
477 #endif
478 }
479 
480 /*
481  * Enqueue an entity into the rb-tree:
482  */
483 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
484 {
485 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
486 	struct rb_node *parent = NULL;
487 	struct sched_entity *entry;
488 	int leftmost = 1;
489 
490 	/*
491 	 * Find the right place in the rbtree:
492 	 */
493 	while (*link) {
494 		parent = *link;
495 		entry = rb_entry(parent, struct sched_entity, run_node);
496 		/*
497 		 * We dont care about collisions. Nodes with
498 		 * the same key stay together.
499 		 */
500 		if (entity_before(se, entry)) {
501 			link = &parent->rb_left;
502 		} else {
503 			link = &parent->rb_right;
504 			leftmost = 0;
505 		}
506 	}
507 
508 	/*
509 	 * Maintain a cache of leftmost tree entries (it is frequently
510 	 * used):
511 	 */
512 	if (leftmost)
513 		cfs_rq->rb_leftmost = &se->run_node;
514 
515 	rb_link_node(&se->run_node, parent, link);
516 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
517 }
518 
519 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
520 {
521 	if (cfs_rq->rb_leftmost == &se->run_node) {
522 		struct rb_node *next_node;
523 
524 		next_node = rb_next(&se->run_node);
525 		cfs_rq->rb_leftmost = next_node;
526 	}
527 
528 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
529 }
530 
531 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
532 {
533 	struct rb_node *left = cfs_rq->rb_leftmost;
534 
535 	if (!left)
536 		return NULL;
537 
538 	return rb_entry(left, struct sched_entity, run_node);
539 }
540 
541 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
542 {
543 	struct rb_node *next = rb_next(&se->run_node);
544 
545 	if (!next)
546 		return NULL;
547 
548 	return rb_entry(next, struct sched_entity, run_node);
549 }
550 
551 #ifdef CONFIG_SCHED_DEBUG
552 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
553 {
554 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
555 
556 	if (!last)
557 		return NULL;
558 
559 	return rb_entry(last, struct sched_entity, run_node);
560 }
561 
562 /**************************************************************
563  * Scheduling class statistics methods:
564  */
565 
566 int sched_proc_update_handler(struct ctl_table *table, int write,
567 		void __user *buffer, size_t *lenp,
568 		loff_t *ppos)
569 {
570 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
571 	int factor = get_update_sysctl_factor();
572 
573 	if (ret || !write)
574 		return ret;
575 
576 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
577 					sysctl_sched_min_granularity);
578 
579 #define WRT_SYSCTL(name) \
580 	(normalized_sysctl_##name = sysctl_##name / (factor))
581 	WRT_SYSCTL(sched_min_granularity);
582 	WRT_SYSCTL(sched_latency);
583 	WRT_SYSCTL(sched_wakeup_granularity);
584 #undef WRT_SYSCTL
585 
586 	return 0;
587 }
588 #endif
589 
590 /*
591  * delta /= w
592  */
593 static inline unsigned long
594 calc_delta_fair(unsigned long delta, struct sched_entity *se)
595 {
596 	if (unlikely(se->load.weight != NICE_0_LOAD))
597 		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
598 
599 	return delta;
600 }
601 
602 /*
603  * The idea is to set a period in which each task runs once.
604  *
605  * When there are too many tasks (sched_nr_latency) we have to stretch
606  * this period because otherwise the slices get too small.
607  *
608  * p = (nr <= nl) ? l : l*nr/nl
609  */
610 static u64 __sched_period(unsigned long nr_running)
611 {
612 	u64 period = sysctl_sched_latency;
613 	unsigned long nr_latency = sched_nr_latency;
614 
615 	if (unlikely(nr_running > nr_latency)) {
616 		period = sysctl_sched_min_granularity;
617 		period *= nr_running;
618 	}
619 
620 	return period;
621 }
622 
623 /*
624  * We calculate the wall-time slice from the period by taking a part
625  * proportional to the weight.
626  *
627  * s = p*P[w/rw]
628  */
629 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
630 {
631 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
632 
633 	for_each_sched_entity(se) {
634 		struct load_weight *load;
635 		struct load_weight lw;
636 
637 		cfs_rq = cfs_rq_of(se);
638 		load = &cfs_rq->load;
639 
640 		if (unlikely(!se->on_rq)) {
641 			lw = cfs_rq->load;
642 
643 			update_load_add(&lw, se->load.weight);
644 			load = &lw;
645 		}
646 		slice = calc_delta_mine(slice, se->load.weight, load);
647 	}
648 	return slice;
649 }
650 
651 /*
652  * We calculate the vruntime slice of a to be inserted task
653  *
654  * vs = s/w
655  */
656 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
657 {
658 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
659 }
660 
661 /*
662  * Update the current task's runtime statistics. Skip current tasks that
663  * are not in our scheduling class.
664  */
665 static inline void
666 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
667 	      unsigned long delta_exec)
668 {
669 	unsigned long delta_exec_weighted;
670 
671 	schedstat_set(curr->statistics.exec_max,
672 		      max((u64)delta_exec, curr->statistics.exec_max));
673 
674 	curr->sum_exec_runtime += delta_exec;
675 	schedstat_add(cfs_rq, exec_clock, delta_exec);
676 	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
677 
678 	curr->vruntime += delta_exec_weighted;
679 	update_min_vruntime(cfs_rq);
680 }
681 
682 static void update_curr(struct cfs_rq *cfs_rq)
683 {
684 	struct sched_entity *curr = cfs_rq->curr;
685 	u64 now = rq_of(cfs_rq)->clock_task;
686 	unsigned long delta_exec;
687 
688 	if (unlikely(!curr))
689 		return;
690 
691 	/*
692 	 * Get the amount of time the current task was running
693 	 * since the last time we changed load (this cannot
694 	 * overflow on 32 bits):
695 	 */
696 	delta_exec = (unsigned long)(now - curr->exec_start);
697 	if (!delta_exec)
698 		return;
699 
700 	__update_curr(cfs_rq, curr, delta_exec);
701 	curr->exec_start = now;
702 
703 	if (entity_is_task(curr)) {
704 		struct task_struct *curtask = task_of(curr);
705 
706 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
707 		cpuacct_charge(curtask, delta_exec);
708 		account_group_exec_runtime(curtask, delta_exec);
709 	}
710 
711 	account_cfs_rq_runtime(cfs_rq, delta_exec);
712 }
713 
714 static inline void
715 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
716 {
717 	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
718 }
719 
720 /*
721  * Task is being enqueued - update stats:
722  */
723 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
724 {
725 	/*
726 	 * Are we enqueueing a waiting task? (for current tasks
727 	 * a dequeue/enqueue event is a NOP)
728 	 */
729 	if (se != cfs_rq->curr)
730 		update_stats_wait_start(cfs_rq, se);
731 }
732 
733 static void
734 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
735 {
736 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
737 			rq_of(cfs_rq)->clock - se->statistics.wait_start));
738 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
739 	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
740 			rq_of(cfs_rq)->clock - se->statistics.wait_start);
741 #ifdef CONFIG_SCHEDSTATS
742 	if (entity_is_task(se)) {
743 		trace_sched_stat_wait(task_of(se),
744 			rq_of(cfs_rq)->clock - se->statistics.wait_start);
745 	}
746 #endif
747 	schedstat_set(se->statistics.wait_start, 0);
748 }
749 
750 static inline void
751 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
752 {
753 	/*
754 	 * Mark the end of the wait period if dequeueing a
755 	 * waiting task:
756 	 */
757 	if (se != cfs_rq->curr)
758 		update_stats_wait_end(cfs_rq, se);
759 }
760 
761 /*
762  * We are picking a new current task - update its stats:
763  */
764 static inline void
765 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
766 {
767 	/*
768 	 * We are starting a new run period:
769 	 */
770 	se->exec_start = rq_of(cfs_rq)->clock_task;
771 }
772 
773 /**************************************************
774  * Scheduling class queueing methods:
775  */
776 
777 static void
778 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
779 {
780 	update_load_add(&cfs_rq->load, se->load.weight);
781 	if (!parent_entity(se))
782 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
783 #ifdef CONFIG_SMP
784 	if (entity_is_task(se))
785 		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
786 #endif
787 	cfs_rq->nr_running++;
788 }
789 
790 static void
791 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
792 {
793 	update_load_sub(&cfs_rq->load, se->load.weight);
794 	if (!parent_entity(se))
795 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
796 	if (entity_is_task(se))
797 		list_del_init(&se->group_node);
798 	cfs_rq->nr_running--;
799 }
800 
801 #ifdef CONFIG_FAIR_GROUP_SCHED
802 # ifdef CONFIG_SMP
803 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
804 {
805 	long tg_weight;
806 
807 	/*
808 	 * Use this CPU's actual weight instead of the last load_contribution
809 	 * to gain a more accurate current total weight. See
810 	 * update_cfs_rq_load_contribution().
811 	 */
812 	tg_weight = atomic64_read(&tg->load_avg);
813 	tg_weight -= cfs_rq->tg_load_contrib;
814 	tg_weight += cfs_rq->load.weight;
815 
816 	return tg_weight;
817 }
818 
819 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
820 {
821 	long tg_weight, load, shares;
822 
823 	tg_weight = calc_tg_weight(tg, cfs_rq);
824 	load = cfs_rq->load.weight;
825 
826 	shares = (tg->shares * load);
827 	if (tg_weight)
828 		shares /= tg_weight;
829 
830 	if (shares < MIN_SHARES)
831 		shares = MIN_SHARES;
832 	if (shares > tg->shares)
833 		shares = tg->shares;
834 
835 	return shares;
836 }
837 # else /* CONFIG_SMP */
838 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
839 {
840 	return tg->shares;
841 }
842 # endif /* CONFIG_SMP */
843 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
844 			    unsigned long weight)
845 {
846 	if (se->on_rq) {
847 		/* commit outstanding execution time */
848 		if (cfs_rq->curr == se)
849 			update_curr(cfs_rq);
850 		account_entity_dequeue(cfs_rq, se);
851 	}
852 
853 	update_load_set(&se->load, weight);
854 
855 	if (se->on_rq)
856 		account_entity_enqueue(cfs_rq, se);
857 }
858 
859 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
860 
861 static void update_cfs_shares(struct cfs_rq *cfs_rq)
862 {
863 	struct task_group *tg;
864 	struct sched_entity *se;
865 	long shares;
866 
867 	tg = cfs_rq->tg;
868 	se = tg->se[cpu_of(rq_of(cfs_rq))];
869 	if (!se || throttled_hierarchy(cfs_rq))
870 		return;
871 #ifndef CONFIG_SMP
872 	if (likely(se->load.weight == tg->shares))
873 		return;
874 #endif
875 	shares = calc_cfs_shares(cfs_rq, tg);
876 
877 	reweight_entity(cfs_rq_of(se), se, shares);
878 }
879 #else /* CONFIG_FAIR_GROUP_SCHED */
880 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
881 {
882 }
883 #endif /* CONFIG_FAIR_GROUP_SCHED */
884 
885 /* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
886 #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
887 /*
888  * We choose a half-life close to 1 scheduling period.
889  * Note: The tables below are dependent on this value.
890  */
891 #define LOAD_AVG_PERIOD 32
892 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
893 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
894 
895 /* Precomputed fixed inverse multiplies for multiplication by y^n */
896 static const u32 runnable_avg_yN_inv[] = {
897 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
898 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
899 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
900 	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
901 	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
902 	0x85aac367, 0x82cd8698,
903 };
904 
905 /*
906  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
907  * over-estimates when re-combining.
908  */
909 static const u32 runnable_avg_yN_sum[] = {
910 	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
911 	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
912 	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
913 };
914 
915 /*
916  * Approximate:
917  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
918  */
919 static __always_inline u64 decay_load(u64 val, u64 n)
920 {
921 	unsigned int local_n;
922 
923 	if (!n)
924 		return val;
925 	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
926 		return 0;
927 
928 	/* after bounds checking we can collapse to 32-bit */
929 	local_n = n;
930 
931 	/*
932 	 * As y^PERIOD = 1/2, we can combine
933 	 *    y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
934 	 * With a look-up table which covers k^n (n<PERIOD)
935 	 *
936 	 * To achieve constant time decay_load.
937 	 */
938 	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
939 		val >>= local_n / LOAD_AVG_PERIOD;
940 		local_n %= LOAD_AVG_PERIOD;
941 	}
942 
943 	val *= runnable_avg_yN_inv[local_n];
944 	/* We don't use SRR here since we always want to round down. */
945 	return val >> 32;
946 }
947 
948 /*
949  * For updates fully spanning n periods, the contribution to runnable
950  * average will be: \Sum 1024*y^n
951  *
952  * We can compute this reasonably efficiently by combining:
953  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
954  */
955 static u32 __compute_runnable_contrib(u64 n)
956 {
957 	u32 contrib = 0;
958 
959 	if (likely(n <= LOAD_AVG_PERIOD))
960 		return runnable_avg_yN_sum[n];
961 	else if (unlikely(n >= LOAD_AVG_MAX_N))
962 		return LOAD_AVG_MAX;
963 
964 	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
965 	do {
966 		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
967 		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
968 
969 		n -= LOAD_AVG_PERIOD;
970 	} while (n > LOAD_AVG_PERIOD);
971 
972 	contrib = decay_load(contrib, n);
973 	return contrib + runnable_avg_yN_sum[n];
974 }
975 
976 /*
977  * We can represent the historical contribution to runnable average as the
978  * coefficients of a geometric series.  To do this we sub-divide our runnable
979  * history into segments of approximately 1ms (1024us); label the segment that
980  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
981  *
982  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
983  *      p0            p1           p2
984  *     (now)       (~1ms ago)  (~2ms ago)
985  *
986  * Let u_i denote the fraction of p_i that the entity was runnable.
987  *
988  * We then designate the fractions u_i as our co-efficients, yielding the
989  * following representation of historical load:
990  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
991  *
992  * We choose y based on the with of a reasonably scheduling period, fixing:
993  *   y^32 = 0.5
994  *
995  * This means that the contribution to load ~32ms ago (u_32) will be weighted
996  * approximately half as much as the contribution to load within the last ms
997  * (u_0).
998  *
999  * When a period "rolls over" and we have new u_0`, multiplying the previous
1000  * sum again by y is sufficient to update:
1001  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1002  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1003  */
1004 static __always_inline int __update_entity_runnable_avg(u64 now,
1005 							struct sched_avg *sa,
1006 							int runnable)
1007 {
1008 	u64 delta, periods;
1009 	u32 runnable_contrib;
1010 	int delta_w, decayed = 0;
1011 
1012 	delta = now - sa->last_runnable_update;
1013 	/*
1014 	 * This should only happen when time goes backwards, which it
1015 	 * unfortunately does during sched clock init when we swap over to TSC.
1016 	 */
1017 	if ((s64)delta < 0) {
1018 		sa->last_runnable_update = now;
1019 		return 0;
1020 	}
1021 
1022 	/*
1023 	 * Use 1024ns as the unit of measurement since it's a reasonable
1024 	 * approximation of 1us and fast to compute.
1025 	 */
1026 	delta >>= 10;
1027 	if (!delta)
1028 		return 0;
1029 	sa->last_runnable_update = now;
1030 
1031 	/* delta_w is the amount already accumulated against our next period */
1032 	delta_w = sa->runnable_avg_period % 1024;
1033 	if (delta + delta_w >= 1024) {
1034 		/* period roll-over */
1035 		decayed = 1;
1036 
1037 		/*
1038 		 * Now that we know we're crossing a period boundary, figure
1039 		 * out how much from delta we need to complete the current
1040 		 * period and accrue it.
1041 		 */
1042 		delta_w = 1024 - delta_w;
1043 		if (runnable)
1044 			sa->runnable_avg_sum += delta_w;
1045 		sa->runnable_avg_period += delta_w;
1046 
1047 		delta -= delta_w;
1048 
1049 		/* Figure out how many additional periods this update spans */
1050 		periods = delta / 1024;
1051 		delta %= 1024;
1052 
1053 		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1054 						  periods + 1);
1055 		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1056 						     periods + 1);
1057 
1058 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
1059 		runnable_contrib = __compute_runnable_contrib(periods);
1060 		if (runnable)
1061 			sa->runnable_avg_sum += runnable_contrib;
1062 		sa->runnable_avg_period += runnable_contrib;
1063 	}
1064 
1065 	/* Remainder of delta accrued against u_0` */
1066 	if (runnable)
1067 		sa->runnable_avg_sum += delta;
1068 	sa->runnable_avg_period += delta;
1069 
1070 	return decayed;
1071 }
1072 
1073 /* Synchronize an entity's decay with its parenting cfs_rq.*/
1074 static inline u64 __synchronize_entity_decay(struct sched_entity *se)
1075 {
1076 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
1077 	u64 decays = atomic64_read(&cfs_rq->decay_counter);
1078 
1079 	decays -= se->avg.decay_count;
1080 	if (!decays)
1081 		return 0;
1082 
1083 	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1084 	se->avg.decay_count = 0;
1085 
1086 	return decays;
1087 }
1088 
1089 #ifdef CONFIG_FAIR_GROUP_SCHED
1090 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1091 						 int force_update)
1092 {
1093 	struct task_group *tg = cfs_rq->tg;
1094 	s64 tg_contrib;
1095 
1096 	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1097 	tg_contrib -= cfs_rq->tg_load_contrib;
1098 
1099 	if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1100 		atomic64_add(tg_contrib, &tg->load_avg);
1101 		cfs_rq->tg_load_contrib += tg_contrib;
1102 	}
1103 }
1104 
1105 /*
1106  * Aggregate cfs_rq runnable averages into an equivalent task_group
1107  * representation for computing load contributions.
1108  */
1109 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1110 						  struct cfs_rq *cfs_rq)
1111 {
1112 	struct task_group *tg = cfs_rq->tg;
1113 	long contrib;
1114 
1115 	/* The fraction of a cpu used by this cfs_rq */
1116 	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1117 			  sa->runnable_avg_period + 1);
1118 	contrib -= cfs_rq->tg_runnable_contrib;
1119 
1120 	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1121 		atomic_add(contrib, &tg->runnable_avg);
1122 		cfs_rq->tg_runnable_contrib += contrib;
1123 	}
1124 }
1125 
1126 static inline void __update_group_entity_contrib(struct sched_entity *se)
1127 {
1128 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
1129 	struct task_group *tg = cfs_rq->tg;
1130 	int runnable_avg;
1131 
1132 	u64 contrib;
1133 
1134 	contrib = cfs_rq->tg_load_contrib * tg->shares;
1135 	se->avg.load_avg_contrib = div64_u64(contrib,
1136 					     atomic64_read(&tg->load_avg) + 1);
1137 
1138 	/*
1139 	 * For group entities we need to compute a correction term in the case
1140 	 * that they are consuming <1 cpu so that we would contribute the same
1141 	 * load as a task of equal weight.
1142 	 *
1143 	 * Explicitly co-ordinating this measurement would be expensive, but
1144 	 * fortunately the sum of each cpus contribution forms a usable
1145 	 * lower-bound on the true value.
1146 	 *
1147 	 * Consider the aggregate of 2 contributions.  Either they are disjoint
1148 	 * (and the sum represents true value) or they are disjoint and we are
1149 	 * understating by the aggregate of their overlap.
1150 	 *
1151 	 * Extending this to N cpus, for a given overlap, the maximum amount we
1152 	 * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
1153 	 * cpus that overlap for this interval and w_i is the interval width.
1154 	 *
1155 	 * On a small machine; the first term is well-bounded which bounds the
1156 	 * total error since w_i is a subset of the period.  Whereas on a
1157 	 * larger machine, while this first term can be larger, if w_i is the
1158 	 * of consequential size guaranteed to see n_i*w_i quickly converge to
1159 	 * our upper bound of 1-cpu.
1160 	 */
1161 	runnable_avg = atomic_read(&tg->runnable_avg);
1162 	if (runnable_avg < NICE_0_LOAD) {
1163 		se->avg.load_avg_contrib *= runnable_avg;
1164 		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1165 	}
1166 }
1167 #else
1168 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1169 						 int force_update) {}
1170 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1171 						  struct cfs_rq *cfs_rq) {}
1172 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1173 #endif
1174 
1175 static inline void __update_task_entity_contrib(struct sched_entity *se)
1176 {
1177 	u32 contrib;
1178 
1179 	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1180 	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1181 	contrib /= (se->avg.runnable_avg_period + 1);
1182 	se->avg.load_avg_contrib = scale_load(contrib);
1183 }
1184 
1185 /* Compute the current contribution to load_avg by se, return any delta */
1186 static long __update_entity_load_avg_contrib(struct sched_entity *se)
1187 {
1188 	long old_contrib = se->avg.load_avg_contrib;
1189 
1190 	if (entity_is_task(se)) {
1191 		__update_task_entity_contrib(se);
1192 	} else {
1193 		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
1194 		__update_group_entity_contrib(se);
1195 	}
1196 
1197 	return se->avg.load_avg_contrib - old_contrib;
1198 }
1199 
1200 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1201 						 long load_contrib)
1202 {
1203 	if (likely(load_contrib < cfs_rq->blocked_load_avg))
1204 		cfs_rq->blocked_load_avg -= load_contrib;
1205 	else
1206 		cfs_rq->blocked_load_avg = 0;
1207 }
1208 
1209 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1210 
1211 /* Update a sched_entity's runnable average */
1212 static inline void update_entity_load_avg(struct sched_entity *se,
1213 					  int update_cfs_rq)
1214 {
1215 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
1216 	long contrib_delta;
1217 	u64 now;
1218 
1219 	/*
1220 	 * For a group entity we need to use their owned cfs_rq_clock_task() in
1221 	 * case they are the parent of a throttled hierarchy.
1222 	 */
1223 	if (entity_is_task(se))
1224 		now = cfs_rq_clock_task(cfs_rq);
1225 	else
1226 		now = cfs_rq_clock_task(group_cfs_rq(se));
1227 
1228 	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
1229 		return;
1230 
1231 	contrib_delta = __update_entity_load_avg_contrib(se);
1232 
1233 	if (!update_cfs_rq)
1234 		return;
1235 
1236 	if (se->on_rq)
1237 		cfs_rq->runnable_load_avg += contrib_delta;
1238 	else
1239 		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1240 }
1241 
1242 /*
1243  * Decay the load contributed by all blocked children and account this so that
1244  * their contribution may appropriately discounted when they wake up.
1245  */
1246 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1247 {
1248 	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
1249 	u64 decays;
1250 
1251 	decays = now - cfs_rq->last_decay;
1252 	if (!decays && !force_update)
1253 		return;
1254 
1255 	if (atomic64_read(&cfs_rq->removed_load)) {
1256 		u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1257 		subtract_blocked_load_contrib(cfs_rq, removed_load);
1258 	}
1259 
1260 	if (decays) {
1261 		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1262 						      decays);
1263 		atomic64_add(decays, &cfs_rq->decay_counter);
1264 		cfs_rq->last_decay = now;
1265 	}
1266 
1267 	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
1268 	update_cfs_shares(cfs_rq);
1269 }
1270 
1271 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1272 {
1273 	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
1274 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
1275 }
1276 
1277 /* Add the load generated by se into cfs_rq's child load-average */
1278 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1279 						  struct sched_entity *se,
1280 						  int wakeup)
1281 {
1282 	/*
1283 	 * We track migrations using entity decay_count <= 0, on a wake-up
1284 	 * migration we use a negative decay count to track the remote decays
1285 	 * accumulated while sleeping.
1286 	 */
1287 	if (unlikely(se->avg.decay_count <= 0)) {
1288 		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
1289 		if (se->avg.decay_count) {
1290 			/*
1291 			 * In a wake-up migration we have to approximate the
1292 			 * time sleeping.  This is because we can't synchronize
1293 			 * clock_task between the two cpus, and it is not
1294 			 * guaranteed to be read-safe.  Instead, we can
1295 			 * approximate this using our carried decays, which are
1296 			 * explicitly atomically readable.
1297 			 */
1298 			se->avg.last_runnable_update -= (-se->avg.decay_count)
1299 							<< 20;
1300 			update_entity_load_avg(se, 0);
1301 			/* Indicate that we're now synchronized and on-rq */
1302 			se->avg.decay_count = 0;
1303 		}
1304 		wakeup = 0;
1305 	} else {
1306 		__synchronize_entity_decay(se);
1307 	}
1308 
1309 	/* migrated tasks did not contribute to our blocked load */
1310 	if (wakeup) {
1311 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1312 		update_entity_load_avg(se, 0);
1313 	}
1314 
1315 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1316 	/* we force update consideration on load-balancer moves */
1317 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1318 }
1319 
1320 /*
1321  * Remove se's load from this cfs_rq child load-average, if the entity is
1322  * transitioning to a blocked state we track its projected decay using
1323  * blocked_load_avg.
1324  */
1325 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1326 						  struct sched_entity *se,
1327 						  int sleep)
1328 {
1329 	update_entity_load_avg(se, 1);
1330 	/* we force update consideration on load-balancer moves */
1331 	update_cfs_rq_blocked_load(cfs_rq, !sleep);
1332 
1333 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
1334 	if (sleep) {
1335 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1336 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1337 	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
1338 }
1339 #else
1340 static inline void update_entity_load_avg(struct sched_entity *se,
1341 					  int update_cfs_rq) {}
1342 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
1343 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1344 					   struct sched_entity *se,
1345 					   int wakeup) {}
1346 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1347 					   struct sched_entity *se,
1348 					   int sleep) {}
1349 static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1350 					      int force_update) {}
1351 #endif
1352 
1353 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1354 {
1355 #ifdef CONFIG_SCHEDSTATS
1356 	struct task_struct *tsk = NULL;
1357 
1358 	if (entity_is_task(se))
1359 		tsk = task_of(se);
1360 
1361 	if (se->statistics.sleep_start) {
1362 		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
1363 
1364 		if ((s64)delta < 0)
1365 			delta = 0;
1366 
1367 		if (unlikely(delta > se->statistics.sleep_max))
1368 			se->statistics.sleep_max = delta;
1369 
1370 		se->statistics.sleep_start = 0;
1371 		se->statistics.sum_sleep_runtime += delta;
1372 
1373 		if (tsk) {
1374 			account_scheduler_latency(tsk, delta >> 10, 1);
1375 			trace_sched_stat_sleep(tsk, delta);
1376 		}
1377 	}
1378 	if (se->statistics.block_start) {
1379 		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
1380 
1381 		if ((s64)delta < 0)
1382 			delta = 0;
1383 
1384 		if (unlikely(delta > se->statistics.block_max))
1385 			se->statistics.block_max = delta;
1386 
1387 		se->statistics.block_start = 0;
1388 		se->statistics.sum_sleep_runtime += delta;
1389 
1390 		if (tsk) {
1391 			if (tsk->in_iowait) {
1392 				se->statistics.iowait_sum += delta;
1393 				se->statistics.iowait_count++;
1394 				trace_sched_stat_iowait(tsk, delta);
1395 			}
1396 
1397 			trace_sched_stat_blocked(tsk, delta);
1398 
1399 			/*
1400 			 * Blocking time is in units of nanosecs, so shift by
1401 			 * 20 to get a milliseconds-range estimation of the
1402 			 * amount of time that the task spent sleeping:
1403 			 */
1404 			if (unlikely(prof_on == SLEEP_PROFILING)) {
1405 				profile_hits(SLEEP_PROFILING,
1406 						(void *)get_wchan(tsk),
1407 						delta >> 20);
1408 			}
1409 			account_scheduler_latency(tsk, delta >> 10, 0);
1410 		}
1411 	}
1412 #endif
1413 }
1414 
1415 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1416 {
1417 #ifdef CONFIG_SCHED_DEBUG
1418 	s64 d = se->vruntime - cfs_rq->min_vruntime;
1419 
1420 	if (d < 0)
1421 		d = -d;
1422 
1423 	if (d > 3*sysctl_sched_latency)
1424 		schedstat_inc(cfs_rq, nr_spread_over);
1425 #endif
1426 }
1427 
1428 static void
1429 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1430 {
1431 	u64 vruntime = cfs_rq->min_vruntime;
1432 
1433 	/*
1434 	 * The 'current' period is already promised to the current tasks,
1435 	 * however the extra weight of the new task will slow them down a
1436 	 * little, place the new task so that it fits in the slot that
1437 	 * stays open at the end.
1438 	 */
1439 	if (initial && sched_feat(START_DEBIT))
1440 		vruntime += sched_vslice(cfs_rq, se);
1441 
1442 	/* sleeps up to a single latency don't count. */
1443 	if (!initial) {
1444 		unsigned long thresh = sysctl_sched_latency;
1445 
1446 		/*
1447 		 * Halve their sleep time's effect, to allow
1448 		 * for a gentler effect of sleepers:
1449 		 */
1450 		if (sched_feat(GENTLE_FAIR_SLEEPERS))
1451 			thresh >>= 1;
1452 
1453 		vruntime -= thresh;
1454 	}
1455 
1456 	/* ensure we never gain time by being placed backwards. */
1457 	vruntime = max_vruntime(se->vruntime, vruntime);
1458 
1459 	se->vruntime = vruntime;
1460 }
1461 
1462 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1463 
1464 static void
1465 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1466 {
1467 	/*
1468 	 * Update the normalized vruntime before updating min_vruntime
1469 	 * through callig update_curr().
1470 	 */
1471 	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1472 		se->vruntime += cfs_rq->min_vruntime;
1473 
1474 	/*
1475 	 * Update run-time statistics of the 'current'.
1476 	 */
1477 	update_curr(cfs_rq);
1478 	account_entity_enqueue(cfs_rq, se);
1479 	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
1480 
1481 	if (flags & ENQUEUE_WAKEUP) {
1482 		place_entity(cfs_rq, se, 0);
1483 		enqueue_sleeper(cfs_rq, se);
1484 	}
1485 
1486 	update_stats_enqueue(cfs_rq, se);
1487 	check_spread(cfs_rq, se);
1488 	if (se != cfs_rq->curr)
1489 		__enqueue_entity(cfs_rq, se);
1490 	se->on_rq = 1;
1491 
1492 	if (cfs_rq->nr_running == 1) {
1493 		list_add_leaf_cfs_rq(cfs_rq);
1494 		check_enqueue_throttle(cfs_rq);
1495 	}
1496 }
1497 
1498 static void __clear_buddies_last(struct sched_entity *se)
1499 {
1500 	for_each_sched_entity(se) {
1501 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1502 		if (cfs_rq->last == se)
1503 			cfs_rq->last = NULL;
1504 		else
1505 			break;
1506 	}
1507 }
1508 
1509 static void __clear_buddies_next(struct sched_entity *se)
1510 {
1511 	for_each_sched_entity(se) {
1512 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1513 		if (cfs_rq->next == se)
1514 			cfs_rq->next = NULL;
1515 		else
1516 			break;
1517 	}
1518 }
1519 
1520 static void __clear_buddies_skip(struct sched_entity *se)
1521 {
1522 	for_each_sched_entity(se) {
1523 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1524 		if (cfs_rq->skip == se)
1525 			cfs_rq->skip = NULL;
1526 		else
1527 			break;
1528 	}
1529 }
1530 
1531 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1532 {
1533 	if (cfs_rq->last == se)
1534 		__clear_buddies_last(se);
1535 
1536 	if (cfs_rq->next == se)
1537 		__clear_buddies_next(se);
1538 
1539 	if (cfs_rq->skip == se)
1540 		__clear_buddies_skip(se);
1541 }
1542 
1543 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1544 
1545 static void
1546 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1547 {
1548 	/*
1549 	 * Update run-time statistics of the 'current'.
1550 	 */
1551 	update_curr(cfs_rq);
1552 
1553 	update_stats_dequeue(cfs_rq, se);
1554 	if (flags & DEQUEUE_SLEEP) {
1555 #ifdef CONFIG_SCHEDSTATS
1556 		if (entity_is_task(se)) {
1557 			struct task_struct *tsk = task_of(se);
1558 
1559 			if (tsk->state & TASK_INTERRUPTIBLE)
1560 				se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1561 			if (tsk->state & TASK_UNINTERRUPTIBLE)
1562 				se->statistics.block_start = rq_of(cfs_rq)->clock;
1563 		}
1564 #endif
1565 	}
1566 
1567 	clear_buddies(cfs_rq, se);
1568 
1569 	if (se != cfs_rq->curr)
1570 		__dequeue_entity(cfs_rq, se);
1571 	account_entity_dequeue(cfs_rq, se);
1572 	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
1573 
1574 	/*
1575 	 * Normalize the entity after updating the min_vruntime because the
1576 	 * update can refer to the ->curr item and we need to reflect this
1577 	 * movement in our normalized position.
1578 	 */
1579 	if (!(flags & DEQUEUE_SLEEP))
1580 		se->vruntime -= cfs_rq->min_vruntime;
1581 
1582 	/* return excess runtime on last dequeue */
1583 	return_cfs_rq_runtime(cfs_rq);
1584 
1585 	update_min_vruntime(cfs_rq);
1586 	se->on_rq = 0;
1587 }
1588 
1589 /*
1590  * Preempt the current task with a newly woken task if needed:
1591  */
1592 static void
1593 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1594 {
1595 	unsigned long ideal_runtime, delta_exec;
1596 	struct sched_entity *se;
1597 	s64 delta;
1598 
1599 	ideal_runtime = sched_slice(cfs_rq, curr);
1600 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1601 	if (delta_exec > ideal_runtime) {
1602 		resched_task(rq_of(cfs_rq)->curr);
1603 		/*
1604 		 * The current task ran long enough, ensure it doesn't get
1605 		 * re-elected due to buddy favours.
1606 		 */
1607 		clear_buddies(cfs_rq, curr);
1608 		return;
1609 	}
1610 
1611 	/*
1612 	 * Ensure that a task that missed wakeup preemption by a
1613 	 * narrow margin doesn't have to wait for a full slice.
1614 	 * This also mitigates buddy induced latencies under load.
1615 	 */
1616 	if (delta_exec < sysctl_sched_min_granularity)
1617 		return;
1618 
1619 	se = __pick_first_entity(cfs_rq);
1620 	delta = curr->vruntime - se->vruntime;
1621 
1622 	if (delta < 0)
1623 		return;
1624 
1625 	if (delta > ideal_runtime)
1626 		resched_task(rq_of(cfs_rq)->curr);
1627 }
1628 
1629 static void
1630 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1631 {
1632 	/* 'current' is not kept within the tree. */
1633 	if (se->on_rq) {
1634 		/*
1635 		 * Any task has to be enqueued before it get to execute on
1636 		 * a CPU. So account for the time it spent waiting on the
1637 		 * runqueue.
1638 		 */
1639 		update_stats_wait_end(cfs_rq, se);
1640 		__dequeue_entity(cfs_rq, se);
1641 	}
1642 
1643 	update_stats_curr_start(cfs_rq, se);
1644 	cfs_rq->curr = se;
1645 #ifdef CONFIG_SCHEDSTATS
1646 	/*
1647 	 * Track our maximum slice length, if the CPU's load is at
1648 	 * least twice that of our own weight (i.e. dont track it
1649 	 * when there are only lesser-weight tasks around):
1650 	 */
1651 	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
1652 		se->statistics.slice_max = max(se->statistics.slice_max,
1653 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
1654 	}
1655 #endif
1656 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
1657 }
1658 
1659 static int
1660 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1661 
1662 /*
1663  * Pick the next process, keeping these things in mind, in this order:
1664  * 1) keep things fair between processes/task groups
1665  * 2) pick the "next" process, since someone really wants that to run
1666  * 3) pick the "last" process, for cache locality
1667  * 4) do not run the "skip" process, if something else is available
1668  */
1669 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1670 {
1671 	struct sched_entity *se = __pick_first_entity(cfs_rq);
1672 	struct sched_entity *left = se;
1673 
1674 	/*
1675 	 * Avoid running the skip buddy, if running something else can
1676 	 * be done without getting too unfair.
1677 	 */
1678 	if (cfs_rq->skip == se) {
1679 		struct sched_entity *second = __pick_next_entity(se);
1680 		if (second && wakeup_preempt_entity(second, left) < 1)
1681 			se = second;
1682 	}
1683 
1684 	/*
1685 	 * Prefer last buddy, try to return the CPU to a preempted task.
1686 	 */
1687 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1688 		se = cfs_rq->last;
1689 
1690 	/*
1691 	 * Someone really wants this to run. If it's not unfair, run it.
1692 	 */
1693 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1694 		se = cfs_rq->next;
1695 
1696 	clear_buddies(cfs_rq, se);
1697 
1698 	return se;
1699 }
1700 
1701 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1702 
1703 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1704 {
1705 	/*
1706 	 * If still on the runqueue then deactivate_task()
1707 	 * was not called and update_curr() has to be done:
1708 	 */
1709 	if (prev->on_rq)
1710 		update_curr(cfs_rq);
1711 
1712 	/* throttle cfs_rqs exceeding runtime */
1713 	check_cfs_rq_runtime(cfs_rq);
1714 
1715 	check_spread(cfs_rq, prev);
1716 	if (prev->on_rq) {
1717 		update_stats_wait_start(cfs_rq, prev);
1718 		/* Put 'current' back into the tree. */
1719 		__enqueue_entity(cfs_rq, prev);
1720 		/* in !on_rq case, update occurred at dequeue */
1721 		update_entity_load_avg(prev, 1);
1722 	}
1723 	cfs_rq->curr = NULL;
1724 }
1725 
1726 static void
1727 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1728 {
1729 	/*
1730 	 * Update run-time statistics of the 'current'.
1731 	 */
1732 	update_curr(cfs_rq);
1733 
1734 	/*
1735 	 * Ensure that runnable average is periodically updated.
1736 	 */
1737 	update_entity_load_avg(curr, 1);
1738 	update_cfs_rq_blocked_load(cfs_rq, 1);
1739 
1740 #ifdef CONFIG_SCHED_HRTICK
1741 	/*
1742 	 * queued ticks are scheduled to match the slice, so don't bother
1743 	 * validating it and just reschedule.
1744 	 */
1745 	if (queued) {
1746 		resched_task(rq_of(cfs_rq)->curr);
1747 		return;
1748 	}
1749 	/*
1750 	 * don't let the period tick interfere with the hrtick preemption
1751 	 */
1752 	if (!sched_feat(DOUBLE_TICK) &&
1753 			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
1754 		return;
1755 #endif
1756 
1757 	if (cfs_rq->nr_running > 1)
1758 		check_preempt_tick(cfs_rq, curr);
1759 }
1760 
1761 
1762 /**************************************************
1763  * CFS bandwidth control machinery
1764  */
1765 
1766 #ifdef CONFIG_CFS_BANDWIDTH
1767 
1768 #ifdef HAVE_JUMP_LABEL
1769 static struct static_key __cfs_bandwidth_used;
1770 
1771 static inline bool cfs_bandwidth_used(void)
1772 {
1773 	return static_key_false(&__cfs_bandwidth_used);
1774 }
1775 
1776 void account_cfs_bandwidth_used(int enabled, int was_enabled)
1777 {
1778 	/* only need to count groups transitioning between enabled/!enabled */
1779 	if (enabled && !was_enabled)
1780 		static_key_slow_inc(&__cfs_bandwidth_used);
1781 	else if (!enabled && was_enabled)
1782 		static_key_slow_dec(&__cfs_bandwidth_used);
1783 }
1784 #else /* HAVE_JUMP_LABEL */
1785 static bool cfs_bandwidth_used(void)
1786 {
1787 	return true;
1788 }
1789 
1790 void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
1791 #endif /* HAVE_JUMP_LABEL */
1792 
1793 /*
1794  * default period for cfs group bandwidth.
1795  * default: 0.1s, units: nanoseconds
1796  */
1797 static inline u64 default_cfs_period(void)
1798 {
1799 	return 100000000ULL;
1800 }
1801 
1802 static inline u64 sched_cfs_bandwidth_slice(void)
1803 {
1804 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1805 }
1806 
1807 /*
1808  * Replenish runtime according to assigned quota and update expiration time.
1809  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
1810  * additional synchronization around rq->lock.
1811  *
1812  * requires cfs_b->lock
1813  */
1814 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1815 {
1816 	u64 now;
1817 
1818 	if (cfs_b->quota == RUNTIME_INF)
1819 		return;
1820 
1821 	now = sched_clock_cpu(smp_processor_id());
1822 	cfs_b->runtime = cfs_b->quota;
1823 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1824 }
1825 
1826 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1827 {
1828 	return &tg->cfs_bandwidth;
1829 }
1830 
1831 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
1832 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
1833 {
1834 	if (unlikely(cfs_rq->throttle_count))
1835 		return cfs_rq->throttled_clock_task;
1836 
1837 	return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
1838 }
1839 
1840 /* returns 0 on failure to allocate runtime */
1841 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1842 {
1843 	struct task_group *tg = cfs_rq->tg;
1844 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1845 	u64 amount = 0, min_amount, expires;
1846 
1847 	/* note: this is a positive sum as runtime_remaining <= 0 */
1848 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1849 
1850 	raw_spin_lock(&cfs_b->lock);
1851 	if (cfs_b->quota == RUNTIME_INF)
1852 		amount = min_amount;
1853 	else {
1854 		/*
1855 		 * If the bandwidth pool has become inactive, then at least one
1856 		 * period must have elapsed since the last consumption.
1857 		 * Refresh the global state and ensure bandwidth timer becomes
1858 		 * active.
1859 		 */
1860 		if (!cfs_b->timer_active) {
1861 			__refill_cfs_bandwidth_runtime(cfs_b);
1862 			__start_cfs_bandwidth(cfs_b);
1863 		}
1864 
1865 		if (cfs_b->runtime > 0) {
1866 			amount = min(cfs_b->runtime, min_amount);
1867 			cfs_b->runtime -= amount;
1868 			cfs_b->idle = 0;
1869 		}
1870 	}
1871 	expires = cfs_b->runtime_expires;
1872 	raw_spin_unlock(&cfs_b->lock);
1873 
1874 	cfs_rq->runtime_remaining += amount;
1875 	/*
1876 	 * we may have advanced our local expiration to account for allowed
1877 	 * spread between our sched_clock and the one on which runtime was
1878 	 * issued.
1879 	 */
1880 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1881 		cfs_rq->runtime_expires = expires;
1882 
1883 	return cfs_rq->runtime_remaining > 0;
1884 }
1885 
1886 /*
1887  * Note: This depends on the synchronization provided by sched_clock and the
1888  * fact that rq->clock snapshots this value.
1889  */
1890 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1891 {
1892 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1893 	struct rq *rq = rq_of(cfs_rq);
1894 
1895 	/* if the deadline is ahead of our clock, nothing to do */
1896 	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1897 		return;
1898 
1899 	if (cfs_rq->runtime_remaining < 0)
1900 		return;
1901 
1902 	/*
1903 	 * If the local deadline has passed we have to consider the
1904 	 * possibility that our sched_clock is 'fast' and the global deadline
1905 	 * has not truly expired.
1906 	 *
1907 	 * Fortunately we can check determine whether this the case by checking
1908 	 * whether the global deadline has advanced.
1909 	 */
1910 
1911 	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1912 		/* extend local deadline, drift is bounded above by 2 ticks */
1913 		cfs_rq->runtime_expires += TICK_NSEC;
1914 	} else {
1915 		/* global deadline is ahead, expiration has passed */
1916 		cfs_rq->runtime_remaining = 0;
1917 	}
1918 }
1919 
1920 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1921 				     unsigned long delta_exec)
1922 {
1923 	/* dock delta_exec before expiring quota (as it could span periods) */
1924 	cfs_rq->runtime_remaining -= delta_exec;
1925 	expire_cfs_rq_runtime(cfs_rq);
1926 
1927 	if (likely(cfs_rq->runtime_remaining > 0))
1928 		return;
1929 
1930 	/*
1931 	 * if we're unable to extend our runtime we resched so that the active
1932 	 * hierarchy can be throttled
1933 	 */
1934 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1935 		resched_task(rq_of(cfs_rq)->curr);
1936 }
1937 
1938 static __always_inline
1939 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
1940 {
1941 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1942 		return;
1943 
1944 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
1945 }
1946 
1947 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1948 {
1949 	return cfs_bandwidth_used() && cfs_rq->throttled;
1950 }
1951 
1952 /* check whether cfs_rq, or any parent, is throttled */
1953 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1954 {
1955 	return cfs_bandwidth_used() && cfs_rq->throttle_count;
1956 }
1957 
1958 /*
1959  * Ensure that neither of the group entities corresponding to src_cpu or
1960  * dest_cpu are members of a throttled hierarchy when performing group
1961  * load-balance operations.
1962  */
1963 static inline int throttled_lb_pair(struct task_group *tg,
1964 				    int src_cpu, int dest_cpu)
1965 {
1966 	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1967 
1968 	src_cfs_rq = tg->cfs_rq[src_cpu];
1969 	dest_cfs_rq = tg->cfs_rq[dest_cpu];
1970 
1971 	return throttled_hierarchy(src_cfs_rq) ||
1972 	       throttled_hierarchy(dest_cfs_rq);
1973 }
1974 
1975 /* updated child weight may affect parent so we have to do this bottom up */
1976 static int tg_unthrottle_up(struct task_group *tg, void *data)
1977 {
1978 	struct rq *rq = data;
1979 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1980 
1981 	cfs_rq->throttle_count--;
1982 #ifdef CONFIG_SMP
1983 	if (!cfs_rq->throttle_count) {
1984 		/* adjust cfs_rq_clock_task() */
1985 		cfs_rq->throttled_clock_task_time += rq->clock_task -
1986 					     cfs_rq->throttled_clock_task;
1987 	}
1988 #endif
1989 
1990 	return 0;
1991 }
1992 
1993 static int tg_throttle_down(struct task_group *tg, void *data)
1994 {
1995 	struct rq *rq = data;
1996 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1997 
1998 	/* group is entering throttled state, stop time */
1999 	if (!cfs_rq->throttle_count)
2000 		cfs_rq->throttled_clock_task = rq->clock_task;
2001 	cfs_rq->throttle_count++;
2002 
2003 	return 0;
2004 }
2005 
2006 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2007 {
2008 	struct rq *rq = rq_of(cfs_rq);
2009 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2010 	struct sched_entity *se;
2011 	long task_delta, dequeue = 1;
2012 
2013 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2014 
2015 	/* freeze hierarchy runnable averages while throttled */
2016 	rcu_read_lock();
2017 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
2018 	rcu_read_unlock();
2019 
2020 	task_delta = cfs_rq->h_nr_running;
2021 	for_each_sched_entity(se) {
2022 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
2023 		/* throttled entity or throttle-on-deactivate */
2024 		if (!se->on_rq)
2025 			break;
2026 
2027 		if (dequeue)
2028 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
2029 		qcfs_rq->h_nr_running -= task_delta;
2030 
2031 		if (qcfs_rq->load.weight)
2032 			dequeue = 0;
2033 	}
2034 
2035 	if (!se)
2036 		rq->nr_running -= task_delta;
2037 
2038 	cfs_rq->throttled = 1;
2039 	cfs_rq->throttled_clock = rq->clock;
2040 	raw_spin_lock(&cfs_b->lock);
2041 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2042 	raw_spin_unlock(&cfs_b->lock);
2043 }
2044 
2045 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
2046 {
2047 	struct rq *rq = rq_of(cfs_rq);
2048 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2049 	struct sched_entity *se;
2050 	int enqueue = 1;
2051 	long task_delta;
2052 
2053 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2054 
2055 	cfs_rq->throttled = 0;
2056 	raw_spin_lock(&cfs_b->lock);
2057 	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
2058 	list_del_rcu(&cfs_rq->throttled_list);
2059 	raw_spin_unlock(&cfs_b->lock);
2060 
2061 	update_rq_clock(rq);
2062 	/* update hierarchical throttle state */
2063 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2064 
2065 	if (!cfs_rq->load.weight)
2066 		return;
2067 
2068 	task_delta = cfs_rq->h_nr_running;
2069 	for_each_sched_entity(se) {
2070 		if (se->on_rq)
2071 			enqueue = 0;
2072 
2073 		cfs_rq = cfs_rq_of(se);
2074 		if (enqueue)
2075 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
2076 		cfs_rq->h_nr_running += task_delta;
2077 
2078 		if (cfs_rq_throttled(cfs_rq))
2079 			break;
2080 	}
2081 
2082 	if (!se)
2083 		rq->nr_running += task_delta;
2084 
2085 	/* determine whether we need to wake up potentially idle cpu */
2086 	if (rq->curr == rq->idle && rq->cfs.nr_running)
2087 		resched_task(rq->curr);
2088 }
2089 
2090 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
2091 		u64 remaining, u64 expires)
2092 {
2093 	struct cfs_rq *cfs_rq;
2094 	u64 runtime = remaining;
2095 
2096 	rcu_read_lock();
2097 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
2098 				throttled_list) {
2099 		struct rq *rq = rq_of(cfs_rq);
2100 
2101 		raw_spin_lock(&rq->lock);
2102 		if (!cfs_rq_throttled(cfs_rq))
2103 			goto next;
2104 
2105 		runtime = -cfs_rq->runtime_remaining + 1;
2106 		if (runtime > remaining)
2107 			runtime = remaining;
2108 		remaining -= runtime;
2109 
2110 		cfs_rq->runtime_remaining += runtime;
2111 		cfs_rq->runtime_expires = expires;
2112 
2113 		/* we check whether we're throttled above */
2114 		if (cfs_rq->runtime_remaining > 0)
2115 			unthrottle_cfs_rq(cfs_rq);
2116 
2117 next:
2118 		raw_spin_unlock(&rq->lock);
2119 
2120 		if (!remaining)
2121 			break;
2122 	}
2123 	rcu_read_unlock();
2124 
2125 	return remaining;
2126 }
2127 
2128 /*
2129  * Responsible for refilling a task_group's bandwidth and unthrottling its
2130  * cfs_rqs as appropriate. If there has been no activity within the last
2131  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
2132  * used to track this state.
2133  */
2134 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2135 {
2136 	u64 runtime, runtime_expires;
2137 	int idle = 1, throttled;
2138 
2139 	raw_spin_lock(&cfs_b->lock);
2140 	/* no need to continue the timer with no bandwidth constraint */
2141 	if (cfs_b->quota == RUNTIME_INF)
2142 		goto out_unlock;
2143 
2144 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2145 	/* idle depends on !throttled (for the case of a large deficit) */
2146 	idle = cfs_b->idle && !throttled;
2147 	cfs_b->nr_periods += overrun;
2148 
2149 	/* if we're going inactive then everything else can be deferred */
2150 	if (idle)
2151 		goto out_unlock;
2152 
2153 	__refill_cfs_bandwidth_runtime(cfs_b);
2154 
2155 	if (!throttled) {
2156 		/* mark as potentially idle for the upcoming period */
2157 		cfs_b->idle = 1;
2158 		goto out_unlock;
2159 	}
2160 
2161 	/* account preceding periods in which throttling occurred */
2162 	cfs_b->nr_throttled += overrun;
2163 
2164 	/*
2165 	 * There are throttled entities so we must first use the new bandwidth
2166 	 * to unthrottle them before making it generally available.  This
2167 	 * ensures that all existing debts will be paid before a new cfs_rq is
2168 	 * allowed to run.
2169 	 */
2170 	runtime = cfs_b->runtime;
2171 	runtime_expires = cfs_b->runtime_expires;
2172 	cfs_b->runtime = 0;
2173 
2174 	/*
2175 	 * This check is repeated as we are holding onto the new bandwidth
2176 	 * while we unthrottle.  This can potentially race with an unthrottled
2177 	 * group trying to acquire new bandwidth from the global pool.
2178 	 */
2179 	while (throttled && runtime > 0) {
2180 		raw_spin_unlock(&cfs_b->lock);
2181 		/* we can't nest cfs_b->lock while distributing bandwidth */
2182 		runtime = distribute_cfs_runtime(cfs_b, runtime,
2183 						 runtime_expires);
2184 		raw_spin_lock(&cfs_b->lock);
2185 
2186 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2187 	}
2188 
2189 	/* return (any) remaining runtime */
2190 	cfs_b->runtime = runtime;
2191 	/*
2192 	 * While we are ensured activity in the period following an
2193 	 * unthrottle, this also covers the case in which the new bandwidth is
2194 	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
2195 	 * timer to remain active while there are any throttled entities.)
2196 	 */
2197 	cfs_b->idle = 0;
2198 out_unlock:
2199 	if (idle)
2200 		cfs_b->timer_active = 0;
2201 	raw_spin_unlock(&cfs_b->lock);
2202 
2203 	return idle;
2204 }
2205 
2206 /* a cfs_rq won't donate quota below this amount */
2207 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
2208 /* minimum remaining period time to redistribute slack quota */
2209 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2210 /* how long we wait to gather additional slack before distributing */
2211 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2212 
2213 /* are we near the end of the current quota period? */
2214 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2215 {
2216 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
2217 	u64 remaining;
2218 
2219 	/* if the call-back is running a quota refresh is already occurring */
2220 	if (hrtimer_callback_running(refresh_timer))
2221 		return 1;
2222 
2223 	/* is a quota refresh about to occur? */
2224 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
2225 	if (remaining < min_expire)
2226 		return 1;
2227 
2228 	return 0;
2229 }
2230 
2231 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
2232 {
2233 	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
2234 
2235 	/* if there's a quota refresh soon don't bother with slack */
2236 	if (runtime_refresh_within(cfs_b, min_left))
2237 		return;
2238 
2239 	start_bandwidth_timer(&cfs_b->slack_timer,
2240 				ns_to_ktime(cfs_bandwidth_slack_period));
2241 }
2242 
2243 /* we know any runtime found here is valid as update_curr() precedes return */
2244 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2245 {
2246 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2247 	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
2248 
2249 	if (slack_runtime <= 0)
2250 		return;
2251 
2252 	raw_spin_lock(&cfs_b->lock);
2253 	if (cfs_b->quota != RUNTIME_INF &&
2254 	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
2255 		cfs_b->runtime += slack_runtime;
2256 
2257 		/* we are under rq->lock, defer unthrottling using a timer */
2258 		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
2259 		    !list_empty(&cfs_b->throttled_cfs_rq))
2260 			start_cfs_slack_bandwidth(cfs_b);
2261 	}
2262 	raw_spin_unlock(&cfs_b->lock);
2263 
2264 	/* even if it's not valid for return we don't want to try again */
2265 	cfs_rq->runtime_remaining -= slack_runtime;
2266 }
2267 
2268 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2269 {
2270 	if (!cfs_bandwidth_used())
2271 		return;
2272 
2273 	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
2274 		return;
2275 
2276 	__return_cfs_rq_runtime(cfs_rq);
2277 }
2278 
2279 /*
2280  * This is done with a timer (instead of inline with bandwidth return) since
2281  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
2282  */
2283 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2284 {
2285 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
2286 	u64 expires;
2287 
2288 	/* confirm we're still not at a refresh boundary */
2289 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
2290 		return;
2291 
2292 	raw_spin_lock(&cfs_b->lock);
2293 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2294 		runtime = cfs_b->runtime;
2295 		cfs_b->runtime = 0;
2296 	}
2297 	expires = cfs_b->runtime_expires;
2298 	raw_spin_unlock(&cfs_b->lock);
2299 
2300 	if (!runtime)
2301 		return;
2302 
2303 	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
2304 
2305 	raw_spin_lock(&cfs_b->lock);
2306 	if (expires == cfs_b->runtime_expires)
2307 		cfs_b->runtime = runtime;
2308 	raw_spin_unlock(&cfs_b->lock);
2309 }
2310 
2311 /*
2312  * When a group wakes up we want to make sure that its quota is not already
2313  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
2314  * runtime as update_curr() throttling can not not trigger until it's on-rq.
2315  */
2316 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
2317 {
2318 	if (!cfs_bandwidth_used())
2319 		return;
2320 
2321 	/* an active group must be handled by the update_curr()->put() path */
2322 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
2323 		return;
2324 
2325 	/* ensure the group is not already throttled */
2326 	if (cfs_rq_throttled(cfs_rq))
2327 		return;
2328 
2329 	/* update runtime allocation */
2330 	account_cfs_rq_runtime(cfs_rq, 0);
2331 	if (cfs_rq->runtime_remaining <= 0)
2332 		throttle_cfs_rq(cfs_rq);
2333 }
2334 
2335 /* conditionally throttle active cfs_rq's from put_prev_entity() */
2336 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2337 {
2338 	if (!cfs_bandwidth_used())
2339 		return;
2340 
2341 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
2342 		return;
2343 
2344 	/*
2345 	 * it's possible for a throttled entity to be forced into a running
2346 	 * state (e.g. set_curr_task), in this case we're finished.
2347 	 */
2348 	if (cfs_rq_throttled(cfs_rq))
2349 		return;
2350 
2351 	throttle_cfs_rq(cfs_rq);
2352 }
2353 
2354 static inline u64 default_cfs_period(void);
2355 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2356 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2357 
2358 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2359 {
2360 	struct cfs_bandwidth *cfs_b =
2361 		container_of(timer, struct cfs_bandwidth, slack_timer);
2362 	do_sched_cfs_slack_timer(cfs_b);
2363 
2364 	return HRTIMER_NORESTART;
2365 }
2366 
2367 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2368 {
2369 	struct cfs_bandwidth *cfs_b =
2370 		container_of(timer, struct cfs_bandwidth, period_timer);
2371 	ktime_t now;
2372 	int overrun;
2373 	int idle = 0;
2374 
2375 	for (;;) {
2376 		now = hrtimer_cb_get_time(timer);
2377 		overrun = hrtimer_forward(timer, now, cfs_b->period);
2378 
2379 		if (!overrun)
2380 			break;
2381 
2382 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
2383 	}
2384 
2385 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2386 }
2387 
2388 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2389 {
2390 	raw_spin_lock_init(&cfs_b->lock);
2391 	cfs_b->runtime = 0;
2392 	cfs_b->quota = RUNTIME_INF;
2393 	cfs_b->period = ns_to_ktime(default_cfs_period());
2394 
2395 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2396 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2397 	cfs_b->period_timer.function = sched_cfs_period_timer;
2398 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2399 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
2400 }
2401 
2402 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2403 {
2404 	cfs_rq->runtime_enabled = 0;
2405 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
2406 }
2407 
2408 /* requires cfs_b->lock, may release to reprogram timer */
2409 void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2410 {
2411 	/*
2412 	 * The timer may be active because we're trying to set a new bandwidth
2413 	 * period or because we're racing with the tear-down path
2414 	 * (timer_active==0 becomes visible before the hrtimer call-back
2415 	 * terminates).  In either case we ensure that it's re-programmed
2416 	 */
2417 	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2418 		raw_spin_unlock(&cfs_b->lock);
2419 		/* ensure cfs_b->lock is available while we wait */
2420 		hrtimer_cancel(&cfs_b->period_timer);
2421 
2422 		raw_spin_lock(&cfs_b->lock);
2423 		/* if someone else restarted the timer then we're done */
2424 		if (cfs_b->timer_active)
2425 			return;
2426 	}
2427 
2428 	cfs_b->timer_active = 1;
2429 	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2430 }
2431 
2432 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2433 {
2434 	hrtimer_cancel(&cfs_b->period_timer);
2435 	hrtimer_cancel(&cfs_b->slack_timer);
2436 }
2437 
2438 static void unthrottle_offline_cfs_rqs(struct rq *rq)
2439 {
2440 	struct cfs_rq *cfs_rq;
2441 
2442 	for_each_leaf_cfs_rq(rq, cfs_rq) {
2443 		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2444 
2445 		if (!cfs_rq->runtime_enabled)
2446 			continue;
2447 
2448 		/*
2449 		 * clock_task is not advancing so we just need to make sure
2450 		 * there's some valid quota amount
2451 		 */
2452 		cfs_rq->runtime_remaining = cfs_b->quota;
2453 		if (cfs_rq_throttled(cfs_rq))
2454 			unthrottle_cfs_rq(cfs_rq);
2455 	}
2456 }
2457 
2458 #else /* CONFIG_CFS_BANDWIDTH */
2459 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2460 {
2461 	return rq_of(cfs_rq)->clock_task;
2462 }
2463 
2464 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2465 				     unsigned long delta_exec) {}
2466 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2467 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2468 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2469 
2470 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2471 {
2472 	return 0;
2473 }
2474 
2475 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2476 {
2477 	return 0;
2478 }
2479 
2480 static inline int throttled_lb_pair(struct task_group *tg,
2481 				    int src_cpu, int dest_cpu)
2482 {
2483 	return 0;
2484 }
2485 
2486 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2487 
2488 #ifdef CONFIG_FAIR_GROUP_SCHED
2489 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2490 #endif
2491 
2492 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2493 {
2494 	return NULL;
2495 }
2496 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2497 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2498 
2499 #endif /* CONFIG_CFS_BANDWIDTH */
2500 
2501 /**************************************************
2502  * CFS operations on tasks:
2503  */
2504 
2505 #ifdef CONFIG_SCHED_HRTICK
2506 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2507 {
2508 	struct sched_entity *se = &p->se;
2509 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2510 
2511 	WARN_ON(task_rq(p) != rq);
2512 
2513 	if (cfs_rq->nr_running > 1) {
2514 		u64 slice = sched_slice(cfs_rq, se);
2515 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2516 		s64 delta = slice - ran;
2517 
2518 		if (delta < 0) {
2519 			if (rq->curr == p)
2520 				resched_task(p);
2521 			return;
2522 		}
2523 
2524 		/*
2525 		 * Don't schedule slices shorter than 10000ns, that just
2526 		 * doesn't make sense. Rely on vruntime for fairness.
2527 		 */
2528 		if (rq->curr != p)
2529 			delta = max_t(s64, 10000LL, delta);
2530 
2531 		hrtick_start(rq, delta);
2532 	}
2533 }
2534 
2535 /*
2536  * called from enqueue/dequeue and updates the hrtick when the
2537  * current task is from our class and nr_running is low enough
2538  * to matter.
2539  */
2540 static void hrtick_update(struct rq *rq)
2541 {
2542 	struct task_struct *curr = rq->curr;
2543 
2544 	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
2545 		return;
2546 
2547 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2548 		hrtick_start_fair(rq, curr);
2549 }
2550 #else /* !CONFIG_SCHED_HRTICK */
2551 static inline void
2552 hrtick_start_fair(struct rq *rq, struct task_struct *p)
2553 {
2554 }
2555 
2556 static inline void hrtick_update(struct rq *rq)
2557 {
2558 }
2559 #endif
2560 
2561 /*
2562  * The enqueue_task method is called before nr_running is
2563  * increased. Here we update the fair scheduling stats and
2564  * then put the task into the rbtree:
2565  */
2566 static void
2567 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2568 {
2569 	struct cfs_rq *cfs_rq;
2570 	struct sched_entity *se = &p->se;
2571 
2572 	for_each_sched_entity(se) {
2573 		if (se->on_rq)
2574 			break;
2575 		cfs_rq = cfs_rq_of(se);
2576 		enqueue_entity(cfs_rq, se, flags);
2577 
2578 		/*
2579 		 * end evaluation on encountering a throttled cfs_rq
2580 		 *
2581 		 * note: in the case of encountering a throttled cfs_rq we will
2582 		 * post the final h_nr_running increment below.
2583 		*/
2584 		if (cfs_rq_throttled(cfs_rq))
2585 			break;
2586 		cfs_rq->h_nr_running++;
2587 
2588 		flags = ENQUEUE_WAKEUP;
2589 	}
2590 
2591 	for_each_sched_entity(se) {
2592 		cfs_rq = cfs_rq_of(se);
2593 		cfs_rq->h_nr_running++;
2594 
2595 		if (cfs_rq_throttled(cfs_rq))
2596 			break;
2597 
2598 		update_entity_load_avg(se, 1);
2599 		update_cfs_rq_blocked_load(cfs_rq, 0);
2600 	}
2601 
2602 	if (!se) {
2603 		update_rq_runnable_avg(rq, rq->nr_running);
2604 		inc_nr_running(rq);
2605 	}
2606 	hrtick_update(rq);
2607 }
2608 
2609 static void set_next_buddy(struct sched_entity *se);
2610 
2611 /*
2612  * The dequeue_task method is called before nr_running is
2613  * decreased. We remove the task from the rbtree and
2614  * update the fair scheduling stats:
2615  */
2616 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2617 {
2618 	struct cfs_rq *cfs_rq;
2619 	struct sched_entity *se = &p->se;
2620 	int task_sleep = flags & DEQUEUE_SLEEP;
2621 
2622 	for_each_sched_entity(se) {
2623 		cfs_rq = cfs_rq_of(se);
2624 		dequeue_entity(cfs_rq, se, flags);
2625 
2626 		/*
2627 		 * end evaluation on encountering a throttled cfs_rq
2628 		 *
2629 		 * note: in the case of encountering a throttled cfs_rq we will
2630 		 * post the final h_nr_running decrement below.
2631 		*/
2632 		if (cfs_rq_throttled(cfs_rq))
2633 			break;
2634 		cfs_rq->h_nr_running--;
2635 
2636 		/* Don't dequeue parent if it has other entities besides us */
2637 		if (cfs_rq->load.weight) {
2638 			/*
2639 			 * Bias pick_next to pick a task from this cfs_rq, as
2640 			 * p is sleeping when it is within its sched_slice.
2641 			 */
2642 			if (task_sleep && parent_entity(se))
2643 				set_next_buddy(parent_entity(se));
2644 
2645 			/* avoid re-evaluating load for this entity */
2646 			se = parent_entity(se);
2647 			break;
2648 		}
2649 		flags |= DEQUEUE_SLEEP;
2650 	}
2651 
2652 	for_each_sched_entity(se) {
2653 		cfs_rq = cfs_rq_of(se);
2654 		cfs_rq->h_nr_running--;
2655 
2656 		if (cfs_rq_throttled(cfs_rq))
2657 			break;
2658 
2659 		update_entity_load_avg(se, 1);
2660 		update_cfs_rq_blocked_load(cfs_rq, 0);
2661 	}
2662 
2663 	if (!se) {
2664 		dec_nr_running(rq);
2665 		update_rq_runnable_avg(rq, 1);
2666 	}
2667 	hrtick_update(rq);
2668 }
2669 
2670 #ifdef CONFIG_SMP
2671 /* Used instead of source_load when we know the type == 0 */
2672 static unsigned long weighted_cpuload(const int cpu)
2673 {
2674 	return cpu_rq(cpu)->load.weight;
2675 }
2676 
2677 /*
2678  * Return a low guess at the load of a migration-source cpu weighted
2679  * according to the scheduling class and "nice" value.
2680  *
2681  * We want to under-estimate the load of migration sources, to
2682  * balance conservatively.
2683  */
2684 static unsigned long source_load(int cpu, int type)
2685 {
2686 	struct rq *rq = cpu_rq(cpu);
2687 	unsigned long total = weighted_cpuload(cpu);
2688 
2689 	if (type == 0 || !sched_feat(LB_BIAS))
2690 		return total;
2691 
2692 	return min(rq->cpu_load[type-1], total);
2693 }
2694 
2695 /*
2696  * Return a high guess at the load of a migration-target cpu weighted
2697  * according to the scheduling class and "nice" value.
2698  */
2699 static unsigned long target_load(int cpu, int type)
2700 {
2701 	struct rq *rq = cpu_rq(cpu);
2702 	unsigned long total = weighted_cpuload(cpu);
2703 
2704 	if (type == 0 || !sched_feat(LB_BIAS))
2705 		return total;
2706 
2707 	return max(rq->cpu_load[type-1], total);
2708 }
2709 
2710 static unsigned long power_of(int cpu)
2711 {
2712 	return cpu_rq(cpu)->cpu_power;
2713 }
2714 
2715 static unsigned long cpu_avg_load_per_task(int cpu)
2716 {
2717 	struct rq *rq = cpu_rq(cpu);
2718 	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2719 
2720 	if (nr_running)
2721 		return rq->load.weight / nr_running;
2722 
2723 	return 0;
2724 }
2725 
2726 
2727 static void task_waking_fair(struct task_struct *p)
2728 {
2729 	struct sched_entity *se = &p->se;
2730 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2731 	u64 min_vruntime;
2732 
2733 #ifndef CONFIG_64BIT
2734 	u64 min_vruntime_copy;
2735 
2736 	do {
2737 		min_vruntime_copy = cfs_rq->min_vruntime_copy;
2738 		smp_rmb();
2739 		min_vruntime = cfs_rq->min_vruntime;
2740 	} while (min_vruntime != min_vruntime_copy);
2741 #else
2742 	min_vruntime = cfs_rq->min_vruntime;
2743 #endif
2744 
2745 	se->vruntime -= min_vruntime;
2746 }
2747 
2748 #ifdef CONFIG_FAIR_GROUP_SCHED
2749 /*
2750  * effective_load() calculates the load change as seen from the root_task_group
2751  *
2752  * Adding load to a group doesn't make a group heavier, but can cause movement
2753  * of group shares between cpus. Assuming the shares were perfectly aligned one
2754  * can calculate the shift in shares.
2755  *
2756  * Calculate the effective load difference if @wl is added (subtracted) to @tg
2757  * on this @cpu and results in a total addition (subtraction) of @wg to the
2758  * total group weight.
2759  *
2760  * Given a runqueue weight distribution (rw_i) we can compute a shares
2761  * distribution (s_i) using:
2762  *
2763  *   s_i = rw_i / \Sum rw_j						(1)
2764  *
2765  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
2766  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
2767  * shares distribution (s_i):
2768  *
2769  *   rw_i = {   2,   4,   1,   0 }
2770  *   s_i  = { 2/7, 4/7, 1/7,   0 }
2771  *
2772  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
2773  * task used to run on and the CPU the waker is running on), we need to
2774  * compute the effect of waking a task on either CPU and, in case of a sync
2775  * wakeup, compute the effect of the current task going to sleep.
2776  *
2777  * So for a change of @wl to the local @cpu with an overall group weight change
2778  * of @wl we can compute the new shares distribution (s'_i) using:
2779  *
2780  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
2781  *
2782  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
2783  * differences in waking a task to CPU 0. The additional task changes the
2784  * weight and shares distributions like:
2785  *
2786  *   rw'_i = {   3,   4,   1,   0 }
2787  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
2788  *
2789  * We can then compute the difference in effective weight by using:
2790  *
2791  *   dw_i = S * (s'_i - s_i)						(3)
2792  *
2793  * Where 'S' is the group weight as seen by its parent.
2794  *
2795  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
2796  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
2797  * 4/7) times the weight of the group.
2798  */
2799 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
2800 {
2801 	struct sched_entity *se = tg->se[cpu];
2802 
2803 	if (!tg->parent)	/* the trivial, non-cgroup case */
2804 		return wl;
2805 
2806 	for_each_sched_entity(se) {
2807 		long w, W;
2808 
2809 		tg = se->my_q->tg;
2810 
2811 		/*
2812 		 * W = @wg + \Sum rw_j
2813 		 */
2814 		W = wg + calc_tg_weight(tg, se->my_q);
2815 
2816 		/*
2817 		 * w = rw_i + @wl
2818 		 */
2819 		w = se->my_q->load.weight + wl;
2820 
2821 		/*
2822 		 * wl = S * s'_i; see (2)
2823 		 */
2824 		if (W > 0 && w < W)
2825 			wl = (w * tg->shares) / W;
2826 		else
2827 			wl = tg->shares;
2828 
2829 		/*
2830 		 * Per the above, wl is the new se->load.weight value; since
2831 		 * those are clipped to [MIN_SHARES, ...) do so now. See
2832 		 * calc_cfs_shares().
2833 		 */
2834 		if (wl < MIN_SHARES)
2835 			wl = MIN_SHARES;
2836 
2837 		/*
2838 		 * wl = dw_i = S * (s'_i - s_i); see (3)
2839 		 */
2840 		wl -= se->load.weight;
2841 
2842 		/*
2843 		 * Recursively apply this logic to all parent groups to compute
2844 		 * the final effective load change on the root group. Since
2845 		 * only the @tg group gets extra weight, all parent groups can
2846 		 * only redistribute existing shares. @wl is the shift in shares
2847 		 * resulting from this level per the above.
2848 		 */
2849 		wg = 0;
2850 	}
2851 
2852 	return wl;
2853 }
2854 #else
2855 
2856 static inline unsigned long effective_load(struct task_group *tg, int cpu,
2857 		unsigned long wl, unsigned long wg)
2858 {
2859 	return wl;
2860 }
2861 
2862 #endif
2863 
2864 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
2865 {
2866 	s64 this_load, load;
2867 	int idx, this_cpu, prev_cpu;
2868 	unsigned long tl_per_task;
2869 	struct task_group *tg;
2870 	unsigned long weight;
2871 	int balanced;
2872 
2873 	idx	  = sd->wake_idx;
2874 	this_cpu  = smp_processor_id();
2875 	prev_cpu  = task_cpu(p);
2876 	load	  = source_load(prev_cpu, idx);
2877 	this_load = target_load(this_cpu, idx);
2878 
2879 	/*
2880 	 * If sync wakeup then subtract the (maximum possible)
2881 	 * effect of the currently running task from the load
2882 	 * of the current CPU:
2883 	 */
2884 	if (sync) {
2885 		tg = task_group(current);
2886 		weight = current->se.load.weight;
2887 
2888 		this_load += effective_load(tg, this_cpu, -weight, -weight);
2889 		load += effective_load(tg, prev_cpu, 0, -weight);
2890 	}
2891 
2892 	tg = task_group(p);
2893 	weight = p->se.load.weight;
2894 
2895 	/*
2896 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
2897 	 * due to the sync cause above having dropped this_load to 0, we'll
2898 	 * always have an imbalance, but there's really nothing you can do
2899 	 * about that, so that's good too.
2900 	 *
2901 	 * Otherwise check if either cpus are near enough in load to allow this
2902 	 * task to be woken on this_cpu.
2903 	 */
2904 	if (this_load > 0) {
2905 		s64 this_eff_load, prev_eff_load;
2906 
2907 		this_eff_load = 100;
2908 		this_eff_load *= power_of(prev_cpu);
2909 		this_eff_load *= this_load +
2910 			effective_load(tg, this_cpu, weight, weight);
2911 
2912 		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
2913 		prev_eff_load *= power_of(this_cpu);
2914 		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
2915 
2916 		balanced = this_eff_load <= prev_eff_load;
2917 	} else
2918 		balanced = true;
2919 
2920 	/*
2921 	 * If the currently running task will sleep within
2922 	 * a reasonable amount of time then attract this newly
2923 	 * woken task:
2924 	 */
2925 	if (sync && balanced)
2926 		return 1;
2927 
2928 	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
2929 	tl_per_task = cpu_avg_load_per_task(this_cpu);
2930 
2931 	if (balanced ||
2932 	    (this_load <= load &&
2933 	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
2934 		/*
2935 		 * This domain has SD_WAKE_AFFINE and
2936 		 * p is cache cold in this domain, and
2937 		 * there is no bad imbalance.
2938 		 */
2939 		schedstat_inc(sd, ttwu_move_affine);
2940 		schedstat_inc(p, se.statistics.nr_wakeups_affine);
2941 
2942 		return 1;
2943 	}
2944 	return 0;
2945 }
2946 
2947 /*
2948  * find_idlest_group finds and returns the least busy CPU group within the
2949  * domain.
2950  */
2951 static struct sched_group *
2952 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
2953 		  int this_cpu, int load_idx)
2954 {
2955 	struct sched_group *idlest = NULL, *group = sd->groups;
2956 	unsigned long min_load = ULONG_MAX, this_load = 0;
2957 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
2958 
2959 	do {
2960 		unsigned long load, avg_load;
2961 		int local_group;
2962 		int i;
2963 
2964 		/* Skip over this group if it has no CPUs allowed */
2965 		if (!cpumask_intersects(sched_group_cpus(group),
2966 					tsk_cpus_allowed(p)))
2967 			continue;
2968 
2969 		local_group = cpumask_test_cpu(this_cpu,
2970 					       sched_group_cpus(group));
2971 
2972 		/* Tally up the load of all CPUs in the group */
2973 		avg_load = 0;
2974 
2975 		for_each_cpu(i, sched_group_cpus(group)) {
2976 			/* Bias balancing toward cpus of our domain */
2977 			if (local_group)
2978 				load = source_load(i, load_idx);
2979 			else
2980 				load = target_load(i, load_idx);
2981 
2982 			avg_load += load;
2983 		}
2984 
2985 		/* Adjust by relative CPU power of the group */
2986 		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
2987 
2988 		if (local_group) {
2989 			this_load = avg_load;
2990 		} else if (avg_load < min_load) {
2991 			min_load = avg_load;
2992 			idlest = group;
2993 		}
2994 	} while (group = group->next, group != sd->groups);
2995 
2996 	if (!idlest || 100*this_load < imbalance*min_load)
2997 		return NULL;
2998 	return idlest;
2999 }
3000 
3001 /*
3002  * find_idlest_cpu - find the idlest cpu among the cpus in group.
3003  */
3004 static int
3005 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3006 {
3007 	unsigned long load, min_load = ULONG_MAX;
3008 	int idlest = -1;
3009 	int i;
3010 
3011 	/* Traverse only the allowed CPUs */
3012 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
3013 		load = weighted_cpuload(i);
3014 
3015 		if (load < min_load || (load == min_load && i == this_cpu)) {
3016 			min_load = load;
3017 			idlest = i;
3018 		}
3019 	}
3020 
3021 	return idlest;
3022 }
3023 
3024 /*
3025  * Try and locate an idle CPU in the sched_domain.
3026  */
3027 static int select_idle_sibling(struct task_struct *p, int target)
3028 {
3029 	int cpu = smp_processor_id();
3030 	int prev_cpu = task_cpu(p);
3031 	struct sched_domain *sd;
3032 	struct sched_group *sg;
3033 	int i;
3034 
3035 	/*
3036 	 * If the task is going to be woken-up on this cpu and if it is
3037 	 * already idle, then it is the right target.
3038 	 */
3039 	if (target == cpu && idle_cpu(cpu))
3040 		return cpu;
3041 
3042 	/*
3043 	 * If the task is going to be woken-up on the cpu where it previously
3044 	 * ran and if it is currently idle, then it the right target.
3045 	 */
3046 	if (target == prev_cpu && idle_cpu(prev_cpu))
3047 		return prev_cpu;
3048 
3049 	/*
3050 	 * Otherwise, iterate the domains and find an elegible idle cpu.
3051 	 */
3052 	sd = rcu_dereference(per_cpu(sd_llc, target));
3053 	for_each_lower_domain(sd) {
3054 		sg = sd->groups;
3055 		do {
3056 			if (!cpumask_intersects(sched_group_cpus(sg),
3057 						tsk_cpus_allowed(p)))
3058 				goto next;
3059 
3060 			for_each_cpu(i, sched_group_cpus(sg)) {
3061 				if (!idle_cpu(i))
3062 					goto next;
3063 			}
3064 
3065 			target = cpumask_first_and(sched_group_cpus(sg),
3066 					tsk_cpus_allowed(p));
3067 			goto done;
3068 next:
3069 			sg = sg->next;
3070 		} while (sg != sd->groups);
3071 	}
3072 done:
3073 	return target;
3074 }
3075 
3076 /*
3077  * sched_balance_self: balance the current task (running on cpu) in domains
3078  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
3079  * SD_BALANCE_EXEC.
3080  *
3081  * Balance, ie. select the least loaded group.
3082  *
3083  * Returns the target CPU number, or the same CPU if no balancing is needed.
3084  *
3085  * preempt must be disabled.
3086  */
3087 static int
3088 select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
3089 {
3090 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3091 	int cpu = smp_processor_id();
3092 	int prev_cpu = task_cpu(p);
3093 	int new_cpu = cpu;
3094 	int want_affine = 0;
3095 	int sync = wake_flags & WF_SYNC;
3096 
3097 	if (p->nr_cpus_allowed == 1)
3098 		return prev_cpu;
3099 
3100 	if (sd_flag & SD_BALANCE_WAKE) {
3101 		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
3102 			want_affine = 1;
3103 		new_cpu = prev_cpu;
3104 	}
3105 
3106 	rcu_read_lock();
3107 	for_each_domain(cpu, tmp) {
3108 		if (!(tmp->flags & SD_LOAD_BALANCE))
3109 			continue;
3110 
3111 		/*
3112 		 * If both cpu and prev_cpu are part of this domain,
3113 		 * cpu is a valid SD_WAKE_AFFINE target.
3114 		 */
3115 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
3116 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
3117 			affine_sd = tmp;
3118 			break;
3119 		}
3120 
3121 		if (tmp->flags & sd_flag)
3122 			sd = tmp;
3123 	}
3124 
3125 	if (affine_sd) {
3126 		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
3127 			prev_cpu = cpu;
3128 
3129 		new_cpu = select_idle_sibling(p, prev_cpu);
3130 		goto unlock;
3131 	}
3132 
3133 	while (sd) {
3134 		int load_idx = sd->forkexec_idx;
3135 		struct sched_group *group;
3136 		int weight;
3137 
3138 		if (!(sd->flags & sd_flag)) {
3139 			sd = sd->child;
3140 			continue;
3141 		}
3142 
3143 		if (sd_flag & SD_BALANCE_WAKE)
3144 			load_idx = sd->wake_idx;
3145 
3146 		group = find_idlest_group(sd, p, cpu, load_idx);
3147 		if (!group) {
3148 			sd = sd->child;
3149 			continue;
3150 		}
3151 
3152 		new_cpu = find_idlest_cpu(group, p, cpu);
3153 		if (new_cpu == -1 || new_cpu == cpu) {
3154 			/* Now try balancing at a lower domain level of cpu */
3155 			sd = sd->child;
3156 			continue;
3157 		}
3158 
3159 		/* Now try balancing at a lower domain level of new_cpu */
3160 		cpu = new_cpu;
3161 		weight = sd->span_weight;
3162 		sd = NULL;
3163 		for_each_domain(cpu, tmp) {
3164 			if (weight <= tmp->span_weight)
3165 				break;
3166 			if (tmp->flags & sd_flag)
3167 				sd = tmp;
3168 		}
3169 		/* while loop will break here if sd == NULL */
3170 	}
3171 unlock:
3172 	rcu_read_unlock();
3173 
3174 	return new_cpu;
3175 }
3176 
3177 /*
3178  * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
3179  * removed when useful for applications beyond shares distribution (e.g.
3180  * load-balance).
3181  */
3182 #ifdef CONFIG_FAIR_GROUP_SCHED
3183 /*
3184  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3185  * cfs_rq_of(p) references at time of call are still valid and identify the
3186  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
3187  * other assumptions, including the state of rq->lock, should be made.
3188  */
3189 static void
3190 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3191 {
3192 	struct sched_entity *se = &p->se;
3193 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3194 
3195 	/*
3196 	 * Load tracking: accumulate removed load so that it can be processed
3197 	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute
3198 	 * to blocked load iff they have a positive decay-count.  It can never
3199 	 * be negative here since on-rq tasks have decay-count == 0.
3200 	 */
3201 	if (se->avg.decay_count) {
3202 		se->avg.decay_count = -__synchronize_entity_decay(se);
3203 		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
3204 	}
3205 }
3206 #endif
3207 #endif /* CONFIG_SMP */
3208 
3209 static unsigned long
3210 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
3211 {
3212 	unsigned long gran = sysctl_sched_wakeup_granularity;
3213 
3214 	/*
3215 	 * Since its curr running now, convert the gran from real-time
3216 	 * to virtual-time in his units.
3217 	 *
3218 	 * By using 'se' instead of 'curr' we penalize light tasks, so
3219 	 * they get preempted easier. That is, if 'se' < 'curr' then
3220 	 * the resulting gran will be larger, therefore penalizing the
3221 	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
3222 	 * be smaller, again penalizing the lighter task.
3223 	 *
3224 	 * This is especially important for buddies when the leftmost
3225 	 * task is higher priority than the buddy.
3226 	 */
3227 	return calc_delta_fair(gran, se);
3228 }
3229 
3230 /*
3231  * Should 'se' preempt 'curr'.
3232  *
3233  *             |s1
3234  *        |s2
3235  *   |s3
3236  *         g
3237  *      |<--->|c
3238  *
3239  *  w(c, s1) = -1
3240  *  w(c, s2) =  0
3241  *  w(c, s3) =  1
3242  *
3243  */
3244 static int
3245 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
3246 {
3247 	s64 gran, vdiff = curr->vruntime - se->vruntime;
3248 
3249 	if (vdiff <= 0)
3250 		return -1;
3251 
3252 	gran = wakeup_gran(curr, se);
3253 	if (vdiff > gran)
3254 		return 1;
3255 
3256 	return 0;
3257 }
3258 
3259 static void set_last_buddy(struct sched_entity *se)
3260 {
3261 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3262 		return;
3263 
3264 	for_each_sched_entity(se)
3265 		cfs_rq_of(se)->last = se;
3266 }
3267 
3268 static void set_next_buddy(struct sched_entity *se)
3269 {
3270 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3271 		return;
3272 
3273 	for_each_sched_entity(se)
3274 		cfs_rq_of(se)->next = se;
3275 }
3276 
3277 static void set_skip_buddy(struct sched_entity *se)
3278 {
3279 	for_each_sched_entity(se)
3280 		cfs_rq_of(se)->skip = se;
3281 }
3282 
3283 /*
3284  * Preempt the current task with a newly woken task if needed:
3285  */
3286 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
3287 {
3288 	struct task_struct *curr = rq->curr;
3289 	struct sched_entity *se = &curr->se, *pse = &p->se;
3290 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3291 	int scale = cfs_rq->nr_running >= sched_nr_latency;
3292 	int next_buddy_marked = 0;
3293 
3294 	if (unlikely(se == pse))
3295 		return;
3296 
3297 	/*
3298 	 * This is possible from callers such as move_task(), in which we
3299 	 * unconditionally check_prempt_curr() after an enqueue (which may have
3300 	 * lead to a throttle).  This both saves work and prevents false
3301 	 * next-buddy nomination below.
3302 	 */
3303 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
3304 		return;
3305 
3306 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3307 		set_next_buddy(pse);
3308 		next_buddy_marked = 1;
3309 	}
3310 
3311 	/*
3312 	 * We can come here with TIF_NEED_RESCHED already set from new task
3313 	 * wake up path.
3314 	 *
3315 	 * Note: this also catches the edge-case of curr being in a throttled
3316 	 * group (e.g. via set_curr_task), since update_curr() (in the
3317 	 * enqueue of curr) will have resulted in resched being set.  This
3318 	 * prevents us from potentially nominating it as a false LAST_BUDDY
3319 	 * below.
3320 	 */
3321 	if (test_tsk_need_resched(curr))
3322 		return;
3323 
3324 	/* Idle tasks are by definition preempted by non-idle tasks. */
3325 	if (unlikely(curr->policy == SCHED_IDLE) &&
3326 	    likely(p->policy != SCHED_IDLE))
3327 		goto preempt;
3328 
3329 	/*
3330 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
3331 	 * is driven by the tick):
3332 	 */
3333 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
3334 		return;
3335 
3336 	find_matching_se(&se, &pse);
3337 	update_curr(cfs_rq_of(se));
3338 	BUG_ON(!pse);
3339 	if (wakeup_preempt_entity(se, pse) == 1) {
3340 		/*
3341 		 * Bias pick_next to pick the sched entity that is
3342 		 * triggering this preemption.
3343 		 */
3344 		if (!next_buddy_marked)
3345 			set_next_buddy(pse);
3346 		goto preempt;
3347 	}
3348 
3349 	return;
3350 
3351 preempt:
3352 	resched_task(curr);
3353 	/*
3354 	 * Only set the backward buddy when the current task is still
3355 	 * on the rq. This can happen when a wakeup gets interleaved
3356 	 * with schedule on the ->pre_schedule() or idle_balance()
3357 	 * point, either of which can * drop the rq lock.
3358 	 *
3359 	 * Also, during early boot the idle thread is in the fair class,
3360 	 * for obvious reasons its a bad idea to schedule back to it.
3361 	 */
3362 	if (unlikely(!se->on_rq || curr == rq->idle))
3363 		return;
3364 
3365 	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
3366 		set_last_buddy(se);
3367 }
3368 
3369 static struct task_struct *pick_next_task_fair(struct rq *rq)
3370 {
3371 	struct task_struct *p;
3372 	struct cfs_rq *cfs_rq = &rq->cfs;
3373 	struct sched_entity *se;
3374 
3375 	if (!cfs_rq->nr_running)
3376 		return NULL;
3377 
3378 	do {
3379 		se = pick_next_entity(cfs_rq);
3380 		set_next_entity(cfs_rq, se);
3381 		cfs_rq = group_cfs_rq(se);
3382 	} while (cfs_rq);
3383 
3384 	p = task_of(se);
3385 	if (hrtick_enabled(rq))
3386 		hrtick_start_fair(rq, p);
3387 
3388 	return p;
3389 }
3390 
3391 /*
3392  * Account for a descheduled task:
3393  */
3394 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
3395 {
3396 	struct sched_entity *se = &prev->se;
3397 	struct cfs_rq *cfs_rq;
3398 
3399 	for_each_sched_entity(se) {
3400 		cfs_rq = cfs_rq_of(se);
3401 		put_prev_entity(cfs_rq, se);
3402 	}
3403 }
3404 
3405 /*
3406  * sched_yield() is very simple
3407  *
3408  * The magic of dealing with the ->skip buddy is in pick_next_entity.
3409  */
3410 static void yield_task_fair(struct rq *rq)
3411 {
3412 	struct task_struct *curr = rq->curr;
3413 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3414 	struct sched_entity *se = &curr->se;
3415 
3416 	/*
3417 	 * Are we the only task in the tree?
3418 	 */
3419 	if (unlikely(rq->nr_running == 1))
3420 		return;
3421 
3422 	clear_buddies(cfs_rq, se);
3423 
3424 	if (curr->policy != SCHED_BATCH) {
3425 		update_rq_clock(rq);
3426 		/*
3427 		 * Update run-time statistics of the 'current'.
3428 		 */
3429 		update_curr(cfs_rq);
3430 		/*
3431 		 * Tell update_rq_clock() that we've just updated,
3432 		 * so we don't do microscopic update in schedule()
3433 		 * and double the fastpath cost.
3434 		 */
3435 		 rq->skip_clock_update = 1;
3436 	}
3437 
3438 	set_skip_buddy(se);
3439 }
3440 
3441 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3442 {
3443 	struct sched_entity *se = &p->se;
3444 
3445 	/* throttled hierarchies are not runnable */
3446 	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
3447 		return false;
3448 
3449 	/* Tell the scheduler that we'd really like pse to run next. */
3450 	set_next_buddy(se);
3451 
3452 	yield_task_fair(rq);
3453 
3454 	return true;
3455 }
3456 
3457 #ifdef CONFIG_SMP
3458 /**************************************************
3459  * Fair scheduling class load-balancing methods.
3460  *
3461  * BASICS
3462  *
3463  * The purpose of load-balancing is to achieve the same basic fairness the
3464  * per-cpu scheduler provides, namely provide a proportional amount of compute
3465  * time to each task. This is expressed in the following equation:
3466  *
3467  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
3468  *
3469  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3470  * W_i,0 is defined as:
3471  *
3472  *   W_i,0 = \Sum_j w_i,j                                             (2)
3473  *
3474  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3475  * is derived from the nice value as per prio_to_weight[].
3476  *
3477  * The weight average is an exponential decay average of the instantaneous
3478  * weight:
3479  *
3480  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
3481  *
3482  * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3483  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3484  * can also include other factors [XXX].
3485  *
3486  * To achieve this balance we define a measure of imbalance which follows
3487  * directly from (1):
3488  *
3489  *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
3490  *
3491  * We them move tasks around to minimize the imbalance. In the continuous
3492  * function space it is obvious this converges, in the discrete case we get
3493  * a few fun cases generally called infeasible weight scenarios.
3494  *
3495  * [XXX expand on:
3496  *     - infeasible weights;
3497  *     - local vs global optima in the discrete case. ]
3498  *
3499  *
3500  * SCHED DOMAINS
3501  *
3502  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3503  * for all i,j solution, we create a tree of cpus that follows the hardware
3504  * topology where each level pairs two lower groups (or better). This results
3505  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3506  * tree to only the first of the previous level and we decrease the frequency
3507  * of load-balance at each level inv. proportional to the number of cpus in
3508  * the groups.
3509  *
3510  * This yields:
3511  *
3512  *     log_2 n     1     n
3513  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
3514  *     i = 0      2^i   2^i
3515  *                               `- size of each group
3516  *         |         |     `- number of cpus doing load-balance
3517  *         |         `- freq
3518  *         `- sum over all levels
3519  *
3520  * Coupled with a limit on how many tasks we can migrate every balance pass,
3521  * this makes (5) the runtime complexity of the balancer.
3522  *
3523  * An important property here is that each CPU is still (indirectly) connected
3524  * to every other cpu in at most O(log n) steps:
3525  *
3526  * The adjacency matrix of the resulting graph is given by:
3527  *
3528  *             log_2 n
3529  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
3530  *             k = 0
3531  *
3532  * And you'll find that:
3533  *
3534  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
3535  *
3536  * Showing there's indeed a path between every cpu in at most O(log n) steps.
3537  * The task movement gives a factor of O(m), giving a convergence complexity
3538  * of:
3539  *
3540  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
3541  *
3542  *
3543  * WORK CONSERVING
3544  *
3545  * In order to avoid CPUs going idle while there's still work to do, new idle
3546  * balancing is more aggressive and has the newly idle cpu iterate up the domain
3547  * tree itself instead of relying on other CPUs to bring it work.
3548  *
3549  * This adds some complexity to both (5) and (8) but it reduces the total idle
3550  * time.
3551  *
3552  * [XXX more?]
3553  *
3554  *
3555  * CGROUPS
3556  *
3557  * Cgroups make a horror show out of (2), instead of a simple sum we get:
3558  *
3559  *                                s_k,i
3560  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
3561  *                                 S_k
3562  *
3563  * Where
3564  *
3565  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
3566  *
3567  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3568  *
3569  * The big problem is S_k, its a global sum needed to compute a local (W_i)
3570  * property.
3571  *
3572  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3573  *      rewrite all of this once again.]
3574  */
3575 
3576 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3577 
3578 #define LBF_ALL_PINNED	0x01
3579 #define LBF_NEED_BREAK	0x02
3580 #define LBF_SOME_PINNED 0x04
3581 
3582 struct lb_env {
3583 	struct sched_domain	*sd;
3584 
3585 	struct rq		*src_rq;
3586 	int			src_cpu;
3587 
3588 	int			dst_cpu;
3589 	struct rq		*dst_rq;
3590 
3591 	struct cpumask		*dst_grpmask;
3592 	int			new_dst_cpu;
3593 	enum cpu_idle_type	idle;
3594 	long			imbalance;
3595 	/* The set of CPUs under consideration for load-balancing */
3596 	struct cpumask		*cpus;
3597 
3598 	unsigned int		flags;
3599 
3600 	unsigned int		loop;
3601 	unsigned int		loop_break;
3602 	unsigned int		loop_max;
3603 };
3604 
3605 /*
3606  * move_task - move a task from one runqueue to another runqueue.
3607  * Both runqueues must be locked.
3608  */
3609 static void move_task(struct task_struct *p, struct lb_env *env)
3610 {
3611 	deactivate_task(env->src_rq, p, 0);
3612 	set_task_cpu(p, env->dst_cpu);
3613 	activate_task(env->dst_rq, p, 0);
3614 	check_preempt_curr(env->dst_rq, p, 0);
3615 }
3616 
3617 /*
3618  * Is this task likely cache-hot:
3619  */
3620 static int
3621 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3622 {
3623 	s64 delta;
3624 
3625 	if (p->sched_class != &fair_sched_class)
3626 		return 0;
3627 
3628 	if (unlikely(p->policy == SCHED_IDLE))
3629 		return 0;
3630 
3631 	/*
3632 	 * Buddy candidates are cache hot:
3633 	 */
3634 	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3635 			(&p->se == cfs_rq_of(&p->se)->next ||
3636 			 &p->se == cfs_rq_of(&p->se)->last))
3637 		return 1;
3638 
3639 	if (sysctl_sched_migration_cost == -1)
3640 		return 1;
3641 	if (sysctl_sched_migration_cost == 0)
3642 		return 0;
3643 
3644 	delta = now - p->se.exec_start;
3645 
3646 	return delta < (s64)sysctl_sched_migration_cost;
3647 }
3648 
3649 /*
3650  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3651  */
3652 static
3653 int can_migrate_task(struct task_struct *p, struct lb_env *env)
3654 {
3655 	int tsk_cache_hot = 0;
3656 	/*
3657 	 * We do not migrate tasks that are:
3658 	 * 1) running (obviously), or
3659 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3660 	 * 3) are cache-hot on their current CPU.
3661 	 */
3662 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3663 		int new_dst_cpu;
3664 
3665 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3666 
3667 		/*
3668 		 * Remember if this task can be migrated to any other cpu in
3669 		 * our sched_group. We may want to revisit it if we couldn't
3670 		 * meet load balance goals by pulling other tasks on src_cpu.
3671 		 *
3672 		 * Also avoid computing new_dst_cpu if we have already computed
3673 		 * one in current iteration.
3674 		 */
3675 		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3676 			return 0;
3677 
3678 		new_dst_cpu = cpumask_first_and(env->dst_grpmask,
3679 						tsk_cpus_allowed(p));
3680 		if (new_dst_cpu < nr_cpu_ids) {
3681 			env->flags |= LBF_SOME_PINNED;
3682 			env->new_dst_cpu = new_dst_cpu;
3683 		}
3684 		return 0;
3685 	}
3686 
3687 	/* Record that we found atleast one task that could run on dst_cpu */
3688 	env->flags &= ~LBF_ALL_PINNED;
3689 
3690 	if (task_running(env->src_rq, p)) {
3691 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3692 		return 0;
3693 	}
3694 
3695 	/*
3696 	 * Aggressive migration if:
3697 	 * 1) task is cache cold, or
3698 	 * 2) too many balance attempts have failed.
3699 	 */
3700 
3701 	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3702 	if (!tsk_cache_hot ||
3703 		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3704 #ifdef CONFIG_SCHEDSTATS
3705 		if (tsk_cache_hot) {
3706 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3707 			schedstat_inc(p, se.statistics.nr_forced_migrations);
3708 		}
3709 #endif
3710 		return 1;
3711 	}
3712 
3713 	if (tsk_cache_hot) {
3714 		schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3715 		return 0;
3716 	}
3717 	return 1;
3718 }
3719 
3720 /*
3721  * move_one_task tries to move exactly one task from busiest to this_rq, as
3722  * part of active balancing operations within "domain".
3723  * Returns 1 if successful and 0 otherwise.
3724  *
3725  * Called with both runqueues locked.
3726  */
3727 static int move_one_task(struct lb_env *env)
3728 {
3729 	struct task_struct *p, *n;
3730 
3731 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3732 		if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3733 			continue;
3734 
3735 		if (!can_migrate_task(p, env))
3736 			continue;
3737 
3738 		move_task(p, env);
3739 		/*
3740 		 * Right now, this is only the second place move_task()
3741 		 * is called, so we can safely collect move_task()
3742 		 * stats here rather than inside move_task().
3743 		 */
3744 		schedstat_inc(env->sd, lb_gained[env->idle]);
3745 		return 1;
3746 	}
3747 	return 0;
3748 }
3749 
3750 static unsigned long task_h_load(struct task_struct *p);
3751 
3752 static const unsigned int sched_nr_migrate_break = 32;
3753 
3754 /*
3755  * move_tasks tries to move up to imbalance weighted load from busiest to
3756  * this_rq, as part of a balancing operation within domain "sd".
3757  * Returns 1 if successful and 0 otherwise.
3758  *
3759  * Called with both runqueues locked.
3760  */
3761 static int move_tasks(struct lb_env *env)
3762 {
3763 	struct list_head *tasks = &env->src_rq->cfs_tasks;
3764 	struct task_struct *p;
3765 	unsigned long load;
3766 	int pulled = 0;
3767 
3768 	if (env->imbalance <= 0)
3769 		return 0;
3770 
3771 	while (!list_empty(tasks)) {
3772 		p = list_first_entry(tasks, struct task_struct, se.group_node);
3773 
3774 		env->loop++;
3775 		/* We've more or less seen every task there is, call it quits */
3776 		if (env->loop > env->loop_max)
3777 			break;
3778 
3779 		/* take a breather every nr_migrate tasks */
3780 		if (env->loop > env->loop_break) {
3781 			env->loop_break += sched_nr_migrate_break;
3782 			env->flags |= LBF_NEED_BREAK;
3783 			break;
3784 		}
3785 
3786 		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3787 			goto next;
3788 
3789 		load = task_h_load(p);
3790 
3791 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
3792 			goto next;
3793 
3794 		if ((load / 2) > env->imbalance)
3795 			goto next;
3796 
3797 		if (!can_migrate_task(p, env))
3798 			goto next;
3799 
3800 		move_task(p, env);
3801 		pulled++;
3802 		env->imbalance -= load;
3803 
3804 #ifdef CONFIG_PREEMPT
3805 		/*
3806 		 * NEWIDLE balancing is a source of latency, so preemptible
3807 		 * kernels will stop after the first task is pulled to minimize
3808 		 * the critical section.
3809 		 */
3810 		if (env->idle == CPU_NEWLY_IDLE)
3811 			break;
3812 #endif
3813 
3814 		/*
3815 		 * We only want to steal up to the prescribed amount of
3816 		 * weighted load.
3817 		 */
3818 		if (env->imbalance <= 0)
3819 			break;
3820 
3821 		continue;
3822 next:
3823 		list_move_tail(&p->se.group_node, tasks);
3824 	}
3825 
3826 	/*
3827 	 * Right now, this is one of only two places move_task() is called,
3828 	 * so we can safely collect move_task() stats here rather than
3829 	 * inside move_task().
3830 	 */
3831 	schedstat_add(env->sd, lb_gained[env->idle], pulled);
3832 
3833 	return pulled;
3834 }
3835 
3836 #ifdef CONFIG_FAIR_GROUP_SCHED
3837 /*
3838  * update tg->load_weight by folding this cpu's load_avg
3839  */
3840 static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
3841 {
3842 	struct sched_entity *se = tg->se[cpu];
3843 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
3844 
3845 	/* throttled entities do not contribute to load */
3846 	if (throttled_hierarchy(cfs_rq))
3847 		return;
3848 
3849 	update_cfs_rq_blocked_load(cfs_rq, 1);
3850 
3851 	if (se) {
3852 		update_entity_load_avg(se, 1);
3853 		/*
3854 		 * We pivot on our runnable average having decayed to zero for
3855 		 * list removal.  This generally implies that all our children
3856 		 * have also been removed (modulo rounding error or bandwidth
3857 		 * control); however, such cases are rare and we can fix these
3858 		 * at enqueue.
3859 		 *
3860 		 * TODO: fix up out-of-order children on enqueue.
3861 		 */
3862 		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
3863 			list_del_leaf_cfs_rq(cfs_rq);
3864 	} else {
3865 		struct rq *rq = rq_of(cfs_rq);
3866 		update_rq_runnable_avg(rq, rq->nr_running);
3867 	}
3868 }
3869 
3870 static void update_blocked_averages(int cpu)
3871 {
3872 	struct rq *rq = cpu_rq(cpu);
3873 	struct cfs_rq *cfs_rq;
3874 	unsigned long flags;
3875 
3876 	raw_spin_lock_irqsave(&rq->lock, flags);
3877 	update_rq_clock(rq);
3878 	/*
3879 	 * Iterates the task_group tree in a bottom up fashion, see
3880 	 * list_add_leaf_cfs_rq() for details.
3881 	 */
3882 	for_each_leaf_cfs_rq(rq, cfs_rq) {
3883 		/*
3884 		 * Note: We may want to consider periodically releasing
3885 		 * rq->lock about these updates so that creating many task
3886 		 * groups does not result in continually extending hold time.
3887 		 */
3888 		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
3889 	}
3890 
3891 	raw_spin_unlock_irqrestore(&rq->lock, flags);
3892 }
3893 
3894 /*
3895  * Compute the cpu's hierarchical load factor for each task group.
3896  * This needs to be done in a top-down fashion because the load of a child
3897  * group is a fraction of its parents load.
3898  */
3899 static int tg_load_down(struct task_group *tg, void *data)
3900 {
3901 	unsigned long load;
3902 	long cpu = (long)data;
3903 
3904 	if (!tg->parent) {
3905 		load = cpu_rq(cpu)->load.weight;
3906 	} else {
3907 		load = tg->parent->cfs_rq[cpu]->h_load;
3908 		load *= tg->se[cpu]->load.weight;
3909 		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
3910 	}
3911 
3912 	tg->cfs_rq[cpu]->h_load = load;
3913 
3914 	return 0;
3915 }
3916 
3917 static void update_h_load(long cpu)
3918 {
3919 	struct rq *rq = cpu_rq(cpu);
3920 	unsigned long now = jiffies;
3921 
3922 	if (rq->h_load_throttle == now)
3923 		return;
3924 
3925 	rq->h_load_throttle = now;
3926 
3927 	rcu_read_lock();
3928 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
3929 	rcu_read_unlock();
3930 }
3931 
3932 static unsigned long task_h_load(struct task_struct *p)
3933 {
3934 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
3935 	unsigned long load;
3936 
3937 	load = p->se.load.weight;
3938 	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
3939 
3940 	return load;
3941 }
3942 #else
3943 static inline void update_blocked_averages(int cpu)
3944 {
3945 }
3946 
3947 static inline void update_h_load(long cpu)
3948 {
3949 }
3950 
3951 static unsigned long task_h_load(struct task_struct *p)
3952 {
3953 	return p->se.load.weight;
3954 }
3955 #endif
3956 
3957 /********** Helpers for find_busiest_group ************************/
3958 /*
3959  * sd_lb_stats - Structure to store the statistics of a sched_domain
3960  * 		during load balancing.
3961  */
3962 struct sd_lb_stats {
3963 	struct sched_group *busiest; /* Busiest group in this sd */
3964 	struct sched_group *this;  /* Local group in this sd */
3965 	unsigned long total_load;  /* Total load of all groups in sd */
3966 	unsigned long total_pwr;   /*	Total power of all groups in sd */
3967 	unsigned long avg_load;	   /* Average load across all groups in sd */
3968 
3969 	/** Statistics of this group */
3970 	unsigned long this_load;
3971 	unsigned long this_load_per_task;
3972 	unsigned long this_nr_running;
3973 	unsigned long this_has_capacity;
3974 	unsigned int  this_idle_cpus;
3975 
3976 	/* Statistics of the busiest group */
3977 	unsigned int  busiest_idle_cpus;
3978 	unsigned long max_load;
3979 	unsigned long busiest_load_per_task;
3980 	unsigned long busiest_nr_running;
3981 	unsigned long busiest_group_capacity;
3982 	unsigned long busiest_has_capacity;
3983 	unsigned int  busiest_group_weight;
3984 
3985 	int group_imb; /* Is there imbalance in this sd */
3986 };
3987 
3988 /*
3989  * sg_lb_stats - stats of a sched_group required for load_balancing
3990  */
3991 struct sg_lb_stats {
3992 	unsigned long avg_load; /*Avg load across the CPUs of the group */
3993 	unsigned long group_load; /* Total load over the CPUs of the group */
3994 	unsigned long sum_nr_running; /* Nr tasks running in the group */
3995 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3996 	unsigned long group_capacity;
3997 	unsigned long idle_cpus;
3998 	unsigned long group_weight;
3999 	int group_imb; /* Is there an imbalance in the group ? */
4000 	int group_has_capacity; /* Is there extra capacity in the group? */
4001 };
4002 
4003 /**
4004  * get_sd_load_idx - Obtain the load index for a given sched domain.
4005  * @sd: The sched_domain whose load_idx is to be obtained.
4006  * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
4007  */
4008 static inline int get_sd_load_idx(struct sched_domain *sd,
4009 					enum cpu_idle_type idle)
4010 {
4011 	int load_idx;
4012 
4013 	switch (idle) {
4014 	case CPU_NOT_IDLE:
4015 		load_idx = sd->busy_idx;
4016 		break;
4017 
4018 	case CPU_NEWLY_IDLE:
4019 		load_idx = sd->newidle_idx;
4020 		break;
4021 	default:
4022 		load_idx = sd->idle_idx;
4023 		break;
4024 	}
4025 
4026 	return load_idx;
4027 }
4028 
4029 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4030 {
4031 	return SCHED_POWER_SCALE;
4032 }
4033 
4034 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4035 {
4036 	return default_scale_freq_power(sd, cpu);
4037 }
4038 
4039 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4040 {
4041 	unsigned long weight = sd->span_weight;
4042 	unsigned long smt_gain = sd->smt_gain;
4043 
4044 	smt_gain /= weight;
4045 
4046 	return smt_gain;
4047 }
4048 
4049 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4050 {
4051 	return default_scale_smt_power(sd, cpu);
4052 }
4053 
4054 unsigned long scale_rt_power(int cpu)
4055 {
4056 	struct rq *rq = cpu_rq(cpu);
4057 	u64 total, available, age_stamp, avg;
4058 
4059 	/*
4060 	 * Since we're reading these variables without serialization make sure
4061 	 * we read them once before doing sanity checks on them.
4062 	 */
4063 	age_stamp = ACCESS_ONCE(rq->age_stamp);
4064 	avg = ACCESS_ONCE(rq->rt_avg);
4065 
4066 	total = sched_avg_period() + (rq->clock - age_stamp);
4067 
4068 	if (unlikely(total < avg)) {
4069 		/* Ensures that power won't end up being negative */
4070 		available = 0;
4071 	} else {
4072 		available = total - avg;
4073 	}
4074 
4075 	if (unlikely((s64)total < SCHED_POWER_SCALE))
4076 		total = SCHED_POWER_SCALE;
4077 
4078 	total >>= SCHED_POWER_SHIFT;
4079 
4080 	return div_u64(available, total);
4081 }
4082 
4083 static void update_cpu_power(struct sched_domain *sd, int cpu)
4084 {
4085 	unsigned long weight = sd->span_weight;
4086 	unsigned long power = SCHED_POWER_SCALE;
4087 	struct sched_group *sdg = sd->groups;
4088 
4089 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
4090 		if (sched_feat(ARCH_POWER))
4091 			power *= arch_scale_smt_power(sd, cpu);
4092 		else
4093 			power *= default_scale_smt_power(sd, cpu);
4094 
4095 		power >>= SCHED_POWER_SHIFT;
4096 	}
4097 
4098 	sdg->sgp->power_orig = power;
4099 
4100 	if (sched_feat(ARCH_POWER))
4101 		power *= arch_scale_freq_power(sd, cpu);
4102 	else
4103 		power *= default_scale_freq_power(sd, cpu);
4104 
4105 	power >>= SCHED_POWER_SHIFT;
4106 
4107 	power *= scale_rt_power(cpu);
4108 	power >>= SCHED_POWER_SHIFT;
4109 
4110 	if (!power)
4111 		power = 1;
4112 
4113 	cpu_rq(cpu)->cpu_power = power;
4114 	sdg->sgp->power = power;
4115 }
4116 
4117 void update_group_power(struct sched_domain *sd, int cpu)
4118 {
4119 	struct sched_domain *child = sd->child;
4120 	struct sched_group *group, *sdg = sd->groups;
4121 	unsigned long power;
4122 	unsigned long interval;
4123 
4124 	interval = msecs_to_jiffies(sd->balance_interval);
4125 	interval = clamp(interval, 1UL, max_load_balance_interval);
4126 	sdg->sgp->next_update = jiffies + interval;
4127 
4128 	if (!child) {
4129 		update_cpu_power(sd, cpu);
4130 		return;
4131 	}
4132 
4133 	power = 0;
4134 
4135 	if (child->flags & SD_OVERLAP) {
4136 		/*
4137 		 * SD_OVERLAP domains cannot assume that child groups
4138 		 * span the current group.
4139 		 */
4140 
4141 		for_each_cpu(cpu, sched_group_cpus(sdg))
4142 			power += power_of(cpu);
4143 	} else  {
4144 		/*
4145 		 * !SD_OVERLAP domains can assume that child groups
4146 		 * span the current group.
4147 		 */
4148 
4149 		group = child->groups;
4150 		do {
4151 			power += group->sgp->power;
4152 			group = group->next;
4153 		} while (group != child->groups);
4154 	}
4155 
4156 	sdg->sgp->power_orig = sdg->sgp->power = power;
4157 }
4158 
4159 /*
4160  * Try and fix up capacity for tiny siblings, this is needed when
4161  * things like SD_ASYM_PACKING need f_b_g to select another sibling
4162  * which on its own isn't powerful enough.
4163  *
4164  * See update_sd_pick_busiest() and check_asym_packing().
4165  */
4166 static inline int
4167 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4168 {
4169 	/*
4170 	 * Only siblings can have significantly less than SCHED_POWER_SCALE
4171 	 */
4172 	if (!(sd->flags & SD_SHARE_CPUPOWER))
4173 		return 0;
4174 
4175 	/*
4176 	 * If ~90% of the cpu_power is still there, we're good.
4177 	 */
4178 	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
4179 		return 1;
4180 
4181 	return 0;
4182 }
4183 
4184 /**
4185  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
4186  * @env: The load balancing environment.
4187  * @group: sched_group whose statistics are to be updated.
4188  * @load_idx: Load index of sched_domain of this_cpu for load calc.
4189  * @local_group: Does group contain this_cpu.
4190  * @balance: Should we balance.
4191  * @sgs: variable to hold the statistics for this group.
4192  */
4193 static inline void update_sg_lb_stats(struct lb_env *env,
4194 			struct sched_group *group, int load_idx,
4195 			int local_group, int *balance, struct sg_lb_stats *sgs)
4196 {
4197 	unsigned long nr_running, max_nr_running, min_nr_running;
4198 	unsigned long load, max_cpu_load, min_cpu_load;
4199 	unsigned int balance_cpu = -1, first_idle_cpu = 0;
4200 	unsigned long avg_load_per_task = 0;
4201 	int i;
4202 
4203 	if (local_group)
4204 		balance_cpu = group_balance_cpu(group);
4205 
4206 	/* Tally up the load of all CPUs in the group */
4207 	max_cpu_load = 0;
4208 	min_cpu_load = ~0UL;
4209 	max_nr_running = 0;
4210 	min_nr_running = ~0UL;
4211 
4212 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4213 		struct rq *rq = cpu_rq(i);
4214 
4215 		nr_running = rq->nr_running;
4216 
4217 		/* Bias balancing toward cpus of our domain */
4218 		if (local_group) {
4219 			if (idle_cpu(i) && !first_idle_cpu &&
4220 					cpumask_test_cpu(i, sched_group_mask(group))) {
4221 				first_idle_cpu = 1;
4222 				balance_cpu = i;
4223 			}
4224 
4225 			load = target_load(i, load_idx);
4226 		} else {
4227 			load = source_load(i, load_idx);
4228 			if (load > max_cpu_load)
4229 				max_cpu_load = load;
4230 			if (min_cpu_load > load)
4231 				min_cpu_load = load;
4232 
4233 			if (nr_running > max_nr_running)
4234 				max_nr_running = nr_running;
4235 			if (min_nr_running > nr_running)
4236 				min_nr_running = nr_running;
4237 		}
4238 
4239 		sgs->group_load += load;
4240 		sgs->sum_nr_running += nr_running;
4241 		sgs->sum_weighted_load += weighted_cpuload(i);
4242 		if (idle_cpu(i))
4243 			sgs->idle_cpus++;
4244 	}
4245 
4246 	/*
4247 	 * First idle cpu or the first cpu(busiest) in this sched group
4248 	 * is eligible for doing load balancing at this and above
4249 	 * domains. In the newly idle case, we will allow all the cpu's
4250 	 * to do the newly idle load balance.
4251 	 */
4252 	if (local_group) {
4253 		if (env->idle != CPU_NEWLY_IDLE) {
4254 			if (balance_cpu != env->dst_cpu) {
4255 				*balance = 0;
4256 				return;
4257 			}
4258 			update_group_power(env->sd, env->dst_cpu);
4259 		} else if (time_after_eq(jiffies, group->sgp->next_update))
4260 			update_group_power(env->sd, env->dst_cpu);
4261 	}
4262 
4263 	/* Adjust by relative CPU power of the group */
4264 	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
4265 
4266 	/*
4267 	 * Consider the group unbalanced when the imbalance is larger
4268 	 * than the average weight of a task.
4269 	 *
4270 	 * APZ: with cgroup the avg task weight can vary wildly and
4271 	 *      might not be a suitable number - should we keep a
4272 	 *      normalized nr_running number somewhere that negates
4273 	 *      the hierarchy?
4274 	 */
4275 	if (sgs->sum_nr_running)
4276 		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4277 
4278 	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
4279 	    (max_nr_running - min_nr_running) > 1)
4280 		sgs->group_imb = 1;
4281 
4282 	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
4283 						SCHED_POWER_SCALE);
4284 	if (!sgs->group_capacity)
4285 		sgs->group_capacity = fix_small_capacity(env->sd, group);
4286 	sgs->group_weight = group->group_weight;
4287 
4288 	if (sgs->group_capacity > sgs->sum_nr_running)
4289 		sgs->group_has_capacity = 1;
4290 }
4291 
4292 /**
4293  * update_sd_pick_busiest - return 1 on busiest group
4294  * @env: The load balancing environment.
4295  * @sds: sched_domain statistics
4296  * @sg: sched_group candidate to be checked for being the busiest
4297  * @sgs: sched_group statistics
4298  *
4299  * Determine if @sg is a busier group than the previously selected
4300  * busiest group.
4301  */
4302 static bool update_sd_pick_busiest(struct lb_env *env,
4303 				   struct sd_lb_stats *sds,
4304 				   struct sched_group *sg,
4305 				   struct sg_lb_stats *sgs)
4306 {
4307 	if (sgs->avg_load <= sds->max_load)
4308 		return false;
4309 
4310 	if (sgs->sum_nr_running > sgs->group_capacity)
4311 		return true;
4312 
4313 	if (sgs->group_imb)
4314 		return true;
4315 
4316 	/*
4317 	 * ASYM_PACKING needs to move all the work to the lowest
4318 	 * numbered CPUs in the group, therefore mark all groups
4319 	 * higher than ourself as busy.
4320 	 */
4321 	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
4322 	    env->dst_cpu < group_first_cpu(sg)) {
4323 		if (!sds->busiest)
4324 			return true;
4325 
4326 		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
4327 			return true;
4328 	}
4329 
4330 	return false;
4331 }
4332 
4333 /**
4334  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4335  * @env: The load balancing environment.
4336  * @balance: Should we balance.
4337  * @sds: variable to hold the statistics for this sched_domain.
4338  */
4339 static inline void update_sd_lb_stats(struct lb_env *env,
4340 					int *balance, struct sd_lb_stats *sds)
4341 {
4342 	struct sched_domain *child = env->sd->child;
4343 	struct sched_group *sg = env->sd->groups;
4344 	struct sg_lb_stats sgs;
4345 	int load_idx, prefer_sibling = 0;
4346 
4347 	if (child && child->flags & SD_PREFER_SIBLING)
4348 		prefer_sibling = 1;
4349 
4350 	load_idx = get_sd_load_idx(env->sd, env->idle);
4351 
4352 	do {
4353 		int local_group;
4354 
4355 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
4356 		memset(&sgs, 0, sizeof(sgs));
4357 		update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
4358 
4359 		if (local_group && !(*balance))
4360 			return;
4361 
4362 		sds->total_load += sgs.group_load;
4363 		sds->total_pwr += sg->sgp->power;
4364 
4365 		/*
4366 		 * In case the child domain prefers tasks go to siblings
4367 		 * first, lower the sg capacity to one so that we'll try
4368 		 * and move all the excess tasks away. We lower the capacity
4369 		 * of a group only if the local group has the capacity to fit
4370 		 * these excess tasks, i.e. nr_running < group_capacity. The
4371 		 * extra check prevents the case where you always pull from the
4372 		 * heaviest group when it is already under-utilized (possible
4373 		 * with a large weight task outweighs the tasks on the system).
4374 		 */
4375 		if (prefer_sibling && !local_group && sds->this_has_capacity)
4376 			sgs.group_capacity = min(sgs.group_capacity, 1UL);
4377 
4378 		if (local_group) {
4379 			sds->this_load = sgs.avg_load;
4380 			sds->this = sg;
4381 			sds->this_nr_running = sgs.sum_nr_running;
4382 			sds->this_load_per_task = sgs.sum_weighted_load;
4383 			sds->this_has_capacity = sgs.group_has_capacity;
4384 			sds->this_idle_cpus = sgs.idle_cpus;
4385 		} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
4386 			sds->max_load = sgs.avg_load;
4387 			sds->busiest = sg;
4388 			sds->busiest_nr_running = sgs.sum_nr_running;
4389 			sds->busiest_idle_cpus = sgs.idle_cpus;
4390 			sds->busiest_group_capacity = sgs.group_capacity;
4391 			sds->busiest_load_per_task = sgs.sum_weighted_load;
4392 			sds->busiest_has_capacity = sgs.group_has_capacity;
4393 			sds->busiest_group_weight = sgs.group_weight;
4394 			sds->group_imb = sgs.group_imb;
4395 		}
4396 
4397 		sg = sg->next;
4398 	} while (sg != env->sd->groups);
4399 }
4400 
4401 /**
4402  * check_asym_packing - Check to see if the group is packed into the
4403  *			sched doman.
4404  *
4405  * This is primarily intended to used at the sibling level.  Some
4406  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
4407  * case of POWER7, it can move to lower SMT modes only when higher
4408  * threads are idle.  When in lower SMT modes, the threads will
4409  * perform better since they share less core resources.  Hence when we
4410  * have idle threads, we want them to be the higher ones.
4411  *
4412  * This packing function is run on idle threads.  It checks to see if
4413  * the busiest CPU in this domain (core in the P7 case) has a higher
4414  * CPU number than the packing function is being run on.  Here we are
4415  * assuming lower CPU number will be equivalent to lower a SMT thread
4416  * number.
4417  *
4418  * Returns 1 when packing is required and a task should be moved to
4419  * this CPU.  The amount of the imbalance is returned in *imbalance.
4420  *
4421  * @env: The load balancing environment.
4422  * @sds: Statistics of the sched_domain which is to be packed
4423  */
4424 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4425 {
4426 	int busiest_cpu;
4427 
4428 	if (!(env->sd->flags & SD_ASYM_PACKING))
4429 		return 0;
4430 
4431 	if (!sds->busiest)
4432 		return 0;
4433 
4434 	busiest_cpu = group_first_cpu(sds->busiest);
4435 	if (env->dst_cpu > busiest_cpu)
4436 		return 0;
4437 
4438 	env->imbalance = DIV_ROUND_CLOSEST(
4439 		sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
4440 
4441 	return 1;
4442 }
4443 
4444 /**
4445  * fix_small_imbalance - Calculate the minor imbalance that exists
4446  *			amongst the groups of a sched_domain, during
4447  *			load balancing.
4448  * @env: The load balancing environment.
4449  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
4450  */
4451 static inline
4452 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4453 {
4454 	unsigned long tmp, pwr_now = 0, pwr_move = 0;
4455 	unsigned int imbn = 2;
4456 	unsigned long scaled_busy_load_per_task;
4457 
4458 	if (sds->this_nr_running) {
4459 		sds->this_load_per_task /= sds->this_nr_running;
4460 		if (sds->busiest_load_per_task >
4461 				sds->this_load_per_task)
4462 			imbn = 1;
4463 	} else {
4464 		sds->this_load_per_task =
4465 			cpu_avg_load_per_task(env->dst_cpu);
4466 	}
4467 
4468 	scaled_busy_load_per_task = sds->busiest_load_per_task
4469 					 * SCHED_POWER_SCALE;
4470 	scaled_busy_load_per_task /= sds->busiest->sgp->power;
4471 
4472 	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4473 			(scaled_busy_load_per_task * imbn)) {
4474 		env->imbalance = sds->busiest_load_per_task;
4475 		return;
4476 	}
4477 
4478 	/*
4479 	 * OK, we don't have enough imbalance to justify moving tasks,
4480 	 * however we may be able to increase total CPU power used by
4481 	 * moving them.
4482 	 */
4483 
4484 	pwr_now += sds->busiest->sgp->power *
4485 			min(sds->busiest_load_per_task, sds->max_load);
4486 	pwr_now += sds->this->sgp->power *
4487 			min(sds->this_load_per_task, sds->this_load);
4488 	pwr_now /= SCHED_POWER_SCALE;
4489 
4490 	/* Amount of load we'd subtract */
4491 	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4492 		sds->busiest->sgp->power;
4493 	if (sds->max_load > tmp)
4494 		pwr_move += sds->busiest->sgp->power *
4495 			min(sds->busiest_load_per_task, sds->max_load - tmp);
4496 
4497 	/* Amount of load we'd add */
4498 	if (sds->max_load * sds->busiest->sgp->power <
4499 		sds->busiest_load_per_task * SCHED_POWER_SCALE)
4500 		tmp = (sds->max_load * sds->busiest->sgp->power) /
4501 			sds->this->sgp->power;
4502 	else
4503 		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4504 			sds->this->sgp->power;
4505 	pwr_move += sds->this->sgp->power *
4506 			min(sds->this_load_per_task, sds->this_load + tmp);
4507 	pwr_move /= SCHED_POWER_SCALE;
4508 
4509 	/* Move if we gain throughput */
4510 	if (pwr_move > pwr_now)
4511 		env->imbalance = sds->busiest_load_per_task;
4512 }
4513 
4514 /**
4515  * calculate_imbalance - Calculate the amount of imbalance present within the
4516  *			 groups of a given sched_domain during load balance.
4517  * @env: load balance environment
4518  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4519  */
4520 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4521 {
4522 	unsigned long max_pull, load_above_capacity = ~0UL;
4523 
4524 	sds->busiest_load_per_task /= sds->busiest_nr_running;
4525 	if (sds->group_imb) {
4526 		sds->busiest_load_per_task =
4527 			min(sds->busiest_load_per_task, sds->avg_load);
4528 	}
4529 
4530 	/*
4531 	 * In the presence of smp nice balancing, certain scenarios can have
4532 	 * max load less than avg load(as we skip the groups at or below
4533 	 * its cpu_power, while calculating max_load..)
4534 	 */
4535 	if (sds->max_load < sds->avg_load) {
4536 		env->imbalance = 0;
4537 		return fix_small_imbalance(env, sds);
4538 	}
4539 
4540 	if (!sds->group_imb) {
4541 		/*
4542 		 * Don't want to pull so many tasks that a group would go idle.
4543 		 */
4544 		load_above_capacity = (sds->busiest_nr_running -
4545 						sds->busiest_group_capacity);
4546 
4547 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4548 
4549 		load_above_capacity /= sds->busiest->sgp->power;
4550 	}
4551 
4552 	/*
4553 	 * We're trying to get all the cpus to the average_load, so we don't
4554 	 * want to push ourselves above the average load, nor do we wish to
4555 	 * reduce the max loaded cpu below the average load. At the same time,
4556 	 * we also don't want to reduce the group load below the group capacity
4557 	 * (so that we can implement power-savings policies etc). Thus we look
4558 	 * for the minimum possible imbalance.
4559 	 * Be careful of negative numbers as they'll appear as very large values
4560 	 * with unsigned longs.
4561 	 */
4562 	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4563 
4564 	/* How much load to actually move to equalise the imbalance */
4565 	env->imbalance = min(max_pull * sds->busiest->sgp->power,
4566 		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
4567 			/ SCHED_POWER_SCALE;
4568 
4569 	/*
4570 	 * if *imbalance is less than the average load per runnable task
4571 	 * there is no guarantee that any tasks will be moved so we'll have
4572 	 * a think about bumping its value to force at least one task to be
4573 	 * moved
4574 	 */
4575 	if (env->imbalance < sds->busiest_load_per_task)
4576 		return fix_small_imbalance(env, sds);
4577 
4578 }
4579 
4580 /******* find_busiest_group() helpers end here *********************/
4581 
4582 /**
4583  * find_busiest_group - Returns the busiest group within the sched_domain
4584  * if there is an imbalance. If there isn't an imbalance, and
4585  * the user has opted for power-savings, it returns a group whose
4586  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4587  * such a group exists.
4588  *
4589  * Also calculates the amount of weighted load which should be moved
4590  * to restore balance.
4591  *
4592  * @env: The load balancing environment.
4593  * @balance: Pointer to a variable indicating if this_cpu
4594  *	is the appropriate cpu to perform load balancing at this_level.
4595  *
4596  * Returns:	- the busiest group if imbalance exists.
4597  *		- If no imbalance and user has opted for power-savings balance,
4598  *		   return the least loaded group whose CPUs can be
4599  *		   put to idle by rebalancing its tasks onto our group.
4600  */
4601 static struct sched_group *
4602 find_busiest_group(struct lb_env *env, int *balance)
4603 {
4604 	struct sd_lb_stats sds;
4605 
4606 	memset(&sds, 0, sizeof(sds));
4607 
4608 	/*
4609 	 * Compute the various statistics relavent for load balancing at
4610 	 * this level.
4611 	 */
4612 	update_sd_lb_stats(env, balance, &sds);
4613 
4614 	/*
4615 	 * this_cpu is not the appropriate cpu to perform load balancing at
4616 	 * this level.
4617 	 */
4618 	if (!(*balance))
4619 		goto ret;
4620 
4621 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4622 	    check_asym_packing(env, &sds))
4623 		return sds.busiest;
4624 
4625 	/* There is no busy sibling group to pull tasks from */
4626 	if (!sds.busiest || sds.busiest_nr_running == 0)
4627 		goto out_balanced;
4628 
4629 	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4630 
4631 	/*
4632 	 * If the busiest group is imbalanced the below checks don't
4633 	 * work because they assumes all things are equal, which typically
4634 	 * isn't true due to cpus_allowed constraints and the like.
4635 	 */
4636 	if (sds.group_imb)
4637 		goto force_balance;
4638 
4639 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4640 	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4641 			!sds.busiest_has_capacity)
4642 		goto force_balance;
4643 
4644 	/*
4645 	 * If the local group is more busy than the selected busiest group
4646 	 * don't try and pull any tasks.
4647 	 */
4648 	if (sds.this_load >= sds.max_load)
4649 		goto out_balanced;
4650 
4651 	/*
4652 	 * Don't pull any tasks if this group is already above the domain
4653 	 * average load.
4654 	 */
4655 	if (sds.this_load >= sds.avg_load)
4656 		goto out_balanced;
4657 
4658 	if (env->idle == CPU_IDLE) {
4659 		/*
4660 		 * This cpu is idle. If the busiest group load doesn't
4661 		 * have more tasks than the number of available cpu's and
4662 		 * there is no imbalance between this and busiest group
4663 		 * wrt to idle cpu's, it is balanced.
4664 		 */
4665 		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
4666 		    sds.busiest_nr_running <= sds.busiest_group_weight)
4667 			goto out_balanced;
4668 	} else {
4669 		/*
4670 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4671 		 * imbalance_pct to be conservative.
4672 		 */
4673 		if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4674 			goto out_balanced;
4675 	}
4676 
4677 force_balance:
4678 	/* Looks like there is an imbalance. Compute it */
4679 	calculate_imbalance(env, &sds);
4680 	return sds.busiest;
4681 
4682 out_balanced:
4683 ret:
4684 	env->imbalance = 0;
4685 	return NULL;
4686 }
4687 
4688 /*
4689  * find_busiest_queue - find the busiest runqueue among the cpus in group.
4690  */
4691 static struct rq *find_busiest_queue(struct lb_env *env,
4692 				     struct sched_group *group)
4693 {
4694 	struct rq *busiest = NULL, *rq;
4695 	unsigned long max_load = 0;
4696 	int i;
4697 
4698 	for_each_cpu(i, sched_group_cpus(group)) {
4699 		unsigned long power = power_of(i);
4700 		unsigned long capacity = DIV_ROUND_CLOSEST(power,
4701 							   SCHED_POWER_SCALE);
4702 		unsigned long wl;
4703 
4704 		if (!capacity)
4705 			capacity = fix_small_capacity(env->sd, group);
4706 
4707 		if (!cpumask_test_cpu(i, env->cpus))
4708 			continue;
4709 
4710 		rq = cpu_rq(i);
4711 		wl = weighted_cpuload(i);
4712 
4713 		/*
4714 		 * When comparing with imbalance, use weighted_cpuload()
4715 		 * which is not scaled with the cpu power.
4716 		 */
4717 		if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4718 			continue;
4719 
4720 		/*
4721 		 * For the load comparisons with the other cpu's, consider
4722 		 * the weighted_cpuload() scaled with the cpu power, so that
4723 		 * the load can be moved away from the cpu that is potentially
4724 		 * running at a lower capacity.
4725 		 */
4726 		wl = (wl * SCHED_POWER_SCALE) / power;
4727 
4728 		if (wl > max_load) {
4729 			max_load = wl;
4730 			busiest = rq;
4731 		}
4732 	}
4733 
4734 	return busiest;
4735 }
4736 
4737 /*
4738  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4739  * so long as it is large enough.
4740  */
4741 #define MAX_PINNED_INTERVAL	512
4742 
4743 /* Working cpumask for load_balance and load_balance_newidle. */
4744 DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4745 
4746 static int need_active_balance(struct lb_env *env)
4747 {
4748 	struct sched_domain *sd = env->sd;
4749 
4750 	if (env->idle == CPU_NEWLY_IDLE) {
4751 
4752 		/*
4753 		 * ASYM_PACKING needs to force migrate tasks from busy but
4754 		 * higher numbered CPUs in order to pack all tasks in the
4755 		 * lowest numbered CPUs.
4756 		 */
4757 		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4758 			return 1;
4759 	}
4760 
4761 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
4762 }
4763 
4764 static int active_load_balance_cpu_stop(void *data);
4765 
4766 /*
4767  * Check this_cpu to ensure it is balanced within domain. Attempt to move
4768  * tasks if there is an imbalance.
4769  */
4770 static int load_balance(int this_cpu, struct rq *this_rq,
4771 			struct sched_domain *sd, enum cpu_idle_type idle,
4772 			int *balance)
4773 {
4774 	int ld_moved, cur_ld_moved, active_balance = 0;
4775 	int lb_iterations, max_lb_iterations;
4776 	struct sched_group *group;
4777 	struct rq *busiest;
4778 	unsigned long flags;
4779 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4780 
4781 	struct lb_env env = {
4782 		.sd		= sd,
4783 		.dst_cpu	= this_cpu,
4784 		.dst_rq		= this_rq,
4785 		.dst_grpmask    = sched_group_cpus(sd->groups),
4786 		.idle		= idle,
4787 		.loop_break	= sched_nr_migrate_break,
4788 		.cpus		= cpus,
4789 	};
4790 
4791 	cpumask_copy(cpus, cpu_active_mask);
4792 	max_lb_iterations = cpumask_weight(env.dst_grpmask);
4793 
4794 	schedstat_inc(sd, lb_count[idle]);
4795 
4796 redo:
4797 	group = find_busiest_group(&env, balance);
4798 
4799 	if (*balance == 0)
4800 		goto out_balanced;
4801 
4802 	if (!group) {
4803 		schedstat_inc(sd, lb_nobusyg[idle]);
4804 		goto out_balanced;
4805 	}
4806 
4807 	busiest = find_busiest_queue(&env, group);
4808 	if (!busiest) {
4809 		schedstat_inc(sd, lb_nobusyq[idle]);
4810 		goto out_balanced;
4811 	}
4812 
4813 	BUG_ON(busiest == env.dst_rq);
4814 
4815 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4816 
4817 	ld_moved = 0;
4818 	lb_iterations = 1;
4819 	if (busiest->nr_running > 1) {
4820 		/*
4821 		 * Attempt to move tasks. If find_busiest_group has found
4822 		 * an imbalance but busiest->nr_running <= 1, the group is
4823 		 * still unbalanced. ld_moved simply stays zero, so it is
4824 		 * correctly treated as an imbalance.
4825 		 */
4826 		env.flags |= LBF_ALL_PINNED;
4827 		env.src_cpu   = busiest->cpu;
4828 		env.src_rq    = busiest;
4829 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
4830 
4831 		update_h_load(env.src_cpu);
4832 more_balance:
4833 		local_irq_save(flags);
4834 		double_rq_lock(env.dst_rq, busiest);
4835 
4836 		/*
4837 		 * cur_ld_moved - load moved in current iteration
4838 		 * ld_moved     - cumulative load moved across iterations
4839 		 */
4840 		cur_ld_moved = move_tasks(&env);
4841 		ld_moved += cur_ld_moved;
4842 		double_rq_unlock(env.dst_rq, busiest);
4843 		local_irq_restore(flags);
4844 
4845 		if (env.flags & LBF_NEED_BREAK) {
4846 			env.flags &= ~LBF_NEED_BREAK;
4847 			goto more_balance;
4848 		}
4849 
4850 		/*
4851 		 * some other cpu did the load balance for us.
4852 		 */
4853 		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
4854 			resched_cpu(env.dst_cpu);
4855 
4856 		/*
4857 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
4858 		 * us and move them to an alternate dst_cpu in our sched_group
4859 		 * where they can run. The upper limit on how many times we
4860 		 * iterate on same src_cpu is dependent on number of cpus in our
4861 		 * sched_group.
4862 		 *
4863 		 * This changes load balance semantics a bit on who can move
4864 		 * load to a given_cpu. In addition to the given_cpu itself
4865 		 * (or a ilb_cpu acting on its behalf where given_cpu is
4866 		 * nohz-idle), we now have balance_cpu in a position to move
4867 		 * load to given_cpu. In rare situations, this may cause
4868 		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
4869 		 * _independently_ and at _same_ time to move some load to
4870 		 * given_cpu) causing exceess load to be moved to given_cpu.
4871 		 * This however should not happen so much in practice and
4872 		 * moreover subsequent load balance cycles should correct the
4873 		 * excess load moved.
4874 		 */
4875 		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
4876 				lb_iterations++ < max_lb_iterations) {
4877 
4878 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
4879 			env.dst_cpu	 = env.new_dst_cpu;
4880 			env.flags	&= ~LBF_SOME_PINNED;
4881 			env.loop	 = 0;
4882 			env.loop_break	 = sched_nr_migrate_break;
4883 			/*
4884 			 * Go back to "more_balance" rather than "redo" since we
4885 			 * need to continue with same src_cpu.
4886 			 */
4887 			goto more_balance;
4888 		}
4889 
4890 		/* All tasks on this runqueue were pinned by CPU affinity */
4891 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
4892 			cpumask_clear_cpu(cpu_of(busiest), cpus);
4893 			if (!cpumask_empty(cpus)) {
4894 				env.loop = 0;
4895 				env.loop_break = sched_nr_migrate_break;
4896 				goto redo;
4897 			}
4898 			goto out_balanced;
4899 		}
4900 	}
4901 
4902 	if (!ld_moved) {
4903 		schedstat_inc(sd, lb_failed[idle]);
4904 		/*
4905 		 * Increment the failure counter only on periodic balance.
4906 		 * We do not want newidle balance, which can be very
4907 		 * frequent, pollute the failure counter causing
4908 		 * excessive cache_hot migrations and active balances.
4909 		 */
4910 		if (idle != CPU_NEWLY_IDLE)
4911 			sd->nr_balance_failed++;
4912 
4913 		if (need_active_balance(&env)) {
4914 			raw_spin_lock_irqsave(&busiest->lock, flags);
4915 
4916 			/* don't kick the active_load_balance_cpu_stop,
4917 			 * if the curr task on busiest cpu can't be
4918 			 * moved to this_cpu
4919 			 */
4920 			if (!cpumask_test_cpu(this_cpu,
4921 					tsk_cpus_allowed(busiest->curr))) {
4922 				raw_spin_unlock_irqrestore(&busiest->lock,
4923 							    flags);
4924 				env.flags |= LBF_ALL_PINNED;
4925 				goto out_one_pinned;
4926 			}
4927 
4928 			/*
4929 			 * ->active_balance synchronizes accesses to
4930 			 * ->active_balance_work.  Once set, it's cleared
4931 			 * only after active load balance is finished.
4932 			 */
4933 			if (!busiest->active_balance) {
4934 				busiest->active_balance = 1;
4935 				busiest->push_cpu = this_cpu;
4936 				active_balance = 1;
4937 			}
4938 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
4939 
4940 			if (active_balance) {
4941 				stop_one_cpu_nowait(cpu_of(busiest),
4942 					active_load_balance_cpu_stop, busiest,
4943 					&busiest->active_balance_work);
4944 			}
4945 
4946 			/*
4947 			 * We've kicked active balancing, reset the failure
4948 			 * counter.
4949 			 */
4950 			sd->nr_balance_failed = sd->cache_nice_tries+1;
4951 		}
4952 	} else
4953 		sd->nr_balance_failed = 0;
4954 
4955 	if (likely(!active_balance)) {
4956 		/* We were unbalanced, so reset the balancing interval */
4957 		sd->balance_interval = sd->min_interval;
4958 	} else {
4959 		/*
4960 		 * If we've begun active balancing, start to back off. This
4961 		 * case may not be covered by the all_pinned logic if there
4962 		 * is only 1 task on the busy runqueue (because we don't call
4963 		 * move_tasks).
4964 		 */
4965 		if (sd->balance_interval < sd->max_interval)
4966 			sd->balance_interval *= 2;
4967 	}
4968 
4969 	goto out;
4970 
4971 out_balanced:
4972 	schedstat_inc(sd, lb_balanced[idle]);
4973 
4974 	sd->nr_balance_failed = 0;
4975 
4976 out_one_pinned:
4977 	/* tune up the balancing interval */
4978 	if (((env.flags & LBF_ALL_PINNED) &&
4979 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
4980 			(sd->balance_interval < sd->max_interval))
4981 		sd->balance_interval *= 2;
4982 
4983 	ld_moved = 0;
4984 out:
4985 	return ld_moved;
4986 }
4987 
4988 /*
4989  * idle_balance is called by schedule() if this_cpu is about to become
4990  * idle. Attempts to pull tasks from other CPUs.
4991  */
4992 void idle_balance(int this_cpu, struct rq *this_rq)
4993 {
4994 	struct sched_domain *sd;
4995 	int pulled_task = 0;
4996 	unsigned long next_balance = jiffies + HZ;
4997 
4998 	this_rq->idle_stamp = this_rq->clock;
4999 
5000 	if (this_rq->avg_idle < sysctl_sched_migration_cost)
5001 		return;
5002 
5003 	update_rq_runnable_avg(this_rq, 1);
5004 
5005 	/*
5006 	 * Drop the rq->lock, but keep IRQ/preempt disabled.
5007 	 */
5008 	raw_spin_unlock(&this_rq->lock);
5009 
5010 	update_blocked_averages(this_cpu);
5011 	rcu_read_lock();
5012 	for_each_domain(this_cpu, sd) {
5013 		unsigned long interval;
5014 		int balance = 1;
5015 
5016 		if (!(sd->flags & SD_LOAD_BALANCE))
5017 			continue;
5018 
5019 		if (sd->flags & SD_BALANCE_NEWIDLE) {
5020 			/* If we've pulled tasks over stop searching: */
5021 			pulled_task = load_balance(this_cpu, this_rq,
5022 						   sd, CPU_NEWLY_IDLE, &balance);
5023 		}
5024 
5025 		interval = msecs_to_jiffies(sd->balance_interval);
5026 		if (time_after(next_balance, sd->last_balance + interval))
5027 			next_balance = sd->last_balance + interval;
5028 		if (pulled_task) {
5029 			this_rq->idle_stamp = 0;
5030 			break;
5031 		}
5032 	}
5033 	rcu_read_unlock();
5034 
5035 	raw_spin_lock(&this_rq->lock);
5036 
5037 	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
5038 		/*
5039 		 * We are going idle. next_balance may be set based on
5040 		 * a busy processor. So reset next_balance.
5041 		 */
5042 		this_rq->next_balance = next_balance;
5043 	}
5044 }
5045 
5046 /*
5047  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
5048  * running tasks off the busiest CPU onto idle CPUs. It requires at
5049  * least 1 task to be running on each physical CPU where possible, and
5050  * avoids physical / logical imbalances.
5051  */
5052 static int active_load_balance_cpu_stop(void *data)
5053 {
5054 	struct rq *busiest_rq = data;
5055 	int busiest_cpu = cpu_of(busiest_rq);
5056 	int target_cpu = busiest_rq->push_cpu;
5057 	struct rq *target_rq = cpu_rq(target_cpu);
5058 	struct sched_domain *sd;
5059 
5060 	raw_spin_lock_irq(&busiest_rq->lock);
5061 
5062 	/* make sure the requested cpu hasn't gone down in the meantime */
5063 	if (unlikely(busiest_cpu != smp_processor_id() ||
5064 		     !busiest_rq->active_balance))
5065 		goto out_unlock;
5066 
5067 	/* Is there any task to move? */
5068 	if (busiest_rq->nr_running <= 1)
5069 		goto out_unlock;
5070 
5071 	/*
5072 	 * This condition is "impossible", if it occurs
5073 	 * we need to fix it. Originally reported by
5074 	 * Bjorn Helgaas on a 128-cpu setup.
5075 	 */
5076 	BUG_ON(busiest_rq == target_rq);
5077 
5078 	/* move a task from busiest_rq to target_rq */
5079 	double_lock_balance(busiest_rq, target_rq);
5080 
5081 	/* Search for an sd spanning us and the target CPU. */
5082 	rcu_read_lock();
5083 	for_each_domain(target_cpu, sd) {
5084 		if ((sd->flags & SD_LOAD_BALANCE) &&
5085 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
5086 				break;
5087 	}
5088 
5089 	if (likely(sd)) {
5090 		struct lb_env env = {
5091 			.sd		= sd,
5092 			.dst_cpu	= target_cpu,
5093 			.dst_rq		= target_rq,
5094 			.src_cpu	= busiest_rq->cpu,
5095 			.src_rq		= busiest_rq,
5096 			.idle		= CPU_IDLE,
5097 		};
5098 
5099 		schedstat_inc(sd, alb_count);
5100 
5101 		if (move_one_task(&env))
5102 			schedstat_inc(sd, alb_pushed);
5103 		else
5104 			schedstat_inc(sd, alb_failed);
5105 	}
5106 	rcu_read_unlock();
5107 	double_unlock_balance(busiest_rq, target_rq);
5108 out_unlock:
5109 	busiest_rq->active_balance = 0;
5110 	raw_spin_unlock_irq(&busiest_rq->lock);
5111 	return 0;
5112 }
5113 
5114 #ifdef CONFIG_NO_HZ
5115 /*
5116  * idle load balancing details
5117  * - When one of the busy CPUs notice that there may be an idle rebalancing
5118  *   needed, they will kick the idle load balancer, which then does idle
5119  *   load balancing for all the idle CPUs.
5120  */
5121 static struct {
5122 	cpumask_var_t idle_cpus_mask;
5123 	atomic_t nr_cpus;
5124 	unsigned long next_balance;     /* in jiffy units */
5125 } nohz ____cacheline_aligned;
5126 
5127 static inline int find_new_ilb(int call_cpu)
5128 {
5129 	int ilb = cpumask_first(nohz.idle_cpus_mask);
5130 
5131 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
5132 		return ilb;
5133 
5134 	return nr_cpu_ids;
5135 }
5136 
5137 /*
5138  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
5139  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
5140  * CPU (if there is one).
5141  */
5142 static void nohz_balancer_kick(int cpu)
5143 {
5144 	int ilb_cpu;
5145 
5146 	nohz.next_balance++;
5147 
5148 	ilb_cpu = find_new_ilb(cpu);
5149 
5150 	if (ilb_cpu >= nr_cpu_ids)
5151 		return;
5152 
5153 	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
5154 		return;
5155 	/*
5156 	 * Use smp_send_reschedule() instead of resched_cpu().
5157 	 * This way we generate a sched IPI on the target cpu which
5158 	 * is idle. And the softirq performing nohz idle load balance
5159 	 * will be run before returning from the IPI.
5160 	 */
5161 	smp_send_reschedule(ilb_cpu);
5162 	return;
5163 }
5164 
5165 static inline void nohz_balance_exit_idle(int cpu)
5166 {
5167 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5168 		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5169 		atomic_dec(&nohz.nr_cpus);
5170 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5171 	}
5172 }
5173 
5174 static inline void set_cpu_sd_state_busy(void)
5175 {
5176 	struct sched_domain *sd;
5177 	int cpu = smp_processor_id();
5178 
5179 	if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5180 		return;
5181 	clear_bit(NOHZ_IDLE, nohz_flags(cpu));
5182 
5183 	rcu_read_lock();
5184 	for_each_domain(cpu, sd)
5185 		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5186 	rcu_read_unlock();
5187 }
5188 
5189 void set_cpu_sd_state_idle(void)
5190 {
5191 	struct sched_domain *sd;
5192 	int cpu = smp_processor_id();
5193 
5194 	if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5195 		return;
5196 	set_bit(NOHZ_IDLE, nohz_flags(cpu));
5197 
5198 	rcu_read_lock();
5199 	for_each_domain(cpu, sd)
5200 		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5201 	rcu_read_unlock();
5202 }
5203 
5204 /*
5205  * This routine will record that the cpu is going idle with tick stopped.
5206  * This info will be used in performing idle load balancing in the future.
5207  */
5208 void nohz_balance_enter_idle(int cpu)
5209 {
5210 	/*
5211 	 * If this cpu is going down, then nothing needs to be done.
5212 	 */
5213 	if (!cpu_active(cpu))
5214 		return;
5215 
5216 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
5217 		return;
5218 
5219 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
5220 	atomic_inc(&nohz.nr_cpus);
5221 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5222 }
5223 
5224 static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
5225 					unsigned long action, void *hcpu)
5226 {
5227 	switch (action & ~CPU_TASKS_FROZEN) {
5228 	case CPU_DYING:
5229 		nohz_balance_exit_idle(smp_processor_id());
5230 		return NOTIFY_OK;
5231 	default:
5232 		return NOTIFY_DONE;
5233 	}
5234 }
5235 #endif
5236 
5237 static DEFINE_SPINLOCK(balancing);
5238 
5239 /*
5240  * Scale the max load_balance interval with the number of CPUs in the system.
5241  * This trades load-balance latency on larger machines for less cross talk.
5242  */
5243 void update_max_interval(void)
5244 {
5245 	max_load_balance_interval = HZ*num_online_cpus()/10;
5246 }
5247 
5248 /*
5249  * It checks each scheduling domain to see if it is due to be balanced,
5250  * and initiates a balancing operation if so.
5251  *
5252  * Balancing parameters are set up in arch_init_sched_domains.
5253  */
5254 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5255 {
5256 	int balance = 1;
5257 	struct rq *rq = cpu_rq(cpu);
5258 	unsigned long interval;
5259 	struct sched_domain *sd;
5260 	/* Earliest time when we have to do rebalance again */
5261 	unsigned long next_balance = jiffies + 60*HZ;
5262 	int update_next_balance = 0;
5263 	int need_serialize;
5264 
5265 	update_blocked_averages(cpu);
5266 
5267 	rcu_read_lock();
5268 	for_each_domain(cpu, sd) {
5269 		if (!(sd->flags & SD_LOAD_BALANCE))
5270 			continue;
5271 
5272 		interval = sd->balance_interval;
5273 		if (idle != CPU_IDLE)
5274 			interval *= sd->busy_factor;
5275 
5276 		/* scale ms to jiffies */
5277 		interval = msecs_to_jiffies(interval);
5278 		interval = clamp(interval, 1UL, max_load_balance_interval);
5279 
5280 		need_serialize = sd->flags & SD_SERIALIZE;
5281 
5282 		if (need_serialize) {
5283 			if (!spin_trylock(&balancing))
5284 				goto out;
5285 		}
5286 
5287 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
5288 			if (load_balance(cpu, rq, sd, idle, &balance)) {
5289 				/*
5290 				 * We've pulled tasks over so either we're no
5291 				 * longer idle.
5292 				 */
5293 				idle = CPU_NOT_IDLE;
5294 			}
5295 			sd->last_balance = jiffies;
5296 		}
5297 		if (need_serialize)
5298 			spin_unlock(&balancing);
5299 out:
5300 		if (time_after(next_balance, sd->last_balance + interval)) {
5301 			next_balance = sd->last_balance + interval;
5302 			update_next_balance = 1;
5303 		}
5304 
5305 		/*
5306 		 * Stop the load balance at this level. There is another
5307 		 * CPU in our sched group which is doing load balancing more
5308 		 * actively.
5309 		 */
5310 		if (!balance)
5311 			break;
5312 	}
5313 	rcu_read_unlock();
5314 
5315 	/*
5316 	 * next_balance will be updated only when there is a need.
5317 	 * When the cpu is attached to null domain for ex, it will not be
5318 	 * updated.
5319 	 */
5320 	if (likely(update_next_balance))
5321 		rq->next_balance = next_balance;
5322 }
5323 
5324 #ifdef CONFIG_NO_HZ
5325 /*
5326  * In CONFIG_NO_HZ case, the idle balance kickee will do the
5327  * rebalancing for all the cpus for whom scheduler ticks are stopped.
5328  */
5329 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5330 {
5331 	struct rq *this_rq = cpu_rq(this_cpu);
5332 	struct rq *rq;
5333 	int balance_cpu;
5334 
5335 	if (idle != CPU_IDLE ||
5336 	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5337 		goto end;
5338 
5339 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
5340 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
5341 			continue;
5342 
5343 		/*
5344 		 * If this cpu gets work to do, stop the load balancing
5345 		 * work being done for other cpus. Next load
5346 		 * balancing owner will pick it up.
5347 		 */
5348 		if (need_resched())
5349 			break;
5350 
5351 		rq = cpu_rq(balance_cpu);
5352 
5353 		raw_spin_lock_irq(&rq->lock);
5354 		update_rq_clock(rq);
5355 		update_idle_cpu_load(rq);
5356 		raw_spin_unlock_irq(&rq->lock);
5357 
5358 		rebalance_domains(balance_cpu, CPU_IDLE);
5359 
5360 		if (time_after(this_rq->next_balance, rq->next_balance))
5361 			this_rq->next_balance = rq->next_balance;
5362 	}
5363 	nohz.next_balance = this_rq->next_balance;
5364 end:
5365 	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
5366 }
5367 
5368 /*
5369  * Current heuristic for kicking the idle load balancer in the presence
5370  * of an idle cpu is the system.
5371  *   - This rq has more than one task.
5372  *   - At any scheduler domain level, this cpu's scheduler group has multiple
5373  *     busy cpu's exceeding the group's power.
5374  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
5375  *     domain span are idle.
5376  */
5377 static inline int nohz_kick_needed(struct rq *rq, int cpu)
5378 {
5379 	unsigned long now = jiffies;
5380 	struct sched_domain *sd;
5381 
5382 	if (unlikely(idle_cpu(cpu)))
5383 		return 0;
5384 
5385        /*
5386 	* We may be recently in ticked or tickless idle mode. At the first
5387 	* busy tick after returning from idle, we will update the busy stats.
5388 	*/
5389 	set_cpu_sd_state_busy();
5390 	nohz_balance_exit_idle(cpu);
5391 
5392 	/*
5393 	 * None are in tickless mode and hence no need for NOHZ idle load
5394 	 * balancing.
5395 	 */
5396 	if (likely(!atomic_read(&nohz.nr_cpus)))
5397 		return 0;
5398 
5399 	if (time_before(now, nohz.next_balance))
5400 		return 0;
5401 
5402 	if (rq->nr_running >= 2)
5403 		goto need_kick;
5404 
5405 	rcu_read_lock();
5406 	for_each_domain(cpu, sd) {
5407 		struct sched_group *sg = sd->groups;
5408 		struct sched_group_power *sgp = sg->sgp;
5409 		int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5410 
5411 		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
5412 			goto need_kick_unlock;
5413 
5414 		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5415 		    && (cpumask_first_and(nohz.idle_cpus_mask,
5416 					  sched_domain_span(sd)) < cpu))
5417 			goto need_kick_unlock;
5418 
5419 		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5420 			break;
5421 	}
5422 	rcu_read_unlock();
5423 	return 0;
5424 
5425 need_kick_unlock:
5426 	rcu_read_unlock();
5427 need_kick:
5428 	return 1;
5429 }
5430 #else
5431 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
5432 #endif
5433 
5434 /*
5435  * run_rebalance_domains is triggered when needed from the scheduler tick.
5436  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
5437  */
5438 static void run_rebalance_domains(struct softirq_action *h)
5439 {
5440 	int this_cpu = smp_processor_id();
5441 	struct rq *this_rq = cpu_rq(this_cpu);
5442 	enum cpu_idle_type idle = this_rq->idle_balance ?
5443 						CPU_IDLE : CPU_NOT_IDLE;
5444 
5445 	rebalance_domains(this_cpu, idle);
5446 
5447 	/*
5448 	 * If this cpu has a pending nohz_balance_kick, then do the
5449 	 * balancing on behalf of the other idle cpus whose ticks are
5450 	 * stopped.
5451 	 */
5452 	nohz_idle_balance(this_cpu, idle);
5453 }
5454 
5455 static inline int on_null_domain(int cpu)
5456 {
5457 	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
5458 }
5459 
5460 /*
5461  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
5462  */
5463 void trigger_load_balance(struct rq *rq, int cpu)
5464 {
5465 	/* Don't need to rebalance while attached to NULL domain */
5466 	if (time_after_eq(jiffies, rq->next_balance) &&
5467 	    likely(!on_null_domain(cpu)))
5468 		raise_softirq(SCHED_SOFTIRQ);
5469 #ifdef CONFIG_NO_HZ
5470 	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
5471 		nohz_balancer_kick(cpu);
5472 #endif
5473 }
5474 
5475 static void rq_online_fair(struct rq *rq)
5476 {
5477 	update_sysctl();
5478 }
5479 
5480 static void rq_offline_fair(struct rq *rq)
5481 {
5482 	update_sysctl();
5483 
5484 	/* Ensure any throttled groups are reachable by pick_next_task */
5485 	unthrottle_offline_cfs_rqs(rq);
5486 }
5487 
5488 #endif /* CONFIG_SMP */
5489 
5490 /*
5491  * scheduler tick hitting a task of our scheduling class:
5492  */
5493 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
5494 {
5495 	struct cfs_rq *cfs_rq;
5496 	struct sched_entity *se = &curr->se;
5497 
5498 	for_each_sched_entity(se) {
5499 		cfs_rq = cfs_rq_of(se);
5500 		entity_tick(cfs_rq, se, queued);
5501 	}
5502 
5503 	update_rq_runnable_avg(rq, 1);
5504 }
5505 
5506 /*
5507  * called on fork with the child task as argument from the parent's context
5508  *  - child not yet on the tasklist
5509  *  - preemption disabled
5510  */
5511 static void task_fork_fair(struct task_struct *p)
5512 {
5513 	struct cfs_rq *cfs_rq;
5514 	struct sched_entity *se = &p->se, *curr;
5515 	int this_cpu = smp_processor_id();
5516 	struct rq *rq = this_rq();
5517 	unsigned long flags;
5518 
5519 	raw_spin_lock_irqsave(&rq->lock, flags);
5520 
5521 	update_rq_clock(rq);
5522 
5523 	cfs_rq = task_cfs_rq(current);
5524 	curr = cfs_rq->curr;
5525 
5526 	if (unlikely(task_cpu(p) != this_cpu)) {
5527 		rcu_read_lock();
5528 		__set_task_cpu(p, this_cpu);
5529 		rcu_read_unlock();
5530 	}
5531 
5532 	update_curr(cfs_rq);
5533 
5534 	if (curr)
5535 		se->vruntime = curr->vruntime;
5536 	place_entity(cfs_rq, se, 1);
5537 
5538 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
5539 		/*
5540 		 * Upon rescheduling, sched_class::put_prev_task() will place
5541 		 * 'current' within the tree based on its new key value.
5542 		 */
5543 		swap(curr->vruntime, se->vruntime);
5544 		resched_task(rq->curr);
5545 	}
5546 
5547 	se->vruntime -= cfs_rq->min_vruntime;
5548 
5549 	raw_spin_unlock_irqrestore(&rq->lock, flags);
5550 }
5551 
5552 /*
5553  * Priority of the task has changed. Check to see if we preempt
5554  * the current task.
5555  */
5556 static void
5557 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
5558 {
5559 	if (!p->se.on_rq)
5560 		return;
5561 
5562 	/*
5563 	 * Reschedule if we are currently running on this runqueue and
5564 	 * our priority decreased, or if we are not currently running on
5565 	 * this runqueue and our priority is higher than the current's
5566 	 */
5567 	if (rq->curr == p) {
5568 		if (p->prio > oldprio)
5569 			resched_task(rq->curr);
5570 	} else
5571 		check_preempt_curr(rq, p, 0);
5572 }
5573 
5574 static void switched_from_fair(struct rq *rq, struct task_struct *p)
5575 {
5576 	struct sched_entity *se = &p->se;
5577 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
5578 
5579 	/*
5580 	 * Ensure the task's vruntime is normalized, so that when its
5581 	 * switched back to the fair class the enqueue_entity(.flags=0) will
5582 	 * do the right thing.
5583 	 *
5584 	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
5585 	 * have normalized the vruntime, if it was !on_rq, then only when
5586 	 * the task is sleeping will it still have non-normalized vruntime.
5587 	 */
5588 	if (!se->on_rq && p->state != TASK_RUNNING) {
5589 		/*
5590 		 * Fix up our vruntime so that the current sleep doesn't
5591 		 * cause 'unlimited' sleep bonus.
5592 		 */
5593 		place_entity(cfs_rq, se, 0);
5594 		se->vruntime -= cfs_rq->min_vruntime;
5595 	}
5596 
5597 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5598 	/*
5599 	* Remove our load from contribution when we leave sched_fair
5600 	* and ensure we don't carry in an old decay_count if we
5601 	* switch back.
5602 	*/
5603 	if (p->se.avg.decay_count) {
5604 		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
5605 		__synchronize_entity_decay(&p->se);
5606 		subtract_blocked_load_contrib(cfs_rq,
5607 				p->se.avg.load_avg_contrib);
5608 	}
5609 #endif
5610 }
5611 
5612 /*
5613  * We switched to the sched_fair class.
5614  */
5615 static void switched_to_fair(struct rq *rq, struct task_struct *p)
5616 {
5617 	if (!p->se.on_rq)
5618 		return;
5619 
5620 	/*
5621 	 * We were most likely switched from sched_rt, so
5622 	 * kick off the schedule if running, otherwise just see
5623 	 * if we can still preempt the current task.
5624 	 */
5625 	if (rq->curr == p)
5626 		resched_task(rq->curr);
5627 	else
5628 		check_preempt_curr(rq, p, 0);
5629 }
5630 
5631 /* Account for a task changing its policy or group.
5632  *
5633  * This routine is mostly called to set cfs_rq->curr field when a task
5634  * migrates between groups/classes.
5635  */
5636 static void set_curr_task_fair(struct rq *rq)
5637 {
5638 	struct sched_entity *se = &rq->curr->se;
5639 
5640 	for_each_sched_entity(se) {
5641 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
5642 
5643 		set_next_entity(cfs_rq, se);
5644 		/* ensure bandwidth has been allocated on our new cfs_rq */
5645 		account_cfs_rq_runtime(cfs_rq, 0);
5646 	}
5647 }
5648 
5649 void init_cfs_rq(struct cfs_rq *cfs_rq)
5650 {
5651 	cfs_rq->tasks_timeline = RB_ROOT;
5652 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5653 #ifndef CONFIG_64BIT
5654 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5655 #endif
5656 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5657 	atomic64_set(&cfs_rq->decay_counter, 1);
5658 	atomic64_set(&cfs_rq->removed_load, 0);
5659 #endif
5660 }
5661 
5662 #ifdef CONFIG_FAIR_GROUP_SCHED
5663 static void task_move_group_fair(struct task_struct *p, int on_rq)
5664 {
5665 	struct cfs_rq *cfs_rq;
5666 	/*
5667 	 * If the task was not on the rq at the time of this cgroup movement
5668 	 * it must have been asleep, sleeping tasks keep their ->vruntime
5669 	 * absolute on their old rq until wakeup (needed for the fair sleeper
5670 	 * bonus in place_entity()).
5671 	 *
5672 	 * If it was on the rq, we've just 'preempted' it, which does convert
5673 	 * ->vruntime to a relative base.
5674 	 *
5675 	 * Make sure both cases convert their relative position when migrating
5676 	 * to another cgroup's rq. This does somewhat interfere with the
5677 	 * fair sleeper stuff for the first placement, but who cares.
5678 	 */
5679 	/*
5680 	 * When !on_rq, vruntime of the task has usually NOT been normalized.
5681 	 * But there are some cases where it has already been normalized:
5682 	 *
5683 	 * - Moving a forked child which is waiting for being woken up by
5684 	 *   wake_up_new_task().
5685 	 * - Moving a task which has been woken up by try_to_wake_up() and
5686 	 *   waiting for actually being woken up by sched_ttwu_pending().
5687 	 *
5688 	 * To prevent boost or penalty in the new cfs_rq caused by delta
5689 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5690 	 */
5691 	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5692 		on_rq = 1;
5693 
5694 	if (!on_rq)
5695 		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5696 	set_task_rq(p, task_cpu(p));
5697 	if (!on_rq) {
5698 		cfs_rq = cfs_rq_of(&p->se);
5699 		p->se.vruntime += cfs_rq->min_vruntime;
5700 #ifdef CONFIG_SMP
5701 		/*
5702 		 * migrate_task_rq_fair() will have removed our previous
5703 		 * contribution, but we must synchronize for ongoing future
5704 		 * decay.
5705 		 */
5706 		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
5707 		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
5708 #endif
5709 	}
5710 }
5711 
5712 void free_fair_sched_group(struct task_group *tg)
5713 {
5714 	int i;
5715 
5716 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5717 
5718 	for_each_possible_cpu(i) {
5719 		if (tg->cfs_rq)
5720 			kfree(tg->cfs_rq[i]);
5721 		if (tg->se)
5722 			kfree(tg->se[i]);
5723 	}
5724 
5725 	kfree(tg->cfs_rq);
5726 	kfree(tg->se);
5727 }
5728 
5729 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5730 {
5731 	struct cfs_rq *cfs_rq;
5732 	struct sched_entity *se;
5733 	int i;
5734 
5735 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5736 	if (!tg->cfs_rq)
5737 		goto err;
5738 	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5739 	if (!tg->se)
5740 		goto err;
5741 
5742 	tg->shares = NICE_0_LOAD;
5743 
5744 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5745 
5746 	for_each_possible_cpu(i) {
5747 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5748 				      GFP_KERNEL, cpu_to_node(i));
5749 		if (!cfs_rq)
5750 			goto err;
5751 
5752 		se = kzalloc_node(sizeof(struct sched_entity),
5753 				  GFP_KERNEL, cpu_to_node(i));
5754 		if (!se)
5755 			goto err_free_rq;
5756 
5757 		init_cfs_rq(cfs_rq);
5758 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5759 	}
5760 
5761 	return 1;
5762 
5763 err_free_rq:
5764 	kfree(cfs_rq);
5765 err:
5766 	return 0;
5767 }
5768 
5769 void unregister_fair_sched_group(struct task_group *tg, int cpu)
5770 {
5771 	struct rq *rq = cpu_rq(cpu);
5772 	unsigned long flags;
5773 
5774 	/*
5775 	* Only empty task groups can be destroyed; so we can speculatively
5776 	* check on_list without danger of it being re-added.
5777 	*/
5778 	if (!tg->cfs_rq[cpu]->on_list)
5779 		return;
5780 
5781 	raw_spin_lock_irqsave(&rq->lock, flags);
5782 	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5783 	raw_spin_unlock_irqrestore(&rq->lock, flags);
5784 }
5785 
5786 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5787 			struct sched_entity *se, int cpu,
5788 			struct sched_entity *parent)
5789 {
5790 	struct rq *rq = cpu_rq(cpu);
5791 
5792 	cfs_rq->tg = tg;
5793 	cfs_rq->rq = rq;
5794 	init_cfs_rq_runtime(cfs_rq);
5795 
5796 	tg->cfs_rq[cpu] = cfs_rq;
5797 	tg->se[cpu] = se;
5798 
5799 	/* se could be NULL for root_task_group */
5800 	if (!se)
5801 		return;
5802 
5803 	if (!parent)
5804 		se->cfs_rq = &rq->cfs;
5805 	else
5806 		se->cfs_rq = parent->my_q;
5807 
5808 	se->my_q = cfs_rq;
5809 	update_load_set(&se->load, 0);
5810 	se->parent = parent;
5811 }
5812 
5813 static DEFINE_MUTEX(shares_mutex);
5814 
5815 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
5816 {
5817 	int i;
5818 	unsigned long flags;
5819 
5820 	/*
5821 	 * We can't change the weight of the root cgroup.
5822 	 */
5823 	if (!tg->se[0])
5824 		return -EINVAL;
5825 
5826 	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
5827 
5828 	mutex_lock(&shares_mutex);
5829 	if (tg->shares == shares)
5830 		goto done;
5831 
5832 	tg->shares = shares;
5833 	for_each_possible_cpu(i) {
5834 		struct rq *rq = cpu_rq(i);
5835 		struct sched_entity *se;
5836 
5837 		se = tg->se[i];
5838 		/* Propagate contribution to hierarchy */
5839 		raw_spin_lock_irqsave(&rq->lock, flags);
5840 		for_each_sched_entity(se) {
5841 			update_cfs_shares(group_cfs_rq(se));
5842 			/* update contribution to parent */
5843 			update_entity_load_avg(se, 1);
5844 		}
5845 		raw_spin_unlock_irqrestore(&rq->lock, flags);
5846 	}
5847 
5848 done:
5849 	mutex_unlock(&shares_mutex);
5850 	return 0;
5851 }
5852 #else /* CONFIG_FAIR_GROUP_SCHED */
5853 
5854 void free_fair_sched_group(struct task_group *tg) { }
5855 
5856 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5857 {
5858 	return 1;
5859 }
5860 
5861 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
5862 
5863 #endif /* CONFIG_FAIR_GROUP_SCHED */
5864 
5865 
5866 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
5867 {
5868 	struct sched_entity *se = &task->se;
5869 	unsigned int rr_interval = 0;
5870 
5871 	/*
5872 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
5873 	 * idle runqueue:
5874 	 */
5875 	if (rq->cfs.load.weight)
5876 		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
5877 
5878 	return rr_interval;
5879 }
5880 
5881 /*
5882  * All the scheduling class methods:
5883  */
5884 const struct sched_class fair_sched_class = {
5885 	.next			= &idle_sched_class,
5886 	.enqueue_task		= enqueue_task_fair,
5887 	.dequeue_task		= dequeue_task_fair,
5888 	.yield_task		= yield_task_fair,
5889 	.yield_to_task		= yield_to_task_fair,
5890 
5891 	.check_preempt_curr	= check_preempt_wakeup,
5892 
5893 	.pick_next_task		= pick_next_task_fair,
5894 	.put_prev_task		= put_prev_task_fair,
5895 
5896 #ifdef CONFIG_SMP
5897 	.select_task_rq		= select_task_rq_fair,
5898 #ifdef CONFIG_FAIR_GROUP_SCHED
5899 	.migrate_task_rq	= migrate_task_rq_fair,
5900 #endif
5901 	.rq_online		= rq_online_fair,
5902 	.rq_offline		= rq_offline_fair,
5903 
5904 	.task_waking		= task_waking_fair,
5905 #endif
5906 
5907 	.set_curr_task          = set_curr_task_fair,
5908 	.task_tick		= task_tick_fair,
5909 	.task_fork		= task_fork_fair,
5910 
5911 	.prio_changed		= prio_changed_fair,
5912 	.switched_from		= switched_from_fair,
5913 	.switched_to		= switched_to_fair,
5914 
5915 	.get_rr_interval	= get_rr_interval_fair,
5916 
5917 #ifdef CONFIG_FAIR_GROUP_SCHED
5918 	.task_move_group	= task_move_group_fair,
5919 #endif
5920 };
5921 
5922 #ifdef CONFIG_SCHED_DEBUG
5923 void print_cfs_stats(struct seq_file *m, int cpu)
5924 {
5925 	struct cfs_rq *cfs_rq;
5926 
5927 	rcu_read_lock();
5928 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
5929 		print_cfs_rq(m, cpu, cfs_rq);
5930 	rcu_read_unlock();
5931 }
5932 #endif
5933 
5934 __init void init_sched_fair_class(void)
5935 {
5936 #ifdef CONFIG_SMP
5937 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5938 
5939 #ifdef CONFIG_NO_HZ
5940 	nohz.next_balance = jiffies;
5941 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5942 	cpu_notifier(sched_ilb_notifier, 0);
5943 #endif
5944 #endif /* SMP */
5945 
5946 }
5947