xref: /linux/kernel/sched/core.c (revision ec2212088c42ff7d1362629ec26dda4f3e8bdad3)
1 /*
2  *  kernel/sched/core.c
3  *
4  *  Kernel scheduler and related syscalls
5  *
6  *  Copyright (C) 1991-2002  Linus Torvalds
7  *
8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
9  *		make semaphores SMP safe
10  *  1998-11-19	Implemented schedule_timeout() and related stuff
11  *		by Andrea Arcangeli
12  *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
13  *		hybrid priority-list and round-robin design with
14  *		an array-switch method of distributing timeslices
15  *		and per-CPU runqueues.  Cleanups and useful suggestions
16  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
17  *  2003-09-03	Interactivity tuning by Con Kolivas.
18  *  2004-04-02	Scheduler domains code by Nick Piggin
19  *  2007-04-15  Work begun on replacing all interactivity tuning with a
20  *              fair scheduling design by Con Kolivas.
21  *  2007-05-05  Load balancing (smp-nice) and other improvements
22  *              by Peter Williams
23  *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
24  *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
25  *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
26  *              Thomas Gleixner, Mike Kravetz
27  */
28 
29 #include <linux/mm.h>
30 #include <linux/module.h>
31 #include <linux/nmi.h>
32 #include <linux/init.h>
33 #include <linux/uaccess.h>
34 #include <linux/highmem.h>
35 #include <asm/mmu_context.h>
36 #include <linux/interrupt.h>
37 #include <linux/capability.h>
38 #include <linux/completion.h>
39 #include <linux/kernel_stat.h>
40 #include <linux/debug_locks.h>
41 #include <linux/perf_event.h>
42 #include <linux/security.h>
43 #include <linux/notifier.h>
44 #include <linux/profile.h>
45 #include <linux/freezer.h>
46 #include <linux/vmalloc.h>
47 #include <linux/blkdev.h>
48 #include <linux/delay.h>
49 #include <linux/pid_namespace.h>
50 #include <linux/smp.h>
51 #include <linux/threads.h>
52 #include <linux/timer.h>
53 #include <linux/rcupdate.h>
54 #include <linux/cpu.h>
55 #include <linux/cpuset.h>
56 #include <linux/percpu.h>
57 #include <linux/proc_fs.h>
58 #include <linux/seq_file.h>
59 #include <linux/sysctl.h>
60 #include <linux/syscalls.h>
61 #include <linux/times.h>
62 #include <linux/tsacct_kern.h>
63 #include <linux/kprobes.h>
64 #include <linux/delayacct.h>
65 #include <linux/unistd.h>
66 #include <linux/pagemap.h>
67 #include <linux/hrtimer.h>
68 #include <linux/tick.h>
69 #include <linux/debugfs.h>
70 #include <linux/ctype.h>
71 #include <linux/ftrace.h>
72 #include <linux/slab.h>
73 #include <linux/init_task.h>
74 #include <linux/binfmts.h>
75 
76 #include <asm/tlb.h>
77 #include <asm/irq_regs.h>
78 #include <asm/mutex.h>
79 #ifdef CONFIG_PARAVIRT
80 #include <asm/paravirt.h>
81 #endif
82 
83 #include "sched.h"
84 #include "../workqueue_sched.h"
85 
86 #define CREATE_TRACE_POINTS
87 #include <trace/events/sched.h>
88 
89 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
90 {
91 	unsigned long delta;
92 	ktime_t soft, hard, now;
93 
94 	for (;;) {
95 		if (hrtimer_active(period_timer))
96 			break;
97 
98 		now = hrtimer_cb_get_time(period_timer);
99 		hrtimer_forward(period_timer, now, period);
100 
101 		soft = hrtimer_get_softexpires(period_timer);
102 		hard = hrtimer_get_expires(period_timer);
103 		delta = ktime_to_ns(ktime_sub(hard, soft));
104 		__hrtimer_start_range_ns(period_timer, soft, delta,
105 					 HRTIMER_MODE_ABS_PINNED, 0);
106 	}
107 }
108 
109 DEFINE_MUTEX(sched_domains_mutex);
110 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
111 
112 static void update_rq_clock_task(struct rq *rq, s64 delta);
113 
114 void update_rq_clock(struct rq *rq)
115 {
116 	s64 delta;
117 
118 	if (rq->skip_clock_update > 0)
119 		return;
120 
121 	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
122 	rq->clock += delta;
123 	update_rq_clock_task(rq, delta);
124 }
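
/*
 * rq->clock is the raw per-runqueue view of sched_clock_cpu();
 * update_rq_clock_task() below additionally subtracts irq time and
 * (when configured) paravirt steal time, so that rq->clock_task only
 * advances while a task is actually executing.
 */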
125 
126 /*
127  * Debugging: various feature bits
128  */
129 
130 #define SCHED_FEAT(name, enabled)	\
131 	(1UL << __SCHED_FEAT_##name) * enabled |
132 
133 const_debug unsigned int sysctl_sched_features =
134 #include "features.h"
135 	0;
136 
137 #undef SCHED_FEAT
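
/*
 * features.h is expanded several times in this file with different
 * SCHED_FEAT() definitions.  Here an entry such as
 * SCHED_FEAT(TTWU_QUEUE, true) becomes
 * "(1UL << __SCHED_FEAT_TTWU_QUEUE) * true |", so the include above
 * builds the default feature bitmask; below, under CONFIG_SCHED_DEBUG,
 * the same entries generate the name table and (with jump labels) the
 * static_key array.
 */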
138 
139 #ifdef CONFIG_SCHED_DEBUG
140 #define SCHED_FEAT(name, enabled)	\
141 	#name ,
142 
143 static __read_mostly char *sched_feat_names[] = {
144 #include "features.h"
145 	NULL
146 };
147 
148 #undef SCHED_FEAT
149 
150 static int sched_feat_show(struct seq_file *m, void *v)
151 {
152 	int i;
153 
154 	for (i = 0; i < __SCHED_FEAT_NR; i++) {
155 		if (!(sysctl_sched_features & (1UL << i)))
156 			seq_puts(m, "NO_");
157 		seq_printf(m, "%s ", sched_feat_names[i]);
158 	}
159 	seq_puts(m, "\n");
160 
161 	return 0;
162 }
163 
164 #ifdef HAVE_JUMP_LABEL
165 
166 #define jump_label_key__true  STATIC_KEY_INIT_TRUE
167 #define jump_label_key__false STATIC_KEY_INIT_FALSE
168 
169 #define SCHED_FEAT(name, enabled)	\
170 	jump_label_key__##enabled ,
171 
172 struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
173 #include "features.h"
174 };
175 
176 #undef SCHED_FEAT
177 
178 static void sched_feat_disable(int i)
179 {
180 	if (static_key_enabled(&sched_feat_keys[i]))
181 		static_key_slow_dec(&sched_feat_keys[i]);
182 }
183 
184 static void sched_feat_enable(int i)
185 {
186 	if (!static_key_enabled(&sched_feat_keys[i]))
187 		static_key_slow_inc(&sched_feat_keys[i]);
188 }
189 #else
190 static void sched_feat_disable(int i) { };
191 static void sched_feat_enable(int i) { };
192 #endif /* HAVE_JUMP_LABEL */
193 
194 static ssize_t
195 sched_feat_write(struct file *filp, const char __user *ubuf,
196 		size_t cnt, loff_t *ppos)
197 {
198 	char buf[64];
199 	char *cmp;
200 	int neg = 0;
201 	int i;
202 
203 	if (cnt > 63)
204 		cnt = 63;
205 
206 	if (copy_from_user(&buf, ubuf, cnt))
207 		return -EFAULT;
208 
209 	buf[cnt] = 0;
210 	cmp = strstrip(buf);
211 
212 	if (strncmp(cmp, "NO_", 3) == 0) {
213 		neg = 1;
214 		cmp += 3;
215 	}
216 
217 	for (i = 0; i < __SCHED_FEAT_NR; i++) {
218 		if (strcmp(cmp, sched_feat_names[i]) == 0) {
219 			if (neg) {
220 				sysctl_sched_features &= ~(1UL << i);
221 				sched_feat_disable(i);
222 			} else {
223 				sysctl_sched_features |= (1UL << i);
224 				sched_feat_enable(i);
225 			}
226 			break;
227 		}
228 	}
229 
230 	if (i == __SCHED_FEAT_NR)
231 		return -EINVAL;
232 
233 	*ppos += cnt;
234 
235 	return cnt;
236 }
237 
238 static int sched_feat_open(struct inode *inode, struct file *filp)
239 {
240 	return single_open(filp, sched_feat_show, NULL);
241 }
242 
243 static const struct file_operations sched_feat_fops = {
244 	.open		= sched_feat_open,
245 	.write		= sched_feat_write,
246 	.read		= seq_read,
247 	.llseek		= seq_lseek,
248 	.release	= single_release,
249 };
250 
251 static __init int sched_init_debug(void)
252 {
253 	debugfs_create_file("sched_features", 0644, NULL, NULL,
254 			&sched_feat_fops);
255 
256 	return 0;
257 }
258 late_initcall(sched_init_debug);
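
/*
 * Example use of the resulting file, assuming debugfs is mounted at
 * /sys/kernel/debug:
 *
 *	# cat /sys/kernel/debug/sched_features
 *	# echo NO_TTWU_QUEUE > /sys/kernel/debug/sched_features
 *
 * Writing "NO_<feature>" clears the feature bit, writing the bare name
 * sets it again, as parsed by sched_feat_write() above.
 */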
259 #endif /* CONFIG_SCHED_DEBUG */
260 
261 /*
262  * Number of tasks to iterate in a single balance run.
263  * Limited because this is done with IRQs disabled.
264  */
265 const_debug unsigned int sysctl_sched_nr_migrate = 32;
266 
267 /*
268  * period over which we average the RT time consumption, measured
269  * in ms.
270  *
271  * default: 1s
272  */
273 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
274 
275 /*
276  * period over which we measure -rt task cpu usage in us.
277  * default: 1s
278  */
279 unsigned int sysctl_sched_rt_period = 1000000;
280 
281 __read_mostly int scheduler_running;
282 
283 /*
284  * part of the period that we allow rt tasks to run in us.
285  * default: 0.95s
286  */
287 int sysctl_sched_rt_runtime = 950000;
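
/*
 * With the defaults above, realtime tasks may consume at most
 * 950000/1000000 = 95% of each 1s period; the remaining 5% is left for
 * non-realtime tasks.  Both values are tunable at runtime (typically
 * via /proc/sys/kernel/sched_rt_period_us and sched_rt_runtime_us),
 * and a runtime of -1 disables the throttling altogether.
 */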
288 
289 
290 
291 /*
292  * __task_rq_lock - lock the rq @p resides on.
293  */
294 static inline struct rq *__task_rq_lock(struct task_struct *p)
295 	__acquires(rq->lock)
296 {
297 	struct rq *rq;
298 
299 	lockdep_assert_held(&p->pi_lock);
300 
301 	for (;;) {
302 		rq = task_rq(p);
303 		raw_spin_lock(&rq->lock);
304 		if (likely(rq == task_rq(p)))
305 			return rq;
306 		raw_spin_unlock(&rq->lock);
307 	}
308 }
309 
310 /*
311  * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
312  */
313 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
314 	__acquires(p->pi_lock)
315 	__acquires(rq->lock)
316 {
317 	struct rq *rq;
318 
319 	for (;;) {
320 		raw_spin_lock_irqsave(&p->pi_lock, *flags);
321 		rq = task_rq(p);
322 		raw_spin_lock(&rq->lock);
323 		if (likely(rq == task_rq(p)))
324 			return rq;
325 		raw_spin_unlock(&rq->lock);
326 		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
327 	}
328 }
329 
330 static void __task_rq_unlock(struct rq *rq)
331 	__releases(rq->lock)
332 {
333 	raw_spin_unlock(&rq->lock);
334 }
335 
336 static inline void
337 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
338 	__releases(rq->lock)
339 	__releases(p->pi_lock)
340 {
341 	raw_spin_unlock(&rq->lock);
342 	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
343 }
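
/*
 * Typical locking pattern for code that needs a stable task<->rq
 * association (a sketch, not a new interface):
 *
 *	unsigned long flags;
 *	struct rq *rq;
 *
 *	rq = task_rq_lock(p, &flags);
 *	... @p cannot change runqueue here ...
 *	task_rq_unlock(rq, p, &flags);
 *
 * The retry loops above are needed because @p may be migrated between
 * reading task_rq(p) and acquiring that runqueue's lock.
 */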
344 
345 /*
346  * this_rq_lock - lock this runqueue and disable interrupts.
347  */
348 static struct rq *this_rq_lock(void)
349 	__acquires(rq->lock)
350 {
351 	struct rq *rq;
352 
353 	local_irq_disable();
354 	rq = this_rq();
355 	raw_spin_lock(&rq->lock);
356 
357 	return rq;
358 }
359 
360 #ifdef CONFIG_SCHED_HRTICK
361 /*
362  * Use HR-timers to deliver accurate preemption points.
363  *
364  * It's all a bit involved since we cannot program an hrtimer while holding
365  * the rq->lock. So what we do is store state in rq->hrtick_* and ask for a
366  * reschedule event.
367  *
368  * When we get rescheduled we reprogram the hrtick_timer outside of the
369  * rq->lock.
370  */
371 
372 static void hrtick_clear(struct rq *rq)
373 {
374 	if (hrtimer_active(&rq->hrtick_timer))
375 		hrtimer_cancel(&rq->hrtick_timer);
376 }
377 
378 /*
379  * High-resolution timer tick.
380  * Runs from hardirq context with interrupts disabled.
381  */
382 static enum hrtimer_restart hrtick(struct hrtimer *timer)
383 {
384 	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
385 
386 	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
387 
388 	raw_spin_lock(&rq->lock);
389 	update_rq_clock(rq);
390 	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
391 	raw_spin_unlock(&rq->lock);
392 
393 	return HRTIMER_NORESTART;
394 }
395 
396 #ifdef CONFIG_SMP
397 /*
398  * called from hardirq (IPI) context
399  */
400 static void __hrtick_start(void *arg)
401 {
402 	struct rq *rq = arg;
403 
404 	raw_spin_lock(&rq->lock);
405 	hrtimer_restart(&rq->hrtick_timer);
406 	rq->hrtick_csd_pending = 0;
407 	raw_spin_unlock(&rq->lock);
408 }
409 
410 /*
411  * Called to set the hrtick timer state.
412  *
413  * called with rq->lock held and irqs disabled
414  */
415 void hrtick_start(struct rq *rq, u64 delay)
416 {
417 	struct hrtimer *timer = &rq->hrtick_timer;
418 	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
419 
420 	hrtimer_set_expires(timer, time);
421 
422 	if (rq == this_rq()) {
423 		hrtimer_restart(timer);
424 	} else if (!rq->hrtick_csd_pending) {
425 		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
426 		rq->hrtick_csd_pending = 1;
427 	}
428 }
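
/*
 * For a remote runqueue the timer is not armed here; instead a
 * single-function IPI (rq->hrtick_csd) asks the owning CPU to run
 * __hrtick_start() and arm its own rq->hrtick_timer.  The
 * hrtick_csd_pending flag avoids queueing the same csd twice.
 */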
429 
430 static int
431 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
432 {
433 	int cpu = (int)(long)hcpu;
434 
435 	switch (action) {
436 	case CPU_UP_CANCELED:
437 	case CPU_UP_CANCELED_FROZEN:
438 	case CPU_DOWN_PREPARE:
439 	case CPU_DOWN_PREPARE_FROZEN:
440 	case CPU_DEAD:
441 	case CPU_DEAD_FROZEN:
442 		hrtick_clear(cpu_rq(cpu));
443 		return NOTIFY_OK;
444 	}
445 
446 	return NOTIFY_DONE;
447 }
448 
449 static __init void init_hrtick(void)
450 {
451 	hotcpu_notifier(hotplug_hrtick, 0);
452 }
453 #else
454 /*
455  * Called to set the hrtick timer state.
456  *
457  * called with rq->lock held and irqs disabled
458  */
459 void hrtick_start(struct rq *rq, u64 delay)
460 {
461 	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
462 			HRTIMER_MODE_REL_PINNED, 0);
463 }
464 
465 static inline void init_hrtick(void)
466 {
467 }
468 #endif /* CONFIG_SMP */
469 
470 static void init_rq_hrtick(struct rq *rq)
471 {
472 #ifdef CONFIG_SMP
473 	rq->hrtick_csd_pending = 0;
474 
475 	rq->hrtick_csd.flags = 0;
476 	rq->hrtick_csd.func = __hrtick_start;
477 	rq->hrtick_csd.info = rq;
478 #endif
479 
480 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
481 	rq->hrtick_timer.function = hrtick;
482 }
483 #else	/* CONFIG_SCHED_HRTICK */
484 static inline void hrtick_clear(struct rq *rq)
485 {
486 }
487 
488 static inline void init_rq_hrtick(struct rq *rq)
489 {
490 }
491 
492 static inline void init_hrtick(void)
493 {
494 }
495 #endif	/* CONFIG_SCHED_HRTICK */
496 
497 /*
498  * resched_task - mark a task 'to be rescheduled now'.
499  *
500  * On UP this means the setting of the need_resched flag, on SMP it
501  * might also involve a cross-CPU call to trigger the scheduler on
502  * the target CPU.
503  */
504 #ifdef CONFIG_SMP
505 
506 #ifndef tsk_is_polling
507 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
508 #endif
509 
510 void resched_task(struct task_struct *p)
511 {
512 	int cpu;
513 
514 	assert_raw_spin_locked(&task_rq(p)->lock);
515 
516 	if (test_tsk_need_resched(p))
517 		return;
518 
519 	set_tsk_need_resched(p);
520 
521 	cpu = task_cpu(p);
522 	if (cpu == smp_processor_id())
523 		return;
524 
525 	/* NEED_RESCHED must be visible before we test polling */
526 	smp_mb();
527 	if (!tsk_is_polling(p))
528 		smp_send_reschedule(cpu);
529 }
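
/*
 * The polling test above is an optimization: a CPU whose idle loop
 * polls on the need_resched flag (TIF_POLLING_NRFLAG set) will notice
 * it without an IPI, so smp_send_reschedule() can be skipped.  The
 * smp_mb() orders setting TIF_NEED_RESCHED against reading the remote
 * task's polling state.
 */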
530 
531 void resched_cpu(int cpu)
532 {
533 	struct rq *rq = cpu_rq(cpu);
534 	unsigned long flags;
535 
536 	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
537 		return;
538 	resched_task(cpu_curr(cpu));
539 	raw_spin_unlock_irqrestore(&rq->lock, flags);
540 }
541 
542 #ifdef CONFIG_NO_HZ
543 /*
544  * In the semi-idle case, use the nearest busy cpu for migrating timers
545  * from an idle cpu.  This is good for power-savings.
546  *
547  * We don't do a similar optimization for a completely idle system, as
548  * selecting an idle cpu would add more delay to the timers than intended
549  * (that cpu's timer base may not be up to date wrt jiffies etc.).
550  */
551 int get_nohz_timer_target(void)
552 {
553 	int cpu = smp_processor_id();
554 	int i;
555 	struct sched_domain *sd;
556 
557 	rcu_read_lock();
558 	for_each_domain(cpu, sd) {
559 		for_each_cpu(i, sched_domain_span(sd)) {
560 			if (!idle_cpu(i)) {
561 				cpu = i;
562 				goto unlock;
563 			}
564 		}
565 	}
566 unlock:
567 	rcu_read_unlock();
568 	return cpu;
569 }
570 /*
571  * When add_timer_on() enqueues a timer into the timer wheel of an
572  * idle CPU then this timer might expire before the next timer event
573  * which is scheduled to wake up that CPU. In case of a completely
574  * idle system the next event might even be infinite time into the
575  * future. wake_up_idle_cpu() ensures that the CPU is woken up and
576  * leaves the inner idle loop so the newly added timer is taken into
577  * account when the CPU goes back to idle and evaluates the timer
578  * wheel for the next timer event.
579  */
580 void wake_up_idle_cpu(int cpu)
581 {
582 	struct rq *rq = cpu_rq(cpu);
583 
584 	if (cpu == smp_processor_id())
585 		return;
586 
587 	/*
588 	 * This is safe, as this function is called with the timer
589 	 * wheel base lock of (cpu) held. When the CPU is on the way
590 	 * to idle and has not yet set rq->curr to idle then it will
591 	 * be serialized on the timer wheel base lock and take the new
592 	 * timer into account automatically.
593 	 */
594 	if (rq->curr != rq->idle)
595 		return;
596 
597 	/*
598 	 * We can set TIF_RESCHED on the idle task of the other CPU
599 	 * locklessly. The worst case is that the other CPU runs the
600 	 * idle task through an additional NOOP schedule().
601 	 */
602 	set_tsk_need_resched(rq->idle);
603 
604 	/* NEED_RESCHED must be visible before we test polling */
605 	smp_mb();
606 	if (!tsk_is_polling(rq->idle))
607 		smp_send_reschedule(cpu);
608 }
609 
610 static inline bool got_nohz_idle_kick(void)
611 {
612 	int cpu = smp_processor_id();
613 	return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
614 }
615 
616 #else /* CONFIG_NO_HZ */
617 
618 static inline bool got_nohz_idle_kick(void)
619 {
620 	return false;
621 }
622 
623 #endif /* CONFIG_NO_HZ */
624 
625 void sched_avg_update(struct rq *rq)
626 {
627 	s64 period = sched_avg_period();
628 
629 	while ((s64)(rq->clock - rq->age_stamp) > period) {
630 		/*
631 		 * Inline assembly required to prevent the compiler
632 		 * optimising this loop into a divmod call.
633 		 * See __iter_div_u64_rem() for another example of this.
634 		 */
635 		asm("" : "+rm" (rq->age_stamp));
636 		rq->age_stamp += period;
637 		rq->rt_avg /= 2;
638 	}
639 }
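
/*
 * rq->rt_avg is thus an exponentially decaying sum: it is halved once
 * per sched_avg_period() (derived from sysctl_sched_time_avg above),
 * so recent rt/irq activity dominates older activity.
 */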
640 
641 #else /* !CONFIG_SMP */
642 void resched_task(struct task_struct *p)
643 {
644 	assert_raw_spin_locked(&task_rq(p)->lock);
645 	set_tsk_need_resched(p);
646 }
647 #endif /* CONFIG_SMP */
648 
649 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
650 			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
651 /*
652  * Iterate task_group tree rooted at *from, calling @down when first entering a
653  * node and @up when leaving it for the final time.
654  *
655  * Caller must hold rcu_lock or sufficient equivalent.
656  */
657 int walk_tg_tree_from(struct task_group *from,
658 			     tg_visitor down, tg_visitor up, void *data)
659 {
660 	struct task_group *parent, *child;
661 	int ret;
662 
663 	parent = from;
664 
665 down:
666 	ret = (*down)(parent, data);
667 	if (ret)
668 		goto out;
669 	list_for_each_entry_rcu(child, &parent->children, siblings) {
670 		parent = child;
671 		goto down;
672 
673 up:
674 		continue;
675 	}
676 	ret = (*up)(parent, data);
677 	if (ret || parent == from)
678 		goto out;
679 
680 	child = parent;
681 	parent = parent->parent;
682 	if (parent)
683 		goto up;
684 out:
685 	return ret;
686 }
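
/*
 * The goto-based loop above is an iterative depth-first walk: @down is
 * called the first time a group is visited (pre-order) and @up when
 * the walk finally leaves it (post-order), with the children lists
 * iterated under RCU.  tg_nop() below is a convenience callback for
 * walks that only need one of the two directions.
 */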
687 
688 int tg_nop(struct task_group *tg, void *data)
689 {
690 	return 0;
691 }
692 #endif
693 
694 void update_cpu_load(struct rq *this_rq);
695 
696 static void set_load_weight(struct task_struct *p)
697 {
698 	int prio = p->static_prio - MAX_RT_PRIO;
699 	struct load_weight *load = &p->se.load;
700 
701 	/*
702 	 * SCHED_IDLE tasks get minimal weight:
703 	 */
704 	if (p->policy == SCHED_IDLE) {
705 		load->weight = scale_load(WEIGHT_IDLEPRIO);
706 		load->inv_weight = WMULT_IDLEPRIO;
707 		return;
708 	}
709 
710 	load->weight = scale_load(prio_to_weight[prio]);
711 	load->inv_weight = prio_to_wmult[prio];
712 }
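
/*
 * prio_to_weight[] maps each nice level to a load weight, with nice 0
 * defined as 1024 and roughly a 1.25x weight step per nice level, so
 * one nice step is worth about 10% of CPU time relative to its
 * neighbours.  prio_to_wmult[] holds the precomputed 2^32/weight
 * inverses used to avoid divisions in the fair scheduler.
 */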
713 
714 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
715 {
716 	update_rq_clock(rq);
717 	sched_info_queued(p);
718 	p->sched_class->enqueue_task(rq, p, flags);
719 }
720 
721 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
722 {
723 	update_rq_clock(rq);
724 	sched_info_dequeued(p);
725 	p->sched_class->dequeue_task(rq, p, flags);
726 }
727 
728 void activate_task(struct rq *rq, struct task_struct *p, int flags)
729 {
730 	if (task_contributes_to_load(p))
731 		rq->nr_uninterruptible--;
732 
733 	enqueue_task(rq, p, flags);
734 }
735 
736 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
737 {
738 	if (task_contributes_to_load(p))
739 		rq->nr_uninterruptible++;
740 
741 	dequeue_task(rq, p, flags);
742 }
743 
744 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
745 
746 /*
747  * There are no locks covering the percpu hardirq/softirq time counters.
748  * They are only modified in account_system_vtime(), on the corresponding
749  * CPU with interrupts disabled, so writes are safe.
750  * They are read and saved off onto struct rq in update_rq_clock().
751  * This means another CPU may read this CPU's irq time and race with
752  * irq/account_system_vtime on this CPU. We would then see either the old
753  * or the new value, with the side effect of accounting a slice of irq time
754  * to the wrong task when an irq is in progress while we read rq->clock.
755  * That is a worthy compromise in place of locking each irq in account_system_time.
756  */
757 static DEFINE_PER_CPU(u64, cpu_hardirq_time);
758 static DEFINE_PER_CPU(u64, cpu_softirq_time);
759 
760 static DEFINE_PER_CPU(u64, irq_start_time);
761 static int sched_clock_irqtime;
762 
763 void enable_sched_clock_irqtime(void)
764 {
765 	sched_clock_irqtime = 1;
766 }
767 
768 void disable_sched_clock_irqtime(void)
769 {
770 	sched_clock_irqtime = 0;
771 }
772 
773 #ifndef CONFIG_64BIT
774 static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
775 
776 static inline void irq_time_write_begin(void)
777 {
778 	__this_cpu_inc(irq_time_seq.sequence);
779 	smp_wmb();
780 }
781 
782 static inline void irq_time_write_end(void)
783 {
784 	smp_wmb();
785 	__this_cpu_inc(irq_time_seq.sequence);
786 }
787 
788 static inline u64 irq_time_read(int cpu)
789 {
790 	u64 irq_time;
791 	unsigned seq;
792 
793 	do {
794 		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
795 		irq_time = per_cpu(cpu_softirq_time, cpu) +
796 			   per_cpu(cpu_hardirq_time, cpu);
797 	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
798 
799 	return irq_time;
800 }
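
/*
 * On 32-bit kernels a 64-bit load is not atomic, so readers use the
 * seqcount above: the writer makes the sequence odd before updating
 * and even again afterwards, and irq_time_read() retries until it sees
 * a stable, even sequence.  On 64-bit kernels a plain load suffices,
 * hence the trivial variants below.
 */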
801 #else /* CONFIG_64BIT */
802 static inline void irq_time_write_begin(void)
803 {
804 }
805 
806 static inline void irq_time_write_end(void)
807 {
808 }
809 
810 static inline u64 irq_time_read(int cpu)
811 {
812 	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
813 }
814 #endif /* CONFIG_64BIT */
815 
816 /*
817  * Called before incrementing preempt_count on {soft,}irq_enter
818  * and before decrementing preempt_count on {soft,}irq_exit.
819  */
820 void account_system_vtime(struct task_struct *curr)
821 {
822 	unsigned long flags;
823 	s64 delta;
824 	int cpu;
825 
826 	if (!sched_clock_irqtime)
827 		return;
828 
829 	local_irq_save(flags);
830 
831 	cpu = smp_processor_id();
832 	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
833 	__this_cpu_add(irq_start_time, delta);
834 
835 	irq_time_write_begin();
836 	/*
837 	 * We do not account for softirq time from ksoftirqd here.
838 	 * We want to continue accounting softirq time to the ksoftirqd thread
839 	 * in that case, so as not to confuse the scheduler with a special task
840 	 * that does not consume any time but still wants to run.
841 	 */
842 	if (hardirq_count())
843 		__this_cpu_add(cpu_hardirq_time, delta);
844 	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
845 		__this_cpu_add(cpu_softirq_time, delta);
846 
847 	irq_time_write_end();
848 	local_irq_restore(flags);
849 }
850 EXPORT_SYMBOL_GPL(account_system_vtime);
851 
852 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
853 
854 #ifdef CONFIG_PARAVIRT
855 static inline u64 steal_ticks(u64 steal)
856 {
857 	if (unlikely(steal > NSEC_PER_SEC))
858 		return div_u64(steal, TICK_NSEC);
859 
860 	return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
861 }
862 #endif
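
/*
 * steal_ticks() converts a nanosecond steal-time delta into whole
 * ticks.  For the common case (under a second of steal time)
 * __iter_div_u64_rem() divides by repeated subtraction, avoiding a
 * full 64-bit division; larger values fall back to div_u64().
 */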
863 
864 static void update_rq_clock_task(struct rq *rq, s64 delta)
865 {
866 /*
867  * In theory, the compiler should just see 0 here, and optimize out the call
868  * to sched_rt_avg_update. But I don't trust it...
869  */
870 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
871 	s64 steal = 0, irq_delta = 0;
872 #endif
873 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
874 	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
875 
876 	/*
877 	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
878 	 * this case when a previous update_rq_clock() happened inside a
879 	 * {soft,}irq region.
880 	 *
881 	 * When this happens, we stop ->clock_task and only update the
882 	 * prev_irq_time stamp to account for the part that fit, so that a next
883 	 * update will consume the rest. This ensures ->clock_task is
884 	 * monotonic.
885 	 *
886 	 * It does however cause some slight misattribution of {soft,}irq
887 	 * time, a more accurate solution would be to update the irq_time using
888 	 * the current rq->clock timestamp, except that would require using
889 	 * atomic ops.
890 	 */
891 	if (irq_delta > delta)
892 		irq_delta = delta;
893 
894 	rq->prev_irq_time += irq_delta;
895 	delta -= irq_delta;
896 #endif
897 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
898 	if (static_key_false((&paravirt_steal_rq_enabled))) {
899 		u64 st;
900 
901 		steal = paravirt_steal_clock(cpu_of(rq));
902 		steal -= rq->prev_steal_time_rq;
903 
904 		if (unlikely(steal > delta))
905 			steal = delta;
906 
907 		st = steal_ticks(steal);
908 		steal = st * TICK_NSEC;
909 
910 		rq->prev_steal_time_rq += steal;
911 
912 		delta -= steal;
913 	}
914 #endif
915 
916 	rq->clock_task += delta;
917 
918 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
919 	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
920 		sched_rt_avg_update(rq, irq_delta + steal);
921 #endif
922 }
923 
924 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
925 static int irqtime_account_hi_update(void)
926 {
927 	u64 *cpustat = kcpustat_this_cpu->cpustat;
928 	unsigned long flags;
929 	u64 latest_ns;
930 	int ret = 0;
931 
932 	local_irq_save(flags);
933 	latest_ns = this_cpu_read(cpu_hardirq_time);
934 	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
935 		ret = 1;
936 	local_irq_restore(flags);
937 	return ret;
938 }
939 
940 static int irqtime_account_si_update(void)
941 {
942 	u64 *cpustat = kcpustat_this_cpu->cpustat;
943 	unsigned long flags;
944 	u64 latest_ns;
945 	int ret = 0;
946 
947 	local_irq_save(flags);
948 	latest_ns = this_cpu_read(cpu_softirq_time);
949 	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
950 		ret = 1;
951 	local_irq_restore(flags);
952 	return ret;
953 }
954 
955 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
956 
957 #define sched_clock_irqtime	(0)
958 
959 #endif
960 
961 void sched_set_stop_task(int cpu, struct task_struct *stop)
962 {
963 	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
964 	struct task_struct *old_stop = cpu_rq(cpu)->stop;
965 
966 	if (stop) {
967 		/*
968 		 * Make it appear like a SCHED_FIFO task, it's something
969 		 * userspace knows about and won't get confused about.
970 		 *
971 		 * Also, it will make PI more or less work without too
972 		 * much confusion -- but then, stop work should not
973 		 * rely on PI working anyway.
974 		 */
975 		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
976 
977 		stop->sched_class = &stop_sched_class;
978 	}
979 
980 	cpu_rq(cpu)->stop = stop;
981 
982 	if (old_stop) {
983 		/*
984 		 * Reset it back to a normal scheduling class so that
985 		 * it can die in pieces.
986 		 */
987 		old_stop->sched_class = &rt_sched_class;
988 	}
989 }
990 
991 /*
992  * __normal_prio - return the priority that is based on the static prio
993  */
994 static inline int __normal_prio(struct task_struct *p)
995 {
996 	return p->static_prio;
997 }
998 
999 /*
1000  * Calculate the expected normal priority: i.e. priority
1001  * without taking RT-inheritance into account. Might be
1002  * boosted by interactivity modifiers. Changes upon fork,
1003  * setprio syscalls, and whenever the interactivity
1004  * estimator recalculates.
1005  */
1006 static inline int normal_prio(struct task_struct *p)
1007 {
1008 	int prio;
1009 
1010 	if (task_has_rt_policy(p))
1011 		prio = MAX_RT_PRIO-1 - p->rt_priority;
1012 	else
1013 		prio = __normal_prio(p);
1014 	return prio;
1015 }
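
/*
 * Worked example of the unified priority scale (MAX_RT_PRIO == 100):
 * a SCHED_FIFO task with rt_priority 50 gets prio 99 - 50 = 49, while
 * a SCHED_NORMAL task at nice 0 keeps its static_prio of 120 (nice
 * maps to static_prio as 120 + nice).  Lower prio values always mean
 * higher scheduling priority inside the kernel.
 */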
1016 
1017 /*
1018  * Calculate the current priority, i.e. the priority
1019  * taken into account by the scheduler. This value might
1020  * be boosted by RT tasks, or might be boosted by
1021  * interactivity modifiers. Will be RT if the task got
1022  * RT-boosted. If not then it returns p->normal_prio.
1023  */
1024 static int effective_prio(struct task_struct *p)
1025 {
1026 	p->normal_prio = normal_prio(p);
1027 	/*
1028 	 * If we are RT tasks or we were boosted to RT priority,
1029 	 * keep the priority unchanged. Otherwise, update priority
1030 	 * to the normal priority:
1031 	 */
1032 	if (!rt_prio(p->prio))
1033 		return p->normal_prio;
1034 	return p->prio;
1035 }
1036 
1037 /**
1038  * task_curr - is this task currently executing on a CPU?
1039  * @p: the task in question.
1040  */
1041 inline int task_curr(const struct task_struct *p)
1042 {
1043 	return cpu_curr(task_cpu(p)) == p;
1044 }
1045 
1046 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1047 				       const struct sched_class *prev_class,
1048 				       int oldprio)
1049 {
1050 	if (prev_class != p->sched_class) {
1051 		if (prev_class->switched_from)
1052 			prev_class->switched_from(rq, p);
1053 		p->sched_class->switched_to(rq, p);
1054 	} else if (oldprio != p->prio)
1055 		p->sched_class->prio_changed(rq, p, oldprio);
1056 }
1057 
1058 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1059 {
1060 	const struct sched_class *class;
1061 
1062 	if (p->sched_class == rq->curr->sched_class) {
1063 		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1064 	} else {
1065 		for_each_class(class) {
1066 			if (class == rq->curr->sched_class)
1067 				break;
1068 			if (class == p->sched_class) {
1069 				resched_task(rq->curr);
1070 				break;
1071 			}
1072 		}
1073 	}
1074 
1075 	/*
1076 	 * A queue event has occurred, and we're going to schedule.  In
1077 	 * this case, we can save a useless back-to-back clock update.
1078 	 */
1079 	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
1080 		rq->skip_clock_update = 1;
1081 }
1082 
1083 #ifdef CONFIG_SMP
1084 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1085 {
1086 #ifdef CONFIG_SCHED_DEBUG
1087 	/*
1088 	 * We should never call set_task_cpu() on a blocked task,
1089 	 * ttwu() will sort out the placement.
1090 	 */
1091 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1092 			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
1093 
1094 #ifdef CONFIG_LOCKDEP
1095 	/*
1096 	 * The caller should hold either p->pi_lock or rq->lock, when changing
1097 	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
1098 	 *
1099 	 * sched_move_task() holds both and thus holding either pins the cgroup,
1100 	 * see set_task_rq().
1101 	 *
1102 	 * Furthermore, all task_rq users should acquire both locks, see
1103 	 * task_rq_lock().
1104 	 */
1105 	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1106 				      lockdep_is_held(&task_rq(p)->lock)));
1107 #endif
1108 #endif
1109 
1110 	trace_sched_migrate_task(p, new_cpu);
1111 
1112 	if (task_cpu(p) != new_cpu) {
1113 		p->se.nr_migrations++;
1114 		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
1115 	}
1116 
1117 	__set_task_cpu(p, new_cpu);
1118 }
1119 
1120 struct migration_arg {
1121 	struct task_struct *task;
1122 	int dest_cpu;
1123 };
1124 
1125 static int migration_cpu_stop(void *data);
1126 
1127 /*
1128  * wait_task_inactive - wait for a thread to unschedule.
1129  *
1130  * If @match_state is nonzero, it's the @p->state value just checked and
1131  * not expected to change.  If it changes, i.e. @p might have woken up,
1132  * then return zero.  When we succeed in waiting for @p to be off its CPU,
1133  * we return a positive number (its total switch count).  If a second call
1134  * a short while later returns the same number, the caller can be sure that
1135  * @p has remained unscheduled the whole time.
1136  *
1137  * The caller must ensure that the task *will* unschedule sometime soon,
1138  * else this function might spin for a *long* time. This function can't
1139  * be called with interrupts off, or it may introduce deadlock with
1140  * smp_call_function() if an IPI is sent by the same process we are
1141  * waiting to become inactive.
1142  */
1143 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1144 {
1145 	unsigned long flags;
1146 	int running, on_rq;
1147 	unsigned long ncsw;
1148 	struct rq *rq;
1149 
1150 	for (;;) {
1151 		/*
1152 		 * We do the initial early heuristics without holding
1153 		 * any task-queue locks at all. We'll only try to get
1154 		 * the runqueue lock when things look like they will
1155 		 * work out!
1156 		 */
1157 		rq = task_rq(p);
1158 
1159 		/*
1160 		 * If the task is actively running on another CPU
1161 		 * still, just relax and busy-wait without holding
1162 		 * any locks.
1163 		 *
1164 		 * NOTE! Since we don't hold any locks, it's not
1165 		 * even sure that "rq" stays as the right runqueue!
1166 		 * But we don't care, since "task_running()" will
1167 		 * return false if the runqueue has changed and p
1168 		 * is actually now running somewhere else!
1169 		 */
1170 		while (task_running(rq, p)) {
1171 			if (match_state && unlikely(p->state != match_state))
1172 				return 0;
1173 			cpu_relax();
1174 		}
1175 
1176 		/*
1177 		 * Ok, time to look more closely! We need the rq
1178 		 * lock now, to be *sure*. If we're wrong, we'll
1179 		 * just go back and repeat.
1180 		 */
1181 		rq = task_rq_lock(p, &flags);
1182 		trace_sched_wait_task(p);
1183 		running = task_running(rq, p);
1184 		on_rq = p->on_rq;
1185 		ncsw = 0;
1186 		if (!match_state || p->state == match_state)
1187 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1188 		task_rq_unlock(rq, p, &flags);
1189 
1190 		/*
1191 		 * If it changed from the expected state, bail out now.
1192 		 */
1193 		if (unlikely(!ncsw))
1194 			break;
1195 
1196 		/*
1197 		 * Was it really running after all now that we
1198 		 * checked with the proper locks actually held?
1199 		 *
1200 		 * Oops. Go back and try again..
1201 		 */
1202 		if (unlikely(running)) {
1203 			cpu_relax();
1204 			continue;
1205 		}
1206 
1207 		/*
1208 		 * It's not enough that it's not actively running,
1209 		 * it must be off the runqueue _entirely_, and not
1210 		 * preempted!
1211 		 *
1212 		 * So if it was still runnable (but just not actively
1213 		 * running right now), it's preempted, and we should
1214 		 * yield - it could be a while.
1215 		 */
1216 		if (unlikely(on_rq)) {
1217 			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1218 
1219 			set_current_state(TASK_UNINTERRUPTIBLE);
1220 			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1221 			continue;
1222 		}
1223 
1224 		/*
1225 		 * Ahh, all good. It wasn't running, and it wasn't
1226 		 * runnable, which means that it will never become
1227 		 * running in the future either. We're all done!
1228 		 */
1229 		break;
1230 	}
1231 
1232 	return ncsw;
1233 }
1234 
1235 /***
1236  * kick_process - kick a running thread to enter/exit the kernel
1237  * @p: the to-be-kicked thread
1238  *
1239  * Cause a process which is running on another CPU to enter
1240  * kernel-mode, without any delay. (to get signals handled.)
1241  *
1242  * NOTE: this function doesn't have to take the runqueue lock,
1243  * because all it wants to ensure is that the remote task enters
1244  * the kernel. If the IPI races and the task has been migrated
1245  * to another CPU then no harm is done and the purpose has been
1246  * achieved as well.
1247  */
1248 void kick_process(struct task_struct *p)
1249 {
1250 	int cpu;
1251 
1252 	preempt_disable();
1253 	cpu = task_cpu(p);
1254 	if ((cpu != smp_processor_id()) && task_curr(p))
1255 		smp_send_reschedule(cpu);
1256 	preempt_enable();
1257 }
1258 EXPORT_SYMBOL_GPL(kick_process);
1259 #endif /* CONFIG_SMP */
1260 
1261 #ifdef CONFIG_SMP
1262 /*
1263  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
1264  */
1265 static int select_fallback_rq(int cpu, struct task_struct *p)
1266 {
1267 	int dest_cpu;
1268 	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
1269 
1270 	/* Look for allowed, online CPU in same node. */
1271 	for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
1272 		if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1273 			return dest_cpu;
1274 
1275 	/* Any allowed, online CPU? */
1276 	dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
1277 	if (dest_cpu < nr_cpu_ids)
1278 		return dest_cpu;
1279 
1280 	/* No more Mr. Nice Guy. */
1281 	dest_cpu = cpuset_cpus_allowed_fallback(p);
1282 	/*
1283 	 * Don't tell them about moving exiting tasks or
1284 	 * kernel threads (both mm NULL), since they never
1285 	 * leave the kernel.
1286 	 */
1287 	if (p->mm && printk_ratelimit()) {
1288 		printk_sched("process %d (%s) no longer affine to cpu%d\n",
1289 				task_pid_nr(p), p->comm, cpu);
1290 	}
1291 
1292 	return dest_cpu;
1293 }
1294 
1295 /*
1296  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1297  */
1298 static inline
1299 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
1300 {
1301 	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
1302 
1303 	/*
1304 	 * In order not to call set_task_cpu() on a blocking task we need
1305 	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
1306 	 * cpu.
1307 	 *
1308 	 * Since this is common to all placement strategies, this lives here.
1309 	 *
1310 	 * [ this allows ->select_task() to simply return task_cpu(p) and
1311 	 *   not worry about this generic constraint ]
1312 	 */
1313 	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1314 		     !cpu_online(cpu)))
1315 		cpu = select_fallback_rq(task_cpu(p), p);
1316 
1317 	return cpu;
1318 }
1319 
1320 static void update_avg(u64 *avg, u64 sample)
1321 {
1322 	s64 diff = sample - *avg;
1323 	*avg += diff >> 3;
1324 }
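
/*
 * update_avg() is a cheap exponentially weighted moving average with a
 * 1/8 weight for each new sample:  avg += (sample - avg) / 8.  It is
 * used below to track rq->avg_idle across wakeups.
 */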
1325 #endif
1326 
1327 static void
1328 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1329 {
1330 #ifdef CONFIG_SCHEDSTATS
1331 	struct rq *rq = this_rq();
1332 
1333 #ifdef CONFIG_SMP
1334 	int this_cpu = smp_processor_id();
1335 
1336 	if (cpu == this_cpu) {
1337 		schedstat_inc(rq, ttwu_local);
1338 		schedstat_inc(p, se.statistics.nr_wakeups_local);
1339 	} else {
1340 		struct sched_domain *sd;
1341 
1342 		schedstat_inc(p, se.statistics.nr_wakeups_remote);
1343 		rcu_read_lock();
1344 		for_each_domain(this_cpu, sd) {
1345 			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1346 				schedstat_inc(sd, ttwu_wake_remote);
1347 				break;
1348 			}
1349 		}
1350 		rcu_read_unlock();
1351 	}
1352 
1353 	if (wake_flags & WF_MIGRATED)
1354 		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1355 
1356 #endif /* CONFIG_SMP */
1357 
1358 	schedstat_inc(rq, ttwu_count);
1359 	schedstat_inc(p, se.statistics.nr_wakeups);
1360 
1361 	if (wake_flags & WF_SYNC)
1362 		schedstat_inc(p, se.statistics.nr_wakeups_sync);
1363 
1364 #endif /* CONFIG_SCHEDSTATS */
1365 }
1366 
1367 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1368 {
1369 	activate_task(rq, p, en_flags);
1370 	p->on_rq = 1;
1371 
1372 	/* if a worker is waking up, notify workqueue */
1373 	if (p->flags & PF_WQ_WORKER)
1374 		wq_worker_waking_up(p, cpu_of(rq));
1375 }
1376 
1377 /*
1378  * Mark the task runnable and perform wakeup-preemption.
1379  */
1380 static void
1381 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1382 {
1383 	trace_sched_wakeup(p, true);
1384 	check_preempt_curr(rq, p, wake_flags);
1385 
1386 	p->state = TASK_RUNNING;
1387 #ifdef CONFIG_SMP
1388 	if (p->sched_class->task_woken)
1389 		p->sched_class->task_woken(rq, p);
1390 
1391 	if (rq->idle_stamp) {
1392 		u64 delta = rq->clock - rq->idle_stamp;
1393 		u64 max = 2*sysctl_sched_migration_cost;
1394 
1395 		if (delta > max)
1396 			rq->avg_idle = max;
1397 		else
1398 			update_avg(&rq->avg_idle, delta);
1399 		rq->idle_stamp = 0;
1400 	}
1401 #endif
1402 }
1403 
1404 static void
1405 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1406 {
1407 #ifdef CONFIG_SMP
1408 	if (p->sched_contributes_to_load)
1409 		rq->nr_uninterruptible--;
1410 #endif
1411 
1412 	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1413 	ttwu_do_wakeup(rq, p, wake_flags);
1414 }
1415 
1416 /*
1417  * Called in case the task @p isn't fully descheduled from its runqueue;
1418  * in this case we must do a remote wakeup. It's a 'light' wakeup though,
1419  * since all we need to do is flip p->state to TASK_RUNNING -- the task
1420  * is still ->on_rq.
1421  */
1422 static int ttwu_remote(struct task_struct *p, int wake_flags)
1423 {
1424 	struct rq *rq;
1425 	int ret = 0;
1426 
1427 	rq = __task_rq_lock(p);
1428 	if (p->on_rq) {
1429 		ttwu_do_wakeup(rq, p, wake_flags);
1430 		ret = 1;
1431 	}
1432 	__task_rq_unlock(rq);
1433 
1434 	return ret;
1435 }
1436 
1437 #ifdef CONFIG_SMP
1438 static void sched_ttwu_pending(void)
1439 {
1440 	struct rq *rq = this_rq();
1441 	struct llist_node *llist = llist_del_all(&rq->wake_list);
1442 	struct task_struct *p;
1443 
1444 	raw_spin_lock(&rq->lock);
1445 
1446 	while (llist) {
1447 		p = llist_entry(llist, struct task_struct, wake_entry);
1448 		llist = llist_next(llist);
1449 		ttwu_do_activate(rq, p, 0);
1450 	}
1451 
1452 	raw_spin_unlock(&rq->lock);
1453 }
1454 
1455 void scheduler_ipi(void)
1456 {
1457 	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1458 		return;
1459 
1460 	/*
1461 	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
1462 	 * traditionally all their work was done from the interrupt return
1463 	 * path. Now that we actually do some work, we need to make sure
1464 	 * we do call them.
1465 	 *
1466 	 * Some archs already do call them, luckily irq_enter/exit nest
1467 	 * properly.
1468 	 *
1469 	 * Arguably we should visit all archs and update all handlers,
1470 	 * however a fair share of IPIs are still resched only so this would
1471 	 * somewhat pessimize the simple resched case.
1472 	 */
1473 	irq_enter();
1474 	sched_ttwu_pending();
1475 
1476 	/*
1477 	 * Check if someone kicked us for doing the nohz idle load balance.
1478 	 */
1479 	if (unlikely(got_nohz_idle_kick() && !need_resched())) {
1480 		this_rq()->idle_balance = 1;
1481 		raise_softirq_irqoff(SCHED_SOFTIRQ);
1482 	}
1483 	irq_exit();
1484 }
1485 
1486 static void ttwu_queue_remote(struct task_struct *p, int cpu)
1487 {
1488 	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1489 		smp_send_reschedule(cpu);
1490 }
1491 
1492 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1493 static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
1494 {
1495 	struct rq *rq;
1496 	int ret = 0;
1497 
1498 	rq = __task_rq_lock(p);
1499 	if (p->on_cpu) {
1500 		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1501 		ttwu_do_wakeup(rq, p, wake_flags);
1502 		ret = 1;
1503 	}
1504 	__task_rq_unlock(rq);
1505 
1506 	return ret;
1507 
1508 }
1509 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1510 
1511 bool cpus_share_cache(int this_cpu, int that_cpu)
1512 {
1513 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1514 }
1515 #endif /* CONFIG_SMP */
1516 
1517 static void ttwu_queue(struct task_struct *p, int cpu)
1518 {
1519 	struct rq *rq = cpu_rq(cpu);
1520 
1521 #if defined(CONFIG_SMP)
1522 	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1523 		sched_clock_cpu(cpu); /* sync clocks x-cpu */
1524 		ttwu_queue_remote(p, cpu);
1525 		return;
1526 	}
1527 #endif
1528 
1529 	raw_spin_lock(&rq->lock);
1530 	ttwu_do_activate(rq, p, 0);
1531 	raw_spin_unlock(&rq->lock);
1532 }
1533 
1534 /**
1535  * try_to_wake_up - wake up a thread
1536  * @p: the thread to be awakened
1537  * @state: the mask of task states that can be woken
1538  * @wake_flags: wake modifier flags (WF_*)
1539  *
1540  * Put it on the run-queue if it's not already there. The "current"
1541  * thread is always on the run-queue (except when the actual
1542  * re-schedule is in progress), and as such you're allowed to do
1543  * the simpler "current->state = TASK_RUNNING" to mark yourself
1544  * runnable without the overhead of this.
1545  *
1546  * Returns %true if @p was woken up, %false if it was already running
1547  * or @state didn't match @p's state.
1548  */
1549 static int
1550 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1551 {
1552 	unsigned long flags;
1553 	int cpu, success = 0;
1554 
1555 	smp_wmb();
1556 	raw_spin_lock_irqsave(&p->pi_lock, flags);
1557 	if (!(p->state & state))
1558 		goto out;
1559 
1560 	success = 1; /* we're going to change ->state */
1561 	cpu = task_cpu(p);
1562 
1563 	if (p->on_rq && ttwu_remote(p, wake_flags))
1564 		goto stat;
1565 
1566 #ifdef CONFIG_SMP
1567 	/*
1568 	 * If the owning (remote) cpu is still in the middle of schedule() with
1569 	 * this task as prev, wait until it's done referencing the task.
1570 	 */
1571 	while (p->on_cpu) {
1572 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1573 		/*
1574 		 * In case the architecture enables interrupts in
1575 		 * context_switch(), we cannot busy wait, since that
1576 		 * would lead to deadlocks when an interrupt hits and
1577 		 * tries to wake up @prev. So bail and do a complete
1578 		 * remote wakeup.
1579 		 */
1580 		if (ttwu_activate_remote(p, wake_flags))
1581 			goto stat;
1582 #else
1583 		cpu_relax();
1584 #endif
1585 	}
1586 	/*
1587 	 * Pairs with the smp_wmb() in finish_lock_switch().
1588 	 */
1589 	smp_rmb();
1590 
1591 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
1592 	p->state = TASK_WAKING;
1593 
1594 	if (p->sched_class->task_waking)
1595 		p->sched_class->task_waking(p);
1596 
1597 	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
1598 	if (task_cpu(p) != cpu) {
1599 		wake_flags |= WF_MIGRATED;
1600 		set_task_cpu(p, cpu);
1601 	}
1602 #endif /* CONFIG_SMP */
1603 
1604 	ttwu_queue(p, cpu);
1605 stat:
1606 	ttwu_stat(p, cpu, wake_flags);
1607 out:
1608 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1609 
1610 	return success;
1611 }
1612 
1613 /**
1614  * try_to_wake_up_local - try to wake up a local task with rq lock held
1615  * @p: the thread to be awakened
1616  *
1617  * Put @p on the run-queue if it's not already there. The caller must
1618  * ensure that this_rq() is locked, @p is bound to this_rq() and not
1619  * the current task.
1620  */
1621 static void try_to_wake_up_local(struct task_struct *p)
1622 {
1623 	struct rq *rq = task_rq(p);
1624 
1625 	BUG_ON(rq != this_rq());
1626 	BUG_ON(p == current);
1627 	lockdep_assert_held(&rq->lock);
1628 
1629 	if (!raw_spin_trylock(&p->pi_lock)) {
1630 		raw_spin_unlock(&rq->lock);
1631 		raw_spin_lock(&p->pi_lock);
1632 		raw_spin_lock(&rq->lock);
1633 	}
1634 
1635 	if (!(p->state & TASK_NORMAL))
1636 		goto out;
1637 
1638 	if (!p->on_rq)
1639 		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1640 
1641 	ttwu_do_wakeup(rq, p, 0);
1642 	ttwu_stat(p, smp_processor_id(), 0);
1643 out:
1644 	raw_spin_unlock(&p->pi_lock);
1645 }
1646 
1647 /**
1648  * wake_up_process - Wake up a specific process
1649  * @p: The process to be woken up.
1650  *
1651  * Attempt to wake up the nominated process and move it to the set of runnable
1652  * processes.  Returns 1 if the process was woken up, 0 if it was already
1653  * running.
1654  *
1655  * It may be assumed that this function implies a write memory barrier before
1656  * changing the task state if and only if any tasks are woken up.
1657  */
1658 int wake_up_process(struct task_struct *p)
1659 {
1660 	return try_to_wake_up(p, TASK_ALL, 0);
1661 }
1662 EXPORT_SYMBOL(wake_up_process);
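
/*
 * Typical use (sketch): the sleeping side does
 *
 *	set_current_state(TASK_INTERRUPTIBLE);
 *	if (!condition)
 *		schedule();
 *	__set_current_state(TASK_RUNNING);
 *
 * while the waking side sets the condition and then calls
 * wake_up_process(task).  The state mask passed to try_to_wake_up()
 * ensures only sleeps of a matching kind are woken.
 */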
1663 
1664 int wake_up_state(struct task_struct *p, unsigned int state)
1665 {
1666 	return try_to_wake_up(p, state, 0);
1667 }
1668 
1669 /*
1670  * Perform scheduler related setup for a newly forked process p.
1671  * p is forked by current.
1672  *
1673  * __sched_fork() is basic setup used by init_idle() too:
1674  */
1675 static void __sched_fork(struct task_struct *p)
1676 {
1677 	p->on_rq			= 0;
1678 
1679 	p->se.on_rq			= 0;
1680 	p->se.exec_start		= 0;
1681 	p->se.sum_exec_runtime		= 0;
1682 	p->se.prev_sum_exec_runtime	= 0;
1683 	p->se.nr_migrations		= 0;
1684 	p->se.vruntime			= 0;
1685 	INIT_LIST_HEAD(&p->se.group_node);
1686 
1687 #ifdef CONFIG_SCHEDSTATS
1688 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1689 #endif
1690 
1691 	INIT_LIST_HEAD(&p->rt.run_list);
1692 
1693 #ifdef CONFIG_PREEMPT_NOTIFIERS
1694 	INIT_HLIST_HEAD(&p->preempt_notifiers);
1695 #endif
1696 }
1697 
1698 /*
1699  * fork()/clone()-time setup:
1700  */
1701 void sched_fork(struct task_struct *p)
1702 {
1703 	unsigned long flags;
1704 	int cpu = get_cpu();
1705 
1706 	__sched_fork(p);
1707 	/*
1708 	 * We mark the process as running here. This guarantees that
1709 	 * nobody will actually run it, and a signal or other external
1710 	 * event cannot wake it up and insert it on the runqueue either.
1711 	 */
1712 	p->state = TASK_RUNNING;
1713 
1714 	/*
1715 	 * Make sure we do not leak PI boosting priority to the child.
1716 	 */
1717 	p->prio = current->normal_prio;
1718 
1719 	/*
1720 	 * Revert to default priority/policy on fork if requested.
1721 	 */
1722 	if (unlikely(p->sched_reset_on_fork)) {
1723 		if (task_has_rt_policy(p)) {
1724 			p->policy = SCHED_NORMAL;
1725 			p->static_prio = NICE_TO_PRIO(0);
1726 			p->rt_priority = 0;
1727 		} else if (PRIO_TO_NICE(p->static_prio) < 0)
1728 			p->static_prio = NICE_TO_PRIO(0);
1729 
1730 		p->prio = p->normal_prio = __normal_prio(p);
1731 		set_load_weight(p);
1732 
1733 		/*
1734 		 * We don't need the reset flag anymore after the fork. It has
1735 		 * fulfilled its duty:
1736 		 */
1737 		p->sched_reset_on_fork = 0;
1738 	}
1739 
1740 	if (!rt_prio(p->prio))
1741 		p->sched_class = &fair_sched_class;
1742 
1743 	if (p->sched_class->task_fork)
1744 		p->sched_class->task_fork(p);
1745 
1746 	/*
1747 	 * The child is not yet in the pid-hash so no cgroup attach races,
1748 	 * and the cgroup is pinned to this child because cgroup_fork()
1749 	 * is run before sched_fork().
1750 	 *
1751 	 * Silence PROVE_RCU.
1752 	 */
1753 	raw_spin_lock_irqsave(&p->pi_lock, flags);
1754 	set_task_cpu(p, cpu);
1755 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1756 
1757 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1758 	if (likely(sched_info_on()))
1759 		memset(&p->sched_info, 0, sizeof(p->sched_info));
1760 #endif
1761 #if defined(CONFIG_SMP)
1762 	p->on_cpu = 0;
1763 #endif
1764 #ifdef CONFIG_PREEMPT_COUNT
1765 	/* Want to start with kernel preemption disabled. */
1766 	task_thread_info(p)->preempt_count = 1;
1767 #endif
1768 #ifdef CONFIG_SMP
1769 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
1770 #endif
1771 
1772 	put_cpu();
1773 }
1774 
1775 /*
1776  * wake_up_new_task - wake up a newly created task for the first time.
1777  *
1778  * This function will do some initial scheduler statistics housekeeping
1779  * that must be done for every newly created context, then puts the task
1780  * on the runqueue and wakes it.
1781  */
1782 void wake_up_new_task(struct task_struct *p)
1783 {
1784 	unsigned long flags;
1785 	struct rq *rq;
1786 
1787 	raw_spin_lock_irqsave(&p->pi_lock, flags);
1788 #ifdef CONFIG_SMP
1789 	/*
1790 	 * Fork balancing, do it here and not earlier because:
1791 	 *  - cpus_allowed can change in the fork path
1792 	 *  - any previously selected cpu might disappear through hotplug
1793 	 */
1794 	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1795 #endif
1796 
1797 	rq = __task_rq_lock(p);
1798 	activate_task(rq, p, 0);
1799 	p->on_rq = 1;
1800 	trace_sched_wakeup_new(p, true);
1801 	check_preempt_curr(rq, p, WF_FORK);
1802 #ifdef CONFIG_SMP
1803 	if (p->sched_class->task_woken)
1804 		p->sched_class->task_woken(rq, p);
1805 #endif
1806 	task_rq_unlock(rq, p, &flags);
1807 }
1808 
1809 #ifdef CONFIG_PREEMPT_NOTIFIERS
1810 
1811 /**
1812  * preempt_notifier_register - tell me when current is being preempted & rescheduled
1813  * @notifier: notifier struct to register
1814  */
1815 void preempt_notifier_register(struct preempt_notifier *notifier)
1816 {
1817 	hlist_add_head(&notifier->link, &current->preempt_notifiers);
1818 }
1819 EXPORT_SYMBOL_GPL(preempt_notifier_register);
1820 
1821 /**
1822  * preempt_notifier_unregister - no longer interested in preemption notifications
1823  * @notifier: notifier struct to unregister
1824  *
1825  * This is safe to call from within a preemption notifier.
1826  */
1827 void preempt_notifier_unregister(struct preempt_notifier *notifier)
1828 {
1829 	hlist_del(&notifier->link);
1830 }
1831 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1832 
1833 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1834 {
1835 	struct preempt_notifier *notifier;
1836 	struct hlist_node *node;
1837 
1838 	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1839 		notifier->ops->sched_in(notifier, raw_smp_processor_id());
1840 }
1841 
1842 static void
1843 fire_sched_out_preempt_notifiers(struct task_struct *curr,
1844 				 struct task_struct *next)
1845 {
1846 	struct preempt_notifier *notifier;
1847 	struct hlist_node *node;
1848 
1849 	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1850 		notifier->ops->sched_out(notifier, next);
1851 }
1852 
1853 #else /* !CONFIG_PREEMPT_NOTIFIERS */
1854 
1855 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1856 {
1857 }
1858 
1859 static void
1860 fire_sched_out_preempt_notifiers(struct task_struct *curr,
1861 				 struct task_struct *next)
1862 {
1863 }
1864 
1865 #endif /* CONFIG_PREEMPT_NOTIFIERS */
1866 
1867 /**
1868  * prepare_task_switch - prepare to switch tasks
1869  * @rq: the runqueue preparing to switch
1870  * @prev: the current task that is being switched out
1871  * @next: the task we are going to switch to.
1872  *
1873  * This is called with the rq lock held and interrupts off. It must
1874  * be paired with a subsequent finish_task_switch after the context
1875  * switch.
1876  *
1877  * prepare_task_switch sets up locking and calls architecture specific
1878  * hooks.
1879  */
1880 static inline void
1881 prepare_task_switch(struct rq *rq, struct task_struct *prev,
1882 		    struct task_struct *next)
1883 {
1884 	sched_info_switch(prev, next);
1885 	perf_event_task_sched_out(prev, next);
1886 	fire_sched_out_preempt_notifiers(prev, next);
1887 	prepare_lock_switch(rq, next);
1888 	prepare_arch_switch(next);
1889 	trace_sched_switch(prev, next);
1890 }
1891 
1892 /**
1893  * finish_task_switch - clean up after a task-switch
1894  * @rq: runqueue associated with task-switch
1895  * @prev: the thread we just switched away from.
1896  *
1897  * finish_task_switch must be called after the context switch, paired
1898  * with a prepare_task_switch call before the context switch.
1899  * finish_task_switch will reconcile locking set up by prepare_task_switch,
1900  * and do any other architecture-specific cleanup actions.
1901  *
1902  * Note that we may have delayed dropping an mm in context_switch(). If
1903  * so, we finish that here outside of the runqueue lock. (Doing it
1904  * with the lock held can cause deadlocks; see schedule() for
1905  * details.)
1906  */
1907 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1908 	__releases(rq->lock)
1909 {
1910 	struct mm_struct *mm = rq->prev_mm;
1911 	long prev_state;
1912 
1913 	rq->prev_mm = NULL;
1914 
1915 	/*
1916 	 * A task struct has one reference for the use as "current".
1917 	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1918 	 * schedule one last time. The schedule call will never return, and
1919 	 * the scheduled task must drop that reference.
1920 	 * The test for TASK_DEAD must occur while the runqueue locks are
1921 	 * still held, otherwise prev could be scheduled on another cpu, die
1922 	 * there before we look at prev->state, and then the reference would
1923 	 * be dropped twice.
1924 	 *		Manfred Spraul <manfred@colorfullife.com>
1925 	 */
1926 	prev_state = prev->state;
1927 	finish_arch_switch(prev);
1928 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1929 	local_irq_disable();
1930 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1931 	perf_event_task_sched_in(prev, current);
1932 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1933 	local_irq_enable();
1934 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1935 	finish_lock_switch(rq, prev);
1936 
1937 	fire_sched_in_preempt_notifiers(current);
1938 	if (mm)
1939 		mmdrop(mm);
1940 	if (unlikely(prev_state == TASK_DEAD)) {
1941 		/*
1942 		 * Remove function-return probe instances associated with this
1943 		 * task and put them back on the free list.
1944 		 */
1945 		kprobe_flush_task(prev);
1946 		put_task_struct(prev);
1947 	}
1948 }
1949 
1950 #ifdef CONFIG_SMP
1951 
1952 /* assumes rq->lock is held */
1953 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
1954 {
1955 	if (prev->sched_class->pre_schedule)
1956 		prev->sched_class->pre_schedule(rq, prev);
1957 }
1958 
1959 /* rq->lock is NOT held, but preemption is disabled */
1960 static inline void post_schedule(struct rq *rq)
1961 {
1962 	if (rq->post_schedule) {
1963 		unsigned long flags;
1964 
1965 		raw_spin_lock_irqsave(&rq->lock, flags);
1966 		if (rq->curr->sched_class->post_schedule)
1967 			rq->curr->sched_class->post_schedule(rq);
1968 		raw_spin_unlock_irqrestore(&rq->lock, flags);
1969 
1970 		rq->post_schedule = 0;
1971 	}
1972 }
1973 
1974 #else
1975 
1976 static inline void pre_schedule(struct rq *rq, struct task_struct *p)
1977 {
1978 }
1979 
1980 static inline void post_schedule(struct rq *rq)
1981 {
1982 }
1983 
1984 #endif
1985 
1986 /**
1987  * schedule_tail - first thing a freshly forked thread must call.
1988  * @prev: the thread we just switched away from.
1989  */
1990 asmlinkage void schedule_tail(struct task_struct *prev)
1991 	__releases(rq->lock)
1992 {
1993 	struct rq *rq = this_rq();
1994 
1995 	finish_task_switch(rq, prev);
1996 
1997 	/*
1998 	 * FIXME: do we need to worry about rq being invalidated by the
1999 	 * task_switch?
2000 	 */
2001 	post_schedule(rq);
2002 
2003 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
2004 	/* In this case, finish_task_switch does not reenable preemption */
2005 	preempt_enable();
2006 #endif
2007 	if (current->set_child_tid)
2008 		put_user(task_pid_vnr(current), current->set_child_tid);
2009 }
2010 
2011 /*
2012  * context_switch - switch to the new MM and the new
2013  * thread's register state.
2014  */
2015 static inline void
2016 context_switch(struct rq *rq, struct task_struct *prev,
2017 	       struct task_struct *next)
2018 {
2019 	struct mm_struct *mm, *oldmm;
2020 
2021 	prepare_task_switch(rq, prev, next);
2022 
2023 	mm = next->mm;
2024 	oldmm = prev->active_mm;
2025 	/*
2026 	 * For paravirt, this is coupled with an exit in switch_to to
2027 	 * combine the page table reload and the switch backend into
2028 	 * one hypercall.
2029 	 */
2030 	arch_start_context_switch(prev);
2031 
2032 	if (!mm) {
2033 		next->active_mm = oldmm;
2034 		atomic_inc(&oldmm->mm_count);
2035 		enter_lazy_tlb(oldmm, next);
2036 	} else
2037 		switch_mm(oldmm, mm, next);
2038 
2039 	if (!prev->mm) {
2040 		prev->active_mm = NULL;
2041 		rq->prev_mm = oldmm;
2042 	}
2043 	/*
2044 	 * The runqueue lock will be released by the next
2045 	 * task (which is an invalid locking op but in the case
2046 	 * of the scheduler it's an obvious special-case), so we
2047 	 * do an early lockdep release here:
2048 	 */
2049 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
2050 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2051 #endif
2052 
2053 	/* Here we just switch the register state and the stack. */
2054 	switch_to(prev, next, prev);
2055 
2056 	barrier();
2057 	/*
2058 	 * this_rq must be evaluated again because prev may have moved
2059 	 * CPUs since it called schedule(), thus the 'rq' on its stack
2060 	 * frame will be invalid.
2061 	 */
2062 	finish_task_switch(this_rq(), prev);
2063 }
2064 
2065 /*
2066  * nr_running, nr_uninterruptible and nr_context_switches:
2067  *
2068  * externally visible scheduler statistics: current number of runnable
2069  * threads, current number of uninterruptible-sleeping threads, total
2070  * number of context switches performed since bootup.
2071  */
2072 unsigned long nr_running(void)
2073 {
2074 	unsigned long i, sum = 0;
2075 
2076 	for_each_online_cpu(i)
2077 		sum += cpu_rq(i)->nr_running;
2078 
2079 	return sum;
2080 }
2081 
2082 unsigned long nr_uninterruptible(void)
2083 {
2084 	unsigned long i, sum = 0;
2085 
2086 	for_each_possible_cpu(i)
2087 		sum += cpu_rq(i)->nr_uninterruptible;
2088 
2089 	/*
2090 	 * Since we read the counters locklessly, the sum might be slightly
2091 	 * inaccurate. Do not allow it to go below zero though:
2092 	 */
2093 	if (unlikely((long)sum < 0))
2094 		sum = 0;
2095 
2096 	return sum;
2097 }
2098 
2099 unsigned long long nr_context_switches(void)
2100 {
2101 	int i;
2102 	unsigned long long sum = 0;
2103 
2104 	for_each_possible_cpu(i)
2105 		sum += cpu_rq(i)->nr_switches;
2106 
2107 	return sum;
2108 }
2109 
2110 unsigned long nr_iowait(void)
2111 {
2112 	unsigned long i, sum = 0;
2113 
2114 	for_each_possible_cpu(i)
2115 		sum += atomic_read(&cpu_rq(i)->nr_iowait);
2116 
2117 	return sum;
2118 }
2119 
2120 unsigned long nr_iowait_cpu(int cpu)
2121 {
2122 	struct rq *this = cpu_rq(cpu);
2123 	return atomic_read(&this->nr_iowait);
2124 }
2125 
2126 unsigned long this_cpu_load(void)
2127 {
2128 	struct rq *this = this_rq();
2129 	return this->cpu_load[0];
2130 }
2131 
2132 
2133 /* Variables and functions for calc_load */
2134 static atomic_long_t calc_load_tasks;
2135 static unsigned long calc_load_update;
2136 unsigned long avenrun[3];
2137 EXPORT_SYMBOL(avenrun);
2138 
2139 static long calc_load_fold_active(struct rq *this_rq)
2140 {
2141 	long nr_active, delta = 0;
2142 
2143 	nr_active = this_rq->nr_running;
2144 	nr_active += (long) this_rq->nr_uninterruptible;
2145 
2146 	if (nr_active != this_rq->calc_load_active) {
2147 		delta = nr_active - this_rq->calc_load_active;
2148 		this_rq->calc_load_active = nr_active;
2149 	}
2150 
2151 	return delta;
2152 }
2153 
2154 static unsigned long
2155 calc_load(unsigned long load, unsigned long exp, unsigned long active)
2156 {
2157 	load *= exp;
2158 	load += active * (FIXED_1 - exp);
2159 	load += 1UL << (FSHIFT - 1);
2160 	return load >> FSHIFT;
2161 }
2162 
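/*
 * A worked example of the fixed-point update above, assuming the usual
 * values FSHIFT == 11, FIXED_1 == 2048 and EXP_1 == 1884 (~= 2048/e^(5s/1min))
 * from <linux/sched.h>; the numbers are illustrative, not normative.
 *
 * Start with avenrun[0] == 0 and two runnable tasks, so
 * active == 2 * FIXED_1 == 4096:
 *
 *   load  = 0 * 1884               = 0
 *   load += 4096 * (2048 - 1884)   = 671744
 *   load += 1 << 10                = 672768   (round to nearest)
 *   load >>= 11                    = 328      (~0.16 * FIXED_1)
 *
 * i.e. one LOAD_FREQ period moves the 1-minute average from 0.00 to about
 * 0.16, roughly 8% of the way towards 2.00, as an exponential moving
 * average should.
 */
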
2163 #ifdef CONFIG_NO_HZ
2164 /*
2165  * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
2166  *
2167  * When making the ILB scale, we should try to pull this in as well.
2168  */
2169 static atomic_long_t calc_load_tasks_idle;
2170 
2171 void calc_load_account_idle(struct rq *this_rq)
2172 {
2173 	long delta;
2174 
2175 	delta = calc_load_fold_active(this_rq);
2176 	if (delta)
2177 		atomic_long_add(delta, &calc_load_tasks_idle);
2178 }
2179 
2180 static long calc_load_fold_idle(void)
2181 {
2182 	long delta = 0;
2183 
2184 	/*
2185 	 * It's got a race, but we don't care:
2186 	 */
2187 	if (atomic_long_read(&calc_load_tasks_idle))
2188 		delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
2189 
2190 	return delta;
2191 }
2192 
2193 /**
2194  * fixed_power_int - compute: x^n, in O(log n) time
2195  *
2196  * @x:         base of the power
2197  * @frac_bits: fractional bits of @x
2198  * @n:         power to raise @x to.
2199  *
2200  * By exploiting the relation between the definition of the natural power
2201  * function: x^n := x*x*...*x (x multiplied by itself for n times), and
2202  * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
2203  * (where: n_i \elem {0, 1}, the binary vector representing n),
2204  * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
2205  * of course trivially computable in O(log_2 n), the length of our binary
2206  * vector.
2207  */
2208 static unsigned long
2209 fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
2210 {
2211 	unsigned long result = 1UL << frac_bits;
2212 
2213 	if (n) for (;;) {
2214 		if (n & 1) {
2215 			result *= x;
2216 			result += 1UL << (frac_bits - 1);
2217 			result >>= frac_bits;
2218 		}
2219 		n >>= 1;
2220 		if (!n)
2221 			break;
2222 		x *= x;
2223 		x += 1UL << (frac_bits - 1);
2224 		x >>= frac_bits;
2225 	}
2226 
2227 	return result;
2228 }
2229 
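/*
 * A small worked example of the binary exponentiation above (illustrative
 * numbers only): take x == 0.5 in FSHIFT == 11 fixed point, i.e. x == 1024,
 * frac_bits == 11, n == 5 (binary 101):
 *
 *   bit 0 set:  result = 2048 * 1024 >> 11 ~= 1024   (x^1)
 *   square:     x      = 1024 * 1024 >> 11 ~=  512   (x^2)
 *   bit 1 clear, square again:
 *               x      =  512 *  512 >> 11 ~=  128   (x^4)
 *   bit 2 set:  result = 1024 *  128 >> 11 ~=   64   (x^1 * x^4 = x^5)
 *
 * 64/2048 == 0.03125 == 0.5^5, using two result multiplies and two
 * squarings instead of the four multiplies of the naive loop.
 */
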
2230 /*
2231  * a1 = a0 * e + a * (1 - e)
2232  *
2233  * a2 = a1 * e + a * (1 - e)
2234  *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
2235  *    = a0 * e^2 + a * (1 - e) * (1 + e)
2236  *
2237  * a3 = a2 * e + a * (1 - e)
2238  *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
2239  *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
2240  *
2241  *  ...
2242  *
2243  * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
2244  *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
2245  *    = a0 * e^n + a * (1 - e^n)
2246  *
2247  * [1] application of the geometric series:
2248  *
2249  *              n         1 - x^(n+1)
2250  *     S_n := \Sum x^i = -------------
2251  *             i=0          1 - x
2252  */
2253 static unsigned long
2254 calc_load_n(unsigned long load, unsigned long exp,
2255 	    unsigned long active, unsigned int n)
2256 {
2257 
2258 	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2259 }
2260 
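/*
 * For instance (illustrative numbers, assuming EXP_1 == 1884 as in
 * <linux/sched.h>): if the machine goes completely idle (active == 0)
 * and skips n == 3 LOAD_FREQ periods, calc_load_n(load, EXP_1, 0, 3)
 * returns load * (1884/2048)^3 ~= 0.78 * load, i.e. the 1-minute
 * average decays to ~78% of its old value in a single catch-up step
 * instead of three separate tick-driven updates.
 */
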
2261 /*
2262  * NO_HZ can leave us missing all per-cpu ticks calling
2263  * calc_load_account_active(), but since an idle CPU folds its delta into
2264  * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
2265  * in the pending idle delta if our idle period crossed a load cycle boundary.
2266  *
2267  * Once we've updated the global active value, we need to apply the exponential
2268  * weights adjusted to the number of cycles missed.
2269  */
2270 static void calc_global_nohz(void)
2271 {
2272 	long delta, active, n;
2273 
2274 	/*
2275 	 * If we crossed a calc_load_update boundary, make sure to fold
2276 	 * any pending idle changes; the respective CPUs might have
2277 	 * missed the tick driven calc_load_account_active() update
2278 	 * due to NO_HZ.
2279 	 */
2280 	delta = calc_load_fold_idle();
2281 	if (delta)
2282 		atomic_long_add(delta, &calc_load_tasks);
2283 
2284 	/*
2285 	 * It could be that the one fold was all it took; if so, we're done.
2286 	 */
2287 	if (time_before(jiffies, calc_load_update + 10))
2288 		return;
2289 
2290 	/*
2291 	 * Catch up: fold in however many periods we are still behind.
2292 	 */
2293 	delta = jiffies - calc_load_update - 10;
2294 	n = 1 + (delta / LOAD_FREQ);
2295 
2296 	active = atomic_long_read(&calc_load_tasks);
2297 	active = active > 0 ? active * FIXED_1 : 0;
2298 
2299 	avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2300 	avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2301 	avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2302 
2303 	calc_load_update += n * LOAD_FREQ;
2304 }
2305 #else
2306 void calc_load_account_idle(struct rq *this_rq)
2307 {
2308 }
2309 
2310 static inline long calc_load_fold_idle(void)
2311 {
2312 	return 0;
2313 }
2314 
2315 static void calc_global_nohz(void)
2316 {
2317 }
2318 #endif
2319 
2320 /**
2321  * get_avenrun - get the load average array
2322  * @loads:	pointer to dest load array
2323  * @offset:	offset to add
2324  * @shift:	shift count to shift the result left
2325  *
2326  * These values are estimates at best, so no need for locking.
2327  */
2328 void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2329 {
2330 	loads[0] = (avenrun[0] + offset) << shift;
2331 	loads[1] = (avenrun[1] + offset) << shift;
2332 	loads[2] = (avenrun[2] + offset) << shift;
2333 }
2334 
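/*
 * The usual consumer is the /proc/loadavg code, which - at least in this
 * era of the kernel - passes offset == FIXED_1/200 so that the fixed-point
 * value rounds to the nearest 1/100 when printed.  For example, with
 * avenrun[0] == 676 (~0.33 * FIXED_1):
 *
 *   loads[0] = 676 + 2048/200             = 686
 *   integer part:  686 >> FSHIFT          = 0
 *   fraction: ((686 & 2047) * 100) >> 11  = 33
 *
 * which is reported as "0.33".
 */
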
2335 /*
2336  * calc_global_load - update the avenrun load estimates 10 ticks after the
2337  * CPUs have updated calc_load_tasks.
2338  */
2339 void calc_global_load(unsigned long ticks)
2340 {
2341 	long active;
2342 
2343 	if (time_before(jiffies, calc_load_update + 10))
2344 		return;
2345 
2346 	active = atomic_long_read(&calc_load_tasks);
2347 	active = active > 0 ? active * FIXED_1 : 0;
2348 
2349 	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2350 	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2351 	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2352 
2353 	calc_load_update += LOAD_FREQ;
2354 
2355 	/*
2356 	 * Account one period with whatever state we found before
2357 	 * folding in the nohz state and ageing the entire idle period.
2358 	 *
2359 	 * This avoids losing a sample when we go idle between
2360 	 * calc_load_account_active() (10 ticks ago) and now and thus
2361 	 * under-accounting.
2362 	 */
2363 	calc_global_nohz();
2364 }
2365 
2366 /*
2367  * Called from update_cpu_load() to periodically update this CPU's
2368  * active count.
2369  */
2370 static void calc_load_account_active(struct rq *this_rq)
2371 {
2372 	long delta;
2373 
2374 	if (time_before(jiffies, this_rq->calc_load_update))
2375 		return;
2376 
2377 	delta  = calc_load_fold_active(this_rq);
2378 	delta += calc_load_fold_idle();
2379 	if (delta)
2380 		atomic_long_add(delta, &calc_load_tasks);
2381 
2382 	this_rq->calc_load_update += LOAD_FREQ;
2383 }
2384 
2385 /*
2386  * The exact cpuload at various idx values, calculated at every tick would be
2387  * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2388  *
2389  * If a CPU misses updates for n-1 ticks (because it was idle) and the update
2390  * finally gets called on the nth tick, when the CPU may be busy, then we have:
2391  * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2392  * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
2393  *
2394  * decay_load_missed() below does efficient calculation of
2395  * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2396  * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
2397  *
2398  * The calculation is approximated on a 128 point scale.
2399  * degrade_zero_ticks is the number of ticks after which load at any
2400  * particular idx is approximated to be zero.
2401  * degrade_factor is a precomputed table, a row for each load idx.
2402  * Each column corresponds to degradation factor for a power of two ticks,
2403  * based on 128 point scale.
2404  * Example:
2405  * row 2, col 3 (=12) says that the degradation at load idx 2 after
2406  * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
2407  *
2408  * With these power-of-2 load factors, we can degrade the load for n ticks
2409  * by looking at the 1 bits in n and doing that many mult/shifts instead of
2410  * the n mult/shifts needed by the exact degradation.
2411  */
2412 #define DEGRADE_SHIFT		7
2413 static const unsigned char
2414 		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
2415 static const unsigned char
2416 		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
2417 					{0, 0, 0, 0, 0, 0, 0, 0},
2418 					{64, 32, 8, 0, 0, 0, 0, 0},
2419 					{96, 72, 40, 12, 1, 0, 0},
2420 					{112, 98, 75, 43, 15, 1, 0},
2421 					{120, 112, 98, 76, 45, 16, 2} };
2422 
2423 /*
2424  * Update cpu_load for any missed ticks, due to tickless idle. The backlog
2425  * only builds up while the CPU is idle, so we just decay the old load without
2426  * adding any new load.
2427  */
2428 static unsigned long
2429 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2430 {
2431 	int j = 0;
2432 
2433 	if (!missed_updates)
2434 		return load;
2435 
2436 	if (missed_updates >= degrade_zero_ticks[idx])
2437 		return 0;
2438 
2439 	if (idx == 1)
2440 		return load >> missed_updates;
2441 
2442 	while (missed_updates) {
2443 		if (missed_updates % 2)
2444 			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2445 
2446 		missed_updates >>= 1;
2447 		j++;
2448 	}
2449 	return load;
2450 }
2451 
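/*
 * Worked example (numbers taken from the table above): for idx == 2 and
 * missed_updates == 8 (binary 1000) the loop applies degrade_factor[2][3]
 * exactly once:
 *
 *   load = (load * 12) >> 7
 *
 * i.e. load decays to 12/128 ~= 9%, matching the 3^8/4^8 approximation
 * quoted above.  For missed_updates == 5 (binary 101) it would apply
 * 96/128 and then 40/128 instead - one multiply/shift per set bit.
 */
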
2452 /*
2453  * Update rq->cpu_load[] statistics. This function is usually called every
2454  * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2455  * every tick. We fix it up based on jiffies.
2456  */
2457 void update_cpu_load(struct rq *this_rq)
2458 {
2459 	unsigned long this_load = this_rq->load.weight;
2460 	unsigned long curr_jiffies = jiffies;
2461 	unsigned long pending_updates;
2462 	int i, scale;
2463 
2464 	this_rq->nr_load_updates++;
2465 
2466 	/* Avoid repeated calls on same jiffy, when moving in and out of idle */
2467 	if (curr_jiffies == this_rq->last_load_update_tick)
2468 		return;
2469 
2470 	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2471 	this_rq->last_load_update_tick = curr_jiffies;
2472 
2473 	/* Update our load: */
2474 	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2475 	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2476 		unsigned long old_load, new_load;
2477 
2478 		/* scale is effectively 1 << i now, and >> i divides by scale */
2479 
2480 		old_load = this_rq->cpu_load[i];
2481 		old_load = decay_load_missed(old_load, pending_updates - 1, i);
2482 		new_load = this_load;
2483 		/*
2484 		 * Round up the averaging division if load is increasing. This
2485 		 * prevents us from getting stuck on 9 if the load is 10, for
2486 		 * example.
2487 		 */
2488 		if (new_load > old_load)
2489 			new_load += scale - 1;
2490 
2491 		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
2492 	}
2493 
2494 	sched_avg_update(this_rq);
2495 }
2496 
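/*
 * Concretely (illustrative numbers): for i == 2 the update above is
 * cpu_load[2] = (3 * old + new) >> 2.  With old == 9 and this_load == 10
 * the "round up when increasing" step turns new into 13, giving
 * (27 + 13) >> 2 == 10; without it the average would sit at
 * (27 + 10) >> 2 == 9 forever - the "stuck on 9" case the comment in the
 * loop refers to.
 */
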
2497 static void update_cpu_load_active(struct rq *this_rq)
2498 {
2499 	update_cpu_load(this_rq);
2500 
2501 	calc_load_account_active(this_rq);
2502 }
2503 
2504 #ifdef CONFIG_SMP
2505 
2506 /*
2507  * sched_exec - execve() is a valuable balancing opportunity, because at
2508  * this point the task has the smallest effective memory and cache footprint.
2509  */
2510 void sched_exec(void)
2511 {
2512 	struct task_struct *p = current;
2513 	unsigned long flags;
2514 	int dest_cpu;
2515 
2516 	raw_spin_lock_irqsave(&p->pi_lock, flags);
2517 	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
2518 	if (dest_cpu == smp_processor_id())
2519 		goto unlock;
2520 
2521 	if (likely(cpu_active(dest_cpu))) {
2522 		struct migration_arg arg = { p, dest_cpu };
2523 
2524 		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2525 		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2526 		return;
2527 	}
2528 unlock:
2529 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2530 }
2531 
2532 #endif
2533 
2534 DEFINE_PER_CPU(struct kernel_stat, kstat);
2535 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2536 
2537 EXPORT_PER_CPU_SYMBOL(kstat);
2538 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2539 
2540 /*
2541  * Return any ns on the sched_clock that have not yet been accounted to
2542  * @p, in case that task is currently running.
2543  *
2544  * Called with task_rq_lock() held on @rq.
2545  */
2546 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2547 {
2548 	u64 ns = 0;
2549 
2550 	if (task_current(rq, p)) {
2551 		update_rq_clock(rq);
2552 		ns = rq->clock_task - p->se.exec_start;
2553 		if ((s64)ns < 0)
2554 			ns = 0;
2555 	}
2556 
2557 	return ns;
2558 }
2559 
2560 unsigned long long task_delta_exec(struct task_struct *p)
2561 {
2562 	unsigned long flags;
2563 	struct rq *rq;
2564 	u64 ns = 0;
2565 
2566 	rq = task_rq_lock(p, &flags);
2567 	ns = do_task_delta_exec(p, rq);
2568 	task_rq_unlock(rq, p, &flags);
2569 
2570 	return ns;
2571 }
2572 
2573 /*
2574  * Return accounted runtime for the task.
2575  * In case the task is currently running, return the runtime plus the
2576  * pending runtime that has not been accounted yet.
2577  */
2578 unsigned long long task_sched_runtime(struct task_struct *p)
2579 {
2580 	unsigned long flags;
2581 	struct rq *rq;
2582 	u64 ns = 0;
2583 
2584 	rq = task_rq_lock(p, &flags);
2585 	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2586 	task_rq_unlock(rq, p, &flags);
2587 
2588 	return ns;
2589 }
2590 
2591 #ifdef CONFIG_CGROUP_CPUACCT
2592 struct cgroup_subsys cpuacct_subsys;
2593 struct cpuacct root_cpuacct;
2594 #endif
2595 
2596 static inline void task_group_account_field(struct task_struct *p, int index,
2597 					    u64 tmp)
2598 {
2599 #ifdef CONFIG_CGROUP_CPUACCT
2600 	struct kernel_cpustat *kcpustat;
2601 	struct cpuacct *ca;
2602 #endif
2603 	/*
2604 	 * Since all updates are sure to touch the root cgroup, we
2605 	 * get ahead of ourselves and touch it first. If the root cgroup
2606 	 * is the only cgroup, then nothing else should be necessary.
2608 	 */
2609 	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2610 
2611 #ifdef CONFIG_CGROUP_CPUACCT
2612 	if (unlikely(!cpuacct_subsys.active))
2613 		return;
2614 
2615 	rcu_read_lock();
2616 	ca = task_ca(p);
2617 	while (ca && (ca != &root_cpuacct)) {
2618 		kcpustat = this_cpu_ptr(ca->cpustat);
2619 		kcpustat->cpustat[index] += tmp;
2620 		ca = parent_ca(ca);
2621 	}
2622 	rcu_read_unlock();
2623 #endif
2624 }
2625 
2626 
2627 /*
2628  * Account user cpu time to a process.
2629  * @p: the process that the cpu time gets accounted to
2630  * @cputime: the cpu time spent in user space since the last update
2631  * @cputime_scaled: cputime scaled by cpu frequency
2632  */
2633 void account_user_time(struct task_struct *p, cputime_t cputime,
2634 		       cputime_t cputime_scaled)
2635 {
2636 	int index;
2637 
2638 	/* Add user time to process. */
2639 	p->utime += cputime;
2640 	p->utimescaled += cputime_scaled;
2641 	account_group_user_time(p, cputime);
2642 
2643 	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2644 
2645 	/* Add user time to cpustat. */
2646 	task_group_account_field(p, index, (__force u64) cputime);
2647 
2648 	/* Account for user time used */
2649 	acct_update_integrals(p);
2650 }
2651 
2652 /*
2653  * Account guest cpu time to a process.
2654  * @p: the process that the cpu time gets accounted to
2655  * @cputime: the cpu time spent in virtual machine since the last update
2656  * @cputime_scaled: cputime scaled by cpu frequency
2657  */
2658 static void account_guest_time(struct task_struct *p, cputime_t cputime,
2659 			       cputime_t cputime_scaled)
2660 {
2661 	u64 *cpustat = kcpustat_this_cpu->cpustat;
2662 
2663 	/* Add guest time to process. */
2664 	p->utime += cputime;
2665 	p->utimescaled += cputime_scaled;
2666 	account_group_user_time(p, cputime);
2667 	p->gtime += cputime;
2668 
2669 	/* Add guest time to cpustat. */
2670 	if (TASK_NICE(p) > 0) {
2671 		cpustat[CPUTIME_NICE] += (__force u64) cputime;
2672 		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
2673 	} else {
2674 		cpustat[CPUTIME_USER] += (__force u64) cputime;
2675 		cpustat[CPUTIME_GUEST] += (__force u64) cputime;
2676 	}
2677 }
2678 
2679 /*
2680  * Account system cpu time to a process and desired cpustat field
2681  * @p: the process that the cpu time gets accounted to
2682  * @cputime: the cpu time spent in kernel space since the last update
2683  * @cputime_scaled: cputime scaled by cpu frequency
2684  * @index: index of the cpustat field that has to be updated
2685  */
2686 static inline
2687 void __account_system_time(struct task_struct *p, cputime_t cputime,
2688 			cputime_t cputime_scaled, int index)
2689 {
2690 	/* Add system time to process. */
2691 	p->stime += cputime;
2692 	p->stimescaled += cputime_scaled;
2693 	account_group_system_time(p, cputime);
2694 
2695 	/* Add system time to cpustat. */
2696 	task_group_account_field(p, index, (__force u64) cputime);
2697 
2698 	/* Account for system time used */
2699 	acct_update_integrals(p);
2700 }
2701 
2702 /*
2703  * Account system cpu time to a process.
2704  * @p: the process that the cpu time gets accounted to
2705  * @hardirq_offset: the offset to subtract from hardirq_count()
2706  * @cputime: the cpu time spent in kernel space since the last update
2707  * @cputime_scaled: cputime scaled by cpu frequency
2708  */
2709 void account_system_time(struct task_struct *p, int hardirq_offset,
2710 			 cputime_t cputime, cputime_t cputime_scaled)
2711 {
2712 	int index;
2713 
2714 	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
2715 		account_guest_time(p, cputime, cputime_scaled);
2716 		return;
2717 	}
2718 
2719 	if (hardirq_count() - hardirq_offset)
2720 		index = CPUTIME_IRQ;
2721 	else if (in_serving_softirq())
2722 		index = CPUTIME_SOFTIRQ;
2723 	else
2724 		index = CPUTIME_SYSTEM;
2725 
2726 	__account_system_time(p, cputime, cputime_scaled, index);
2727 }
2728 
2729 /*
2730  * Account for involuntary wait time.
2731  * @cputime: the cpu time spent in involuntary wait
2732  */
2733 void account_steal_time(cputime_t cputime)
2734 {
2735 	u64 *cpustat = kcpustat_this_cpu->cpustat;
2736 
2737 	cpustat[CPUTIME_STEAL] += (__force u64) cputime;
2738 }
2739 
2740 /*
2741  * Account for idle time.
2742  * @cputime: the cpu time spent in idle wait
2743  */
2744 void account_idle_time(cputime_t cputime)
2745 {
2746 	u64 *cpustat = kcpustat_this_cpu->cpustat;
2747 	struct rq *rq = this_rq();
2748 
2749 	if (atomic_read(&rq->nr_iowait) > 0)
2750 		cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
2751 	else
2752 		cpustat[CPUTIME_IDLE] += (__force u64) cputime;
2753 }
2754 
2755 static __always_inline bool steal_account_process_tick(void)
2756 {
2757 #ifdef CONFIG_PARAVIRT
2758 	if (static_key_false(&paravirt_steal_enabled)) {
2759 		u64 steal, st = 0;
2760 
2761 		steal = paravirt_steal_clock(smp_processor_id());
2762 		steal -= this_rq()->prev_steal_time;
2763 
2764 		st = steal_ticks(steal);
2765 		this_rq()->prev_steal_time += st * TICK_NSEC;
2766 
2767 		account_steal_time(st);
2768 		return st;
2769 	}
2770 #endif
2771 	return false;
2772 }
2773 
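/*
 * Illustrative walk-through of the paravirt branch above, assuming
 * steal_ticks() effectively divides the stolen nanoseconds by TICK_NSEC:
 * if the hypervisor reports 2.5 ticks' worth of steal since the last
 * check, st == 2 is accounted and prev_steal_time advances by only
 * 2 * TICK_NSEC, so the remaining half tick is carried forward and
 * accounted on a later call rather than being lost.
 */
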
2774 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
2775 
2776 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
2777 /*
2778  * Account a tick to a process and cpustat
2779  * @p: the process that the cpu time gets accounted to
2780  * @user_tick: is the tick from userspace
2781  * @rq: the pointer to rq
2782  *
2783  * Tick demultiplexing follows the order
2784  * - pending hardirq update
2785  * - pending softirq update
2786  * - user_time
2787  * - idle_time
2788  * - system time
2789  *   - check for guest_time
2790  *   - else account as system_time
2791  *
2792  * The check for hardirq is done for both system and user time, as there is
2793  * no timer going off while we are in a hardirq and hence we may never get an
2794  * opportunity to update it solely in system time.
2795  * p->stime and friends are only updated on system time and not on hardirq or
2796  * softirq time, as those do not count in task exec_runtime any more.
2797  */
2798 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
2799 						struct rq *rq)
2800 {
2801 	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
2802 	u64 *cpustat = kcpustat_this_cpu->cpustat;
2803 
2804 	if (steal_account_process_tick())
2805 		return;
2806 
2807 	if (irqtime_account_hi_update()) {
2808 		cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
2809 	} else if (irqtime_account_si_update()) {
2810 		cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
2811 	} else if (this_cpu_ksoftirqd() == p) {
2812 		/*
2813 		 * ksoftirqd time does not get accounted in cpu_softirq_time.
2814 		 * So, we have to handle it separately here.
2815 		 * Also, p->stime needs to be updated for ksoftirqd.
2816 		 */
2817 		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
2818 					CPUTIME_SOFTIRQ);
2819 	} else if (user_tick) {
2820 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
2821 	} else if (p == rq->idle) {
2822 		account_idle_time(cputime_one_jiffy);
2823 	} else if (p->flags & PF_VCPU) { /* System time or guest time */
2824 		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
2825 	} else {
2826 		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
2827 					CPUTIME_SYSTEM);
2828 	}
2829 }
2830 
2831 static void irqtime_account_idle_ticks(int ticks)
2832 {
2833 	int i;
2834 	struct rq *rq = this_rq();
2835 
2836 	for (i = 0; i < ticks; i++)
2837 		irqtime_account_process_tick(current, 0, rq);
2838 }
2839 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
2840 static void irqtime_account_idle_ticks(int ticks) {}
2841 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
2842 						struct rq *rq) {}
2843 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2844 
2845 /*
2846  * Account a single tick of cpu time.
2847  * @p: the process that the cpu time gets accounted to
2848  * @user_tick: indicates if the tick is a user or a system tick
2849  */
2850 void account_process_tick(struct task_struct *p, int user_tick)
2851 {
2852 	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
2853 	struct rq *rq = this_rq();
2854 
2855 	if (sched_clock_irqtime) {
2856 		irqtime_account_process_tick(p, user_tick, rq);
2857 		return;
2858 	}
2859 
2860 	if (steal_account_process_tick())
2861 		return;
2862 
2863 	if (user_tick)
2864 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
2865 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
2866 		account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
2867 				    one_jiffy_scaled);
2868 	else
2869 		account_idle_time(cputime_one_jiffy);
2870 }
2871 
2872 /*
2873  * Account multiple ticks of steal time.
2875  * @ticks: number of stolen ticks
2876  */
2877 void account_steal_ticks(unsigned long ticks)
2878 {
2879 	account_steal_time(jiffies_to_cputime(ticks));
2880 }
2881 
2882 /*
2883  * Account multiple ticks of idle time.
2884  * @ticks: number of idle ticks
2885  */
2886 void account_idle_ticks(unsigned long ticks)
2887 {
2888 
2889 	if (sched_clock_irqtime) {
2890 		irqtime_account_idle_ticks(ticks);
2891 		return;
2892 	}
2893 
2894 	account_idle_time(jiffies_to_cputime(ticks));
2895 }
2896 
2897 #endif
2898 
2899 /*
2900  * Use precise platform statistics if available:
2901  */
2902 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
2903 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
2904 {
2905 	*ut = p->utime;
2906 	*st = p->stime;
2907 }
2908 
2909 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
2910 {
2911 	struct task_cputime cputime;
2912 
2913 	thread_group_cputime(p, &cputime);
2914 
2915 	*ut = cputime.utime;
2916 	*st = cputime.stime;
2917 }
2918 #else
2919 
2920 #ifndef nsecs_to_cputime
2921 # define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
2922 #endif
2923 
2924 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
2925 {
2926 	cputime_t rtime, utime = p->utime, total = utime + p->stime;
2927 
2928 	/*
2929 	 * Use CFS's precise accounting:
2930 	 */
2931 	rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
2932 
2933 	if (total) {
2934 		u64 temp = (__force u64) rtime;
2935 
2936 		temp *= (__force u64) utime;
2937 		do_div(temp, (__force u32) total);
2938 		utime = (__force cputime_t) temp;
2939 	} else
2940 		utime = rtime;
2941 
2942 	/*
2943 	 * Compare with previous values, to keep monotonicity:
2944 	 */
2945 	p->prev_utime = max(p->prev_utime, utime);
2946 	p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
2947 
2948 	*ut = p->prev_utime;
2949 	*st = p->prev_stime;
2950 }
2951 
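/*
 * Example of the scaling above (illustrative numbers): suppose the
 * tick-based samples are utime == 2 and stime == 6 jiffies (total == 8),
 * while CFS's sum_exec_runtime corresponds to rtime == 12 jiffies.
 * Then utime is rescaled to 12 * 2 / 8 == 3 and, monotonicity aside,
 * the reported pair becomes (ut, st) == (3, 9): the 1:3 user/system
 * ratio of the samples is kept, but the total matches the precise
 * CFS runtime.
 */
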
2952 /*
2953  * Must be called with siglock held.
2954  */
2955 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
2956 {
2957 	struct signal_struct *sig = p->signal;
2958 	struct task_cputime cputime;
2959 	cputime_t rtime, utime, total;
2960 
2961 	thread_group_cputime(p, &cputime);
2962 
2963 	total = cputime.utime + cputime.stime;
2964 	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
2965 
2966 	if (total) {
2967 		u64 temp = (__force u64) rtime;
2968 
2969 		temp *= (__force u64) cputime.utime;
2970 		do_div(temp, (__force u32) total);
2971 		utime = (__force cputime_t) temp;
2972 	} else
2973 		utime = rtime;
2974 
2975 	sig->prev_utime = max(sig->prev_utime, utime);
2976 	sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
2977 
2978 	*ut = sig->prev_utime;
2979 	*st = sig->prev_stime;
2980 }
2981 #endif
2982 
2983 /*
2984  * This function gets called by the timer code, with HZ frequency.
2985  * We call it with interrupts disabled.
2986  */
2987 void scheduler_tick(void)
2988 {
2989 	int cpu = smp_processor_id();
2990 	struct rq *rq = cpu_rq(cpu);
2991 	struct task_struct *curr = rq->curr;
2992 
2993 	sched_clock_tick();
2994 
2995 	raw_spin_lock(&rq->lock);
2996 	update_rq_clock(rq);
2997 	update_cpu_load_active(rq);
2998 	curr->sched_class->task_tick(rq, curr, 0);
2999 	raw_spin_unlock(&rq->lock);
3000 
3001 	perf_event_task_tick();
3002 
3003 #ifdef CONFIG_SMP
3004 	rq->idle_balance = idle_cpu(cpu);
3005 	trigger_load_balance(rq, cpu);
3006 #endif
3007 }
3008 
3009 notrace unsigned long get_parent_ip(unsigned long addr)
3010 {
3011 	if (in_lock_functions(addr)) {
3012 		addr = CALLER_ADDR2;
3013 		if (in_lock_functions(addr))
3014 			addr = CALLER_ADDR3;
3015 	}
3016 	return addr;
3017 }
3018 
3019 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3020 				defined(CONFIG_PREEMPT_TRACER))
3021 
3022 void __kprobes add_preempt_count(int val)
3023 {
3024 #ifdef CONFIG_DEBUG_PREEMPT
3025 	/*
3026 	 * Underflow?
3027 	 */
3028 	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3029 		return;
3030 #endif
3031 	preempt_count() += val;
3032 #ifdef CONFIG_DEBUG_PREEMPT
3033 	/*
3034 	 * Spinlock count overflowing soon?
3035 	 */
3036 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3037 				PREEMPT_MASK - 10);
3038 #endif
3039 	if (preempt_count() == val)
3040 		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3041 }
3042 EXPORT_SYMBOL(add_preempt_count);
3043 
3044 void __kprobes sub_preempt_count(int val)
3045 {
3046 #ifdef CONFIG_DEBUG_PREEMPT
3047 	/*
3048 	 * Underflow?
3049 	 */
3050 	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3051 		return;
3052 	/*
3053 	 * Is the spinlock portion underflowing?
3054 	 */
3055 	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3056 			!(preempt_count() & PREEMPT_MASK)))
3057 		return;
3058 #endif
3059 
3060 	if (preempt_count() == val)
3061 		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3062 	preempt_count() -= val;
3063 }
3064 EXPORT_SYMBOL(sub_preempt_count);
3065 
3066 #endif
3067 
3068 /*
3069  * Print scheduling while atomic bug:
3070  */
3071 static noinline void __schedule_bug(struct task_struct *prev)
3072 {
3073 	struct pt_regs *regs = get_irq_regs();
3074 
3075 	if (oops_in_progress)
3076 		return;
3077 
3078 	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3079 		prev->comm, prev->pid, preempt_count());
3080 
3081 	debug_show_held_locks(prev);
3082 	print_modules();
3083 	if (irqs_disabled())
3084 		print_irqtrace_events(prev);
3085 
3086 	if (regs)
3087 		show_regs(regs);
3088 	else
3089 		dump_stack();
3090 }
3091 
3092 /*
3093  * Various schedule()-time debugging checks and statistics:
3094  */
3095 static inline void schedule_debug(struct task_struct *prev)
3096 {
3097 	/*
3098 	 * Test if we are atomic. Since do_exit() needs to call into
3099 	 * schedule() atomically, we ignore that path for now.
3100 	 * Otherwise, whine if we are scheduling when we should not be.
3101 	 */
3102 	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
3103 		__schedule_bug(prev);
3104 	rcu_sleep_check();
3105 
3106 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3107 
3108 	schedstat_inc(this_rq(), sched_count);
3109 }
3110 
3111 static void put_prev_task(struct rq *rq, struct task_struct *prev)
3112 {
3113 	if (prev->on_rq || rq->skip_clock_update < 0)
3114 		update_rq_clock(rq);
3115 	prev->sched_class->put_prev_task(rq, prev);
3116 }
3117 
3118 /*
3119  * Pick up the highest-prio task:
3120  */
3121 static inline struct task_struct *
3122 pick_next_task(struct rq *rq)
3123 {
3124 	const struct sched_class *class;
3125 	struct task_struct *p;
3126 
3127 	/*
3128 	 * Optimization: we know that if all tasks are in
3129 	 * the fair class we can call that function directly:
3130 	 */
3131 	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
3132 		p = fair_sched_class.pick_next_task(rq);
3133 		if (likely(p))
3134 			return p;
3135 	}
3136 
3137 	for_each_class(class) {
3138 		p = class->pick_next_task(rq);
3139 		if (p)
3140 			return p;
3141 	}
3142 
3143 	BUG(); /* the idle class will always have a runnable task */
3144 }
3145 
3146 /*
3147  * __schedule() is the main scheduler function.
3148  */
3149 static void __sched __schedule(void)
3150 {
3151 	struct task_struct *prev, *next;
3152 	unsigned long *switch_count;
3153 	struct rq *rq;
3154 	int cpu;
3155 
3156 need_resched:
3157 	preempt_disable();
3158 	cpu = smp_processor_id();
3159 	rq = cpu_rq(cpu);
3160 	rcu_note_context_switch(cpu);
3161 	prev = rq->curr;
3162 
3163 	schedule_debug(prev);
3164 
3165 	if (sched_feat(HRTICK))
3166 		hrtick_clear(rq);
3167 
3168 	raw_spin_lock_irq(&rq->lock);
3169 
3170 	switch_count = &prev->nivcsw;
3171 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3172 		if (unlikely(signal_pending_state(prev->state, prev))) {
3173 			prev->state = TASK_RUNNING;
3174 		} else {
3175 			deactivate_task(rq, prev, DEQUEUE_SLEEP);
3176 			prev->on_rq = 0;
3177 
3178 			/*
3179 			 * If a worker went to sleep, notify and ask workqueue
3180 			 * whether it wants to wake up a task to maintain
3181 			 * concurrency.
3182 			 */
3183 			if (prev->flags & PF_WQ_WORKER) {
3184 				struct task_struct *to_wakeup;
3185 
3186 				to_wakeup = wq_worker_sleeping(prev, cpu);
3187 				if (to_wakeup)
3188 					try_to_wake_up_local(to_wakeup);
3189 			}
3190 		}
3191 		switch_count = &prev->nvcsw;
3192 	}
3193 
3194 	pre_schedule(rq, prev);
3195 
3196 	if (unlikely(!rq->nr_running))
3197 		idle_balance(cpu, rq);
3198 
3199 	put_prev_task(rq, prev);
3200 	next = pick_next_task(rq);
3201 	clear_tsk_need_resched(prev);
3202 	rq->skip_clock_update = 0;
3203 
3204 	if (likely(prev != next)) {
3205 		rq->nr_switches++;
3206 		rq->curr = next;
3207 		++*switch_count;
3208 
3209 		context_switch(rq, prev, next); /* unlocks the rq */
3210 		/*
3211 		 * The context switch has flipped the stack from under us
3212 		 * and restored the local variables which were saved when
3213 		 * this task called schedule() in the past. prev == current
3214 		 * is still correct, but it may have moved to another cpu/rq.
3215 		 */
3216 		cpu = smp_processor_id();
3217 		rq = cpu_rq(cpu);
3218 	} else
3219 		raw_spin_unlock_irq(&rq->lock);
3220 
3221 	post_schedule(rq);
3222 
3223 	sched_preempt_enable_no_resched();
3224 	if (need_resched())
3225 		goto need_resched;
3226 }
3227 
3228 static inline void sched_submit_work(struct task_struct *tsk)
3229 {
3230 	if (!tsk->state || tsk_is_pi_blocked(tsk))
3231 		return;
3232 	/*
3233 	 * If we are going to sleep and we have plugged IO queued,
3234 	 * make sure to submit it to avoid deadlocks.
3235 	 */
3236 	if (blk_needs_flush_plug(tsk))
3237 		blk_schedule_flush_plug(tsk);
3238 }
3239 
3240 asmlinkage void __sched schedule(void)
3241 {
3242 	struct task_struct *tsk = current;
3243 
3244 	sched_submit_work(tsk);
3245 	__schedule();
3246 }
3247 EXPORT_SYMBOL(schedule);
3248 
3249 /**
3250  * schedule_preempt_disabled - called with preemption disabled
3251  *
3252  * Returns with preemption disabled. Note: preempt_count must be 1
3253  */
3254 void __sched schedule_preempt_disabled(void)
3255 {
3256 	sched_preempt_enable_no_resched();
3257 	schedule();
3258 	preempt_disable();
3259 }
3260 
3261 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3262 
3263 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
3264 {
3265 	if (lock->owner != owner)
3266 		return false;
3267 
3268 	/*
3269 	 * Ensure we emit the owner->on_cpu dereference _after_ checking that
3270 	 * lock->owner still matches owner. If that fails, owner might
3271 	 * point to free()d memory; if it still matches, the rcu_read_lock()
3272 	 * ensures the memory stays valid.
3273 	 */
3274 	barrier();
3275 
3276 	return owner->on_cpu;
3277 }
3278 
3279 /*
3280  * Look out! "owner" is an entirely speculative pointer
3281  * access and not reliable.
3282  */
3283 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3284 {
3285 	if (!sched_feat(OWNER_SPIN))
3286 		return 0;
3287 
3288 	rcu_read_lock();
3289 	while (owner_running(lock, owner)) {
3290 		if (need_resched())
3291 			break;
3292 
3293 		arch_mutex_cpu_relax();
3294 	}
3295 	rcu_read_unlock();
3296 
3297 	/*
3298 	 * We break out of the loop above on need_resched() and when the
3299 	 * owner changes, which is a sign of heavy contention. Return
3300 	 * success only when lock->owner is NULL.
3301 	 */
3302 	return lock->owner == NULL;
3303 }
3304 #endif
3305 
3306 #ifdef CONFIG_PREEMPT
3307 /*
3308  * This is the entry point to schedule() from in-kernel preemption off of
3309  * preempt_enable(). Kernel preemptions off the return-from-interrupt path
3310  * are handled by preempt_schedule_irq() below, which calls schedule() directly.
3311  */
3312 asmlinkage void __sched notrace preempt_schedule(void)
3313 {
3314 	struct thread_info *ti = current_thread_info();
3315 
3316 	/*
3317 	 * If there is a non-zero preempt_count or interrupts are disabled,
3318 	 * we do not want to preempt the current task. Just return.
3319 	 */
3320 	if (likely(ti->preempt_count || irqs_disabled()))
3321 		return;
3322 
3323 	do {
3324 		add_preempt_count_notrace(PREEMPT_ACTIVE);
3325 		__schedule();
3326 		sub_preempt_count_notrace(PREEMPT_ACTIVE);
3327 
3328 		/*
3329 		 * Check again in case we missed a preemption opportunity
3330 		 * between schedule and now.
3331 		 */
3332 		barrier();
3333 	} while (need_resched());
3334 }
3335 EXPORT_SYMBOL(preempt_schedule);
3336 
3337 /*
3338  * This is the entry point to schedule() from kernel preemption
3339  * off of irq context.
3340  * Note that this is called and returns with irqs disabled. This
3341  * protects us against recursive calls from irq context.
3342  */
3343 asmlinkage void __sched preempt_schedule_irq(void)
3344 {
3345 	struct thread_info *ti = current_thread_info();
3346 
3347 	/* Catch callers which need to be fixed */
3348 	BUG_ON(ti->preempt_count || !irqs_disabled());
3349 
3350 	do {
3351 		add_preempt_count(PREEMPT_ACTIVE);
3352 		local_irq_enable();
3353 		__schedule();
3354 		local_irq_disable();
3355 		sub_preempt_count(PREEMPT_ACTIVE);
3356 
3357 		/*
3358 		 * Check again in case we missed a preemption opportunity
3359 		 * between schedule and now.
3360 		 */
3361 		barrier();
3362 	} while (need_resched());
3363 }
3364 
3365 #endif /* CONFIG_PREEMPT */
3366 
3367 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3368 			  void *key)
3369 {
3370 	return try_to_wake_up(curr->private, mode, wake_flags);
3371 }
3372 EXPORT_SYMBOL(default_wake_function);
3373 
3374 /*
3375  * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3376  * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3377  * number) then we wake all the non-exclusive tasks and one exclusive task.
3378  *
3379  * There are circumstances in which we can try to wake a task which has already
3380  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3381  * zero in this (rare) case, and we handle it by continuing to scan the queue.
3382  */
3383 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3384 			int nr_exclusive, int wake_flags, void *key)
3385 {
3386 	wait_queue_t *curr, *next;
3387 
3388 	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3389 		unsigned flags = curr->flags;
3390 
3391 		if (curr->func(curr, mode, wake_flags, key) &&
3392 				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3393 			break;
3394 	}
3395 }
3396 
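/*
 * For example: if the queue holds, in order, two non-exclusive waiters
 * followed by three exclusive ones, a call with nr_exclusive == 1 wakes
 * both non-exclusive waiters (they carry no WQ_FLAG_EXCLUSIVE and so
 * never decrement the count) plus the first exclusive waiter, then
 * breaks out; the other two exclusive waiters remain queued.
 */
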
3397 /**
3398  * __wake_up - wake up threads blocked on a waitqueue.
3399  * @q: the waitqueue
3400  * @mode: which threads
3401  * @nr_exclusive: how many wake-one or wake-many threads to wake up
3402  * @key: is directly passed to the wakeup function
3403  *
3404  * It may be assumed that this function implies a write memory barrier before
3405  * changing the task state if and only if any tasks are woken up.
3406  */
3407 void __wake_up(wait_queue_head_t *q, unsigned int mode,
3408 			int nr_exclusive, void *key)
3409 {
3410 	unsigned long flags;
3411 
3412 	spin_lock_irqsave(&q->lock, flags);
3413 	__wake_up_common(q, mode, nr_exclusive, 0, key);
3414 	spin_unlock_irqrestore(&q->lock, flags);
3415 }
3416 EXPORT_SYMBOL(__wake_up);
3417 
3418 /*
3419  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3420  */
3421 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
3422 {
3423 	__wake_up_common(q, mode, nr, 0, NULL);
3424 }
3425 EXPORT_SYMBOL_GPL(__wake_up_locked);
3426 
3427 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3428 {
3429 	__wake_up_common(q, mode, 1, 0, key);
3430 }
3431 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
3432 
3433 /**
3434  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
3435  * @q: the waitqueue
3436  * @mode: which threads
3437  * @nr_exclusive: how many wake-one or wake-many threads to wake up
3438  * @key: opaque value to be passed to wakeup targets
3439  *
3440  * The sync wakeup differs in that the waker knows that it will schedule
3441  * away soon, so while the target thread will be woken up, it will not
3442  * be migrated to another CPU - ie. the two threads are 'synchronized'
3443  * with each other. This can prevent needless bouncing between CPUs.
3444  *
3445  * On UP it can prevent extra preemption.
3446  *
3447  * It may be assumed that this function implies a write memory barrier before
3448  * changing the task state if and only if any tasks are woken up.
3449  */
3450 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3451 			int nr_exclusive, void *key)
3452 {
3453 	unsigned long flags;
3454 	int wake_flags = WF_SYNC;
3455 
3456 	if (unlikely(!q))
3457 		return;
3458 
3459 	if (unlikely(!nr_exclusive))
3460 		wake_flags = 0;
3461 
3462 	spin_lock_irqsave(&q->lock, flags);
3463 	__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3464 	spin_unlock_irqrestore(&q->lock, flags);
3465 }
3466 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3467 
3468 /*
3469  * __wake_up_sync - see __wake_up_sync_key()
3470  */
3471 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3472 {
3473 	__wake_up_sync_key(q, mode, nr_exclusive, NULL);
3474 }
3475 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
3476 
3477 /**
3478  * complete: - signals a single thread waiting on this completion
3479  * @x:  holds the state of this particular completion
3480  *
3481  * This will wake up a single thread waiting on this completion. Threads will be
3482  * awakened in the same order in which they were queued.
3483  *
3484  * See also complete_all(), wait_for_completion() and related routines.
3485  *
3486  * It may be assumed that this function implies a write memory barrier before
3487  * changing the task state if and only if any tasks are woken up.
3488  */
3489 void complete(struct completion *x)
3490 {
3491 	unsigned long flags;
3492 
3493 	spin_lock_irqsave(&x->wait.lock, flags);
3494 	x->done++;
3495 	__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3496 	spin_unlock_irqrestore(&x->wait.lock, flags);
3497 }
3498 EXPORT_SYMBOL(complete);
3499 
3500 /**
3501  * complete_all: - signals all threads waiting on this completion
3502  * @x:  holds the state of this particular completion
3503  *
3504  * This will wake up all threads waiting on this particular completion event.
3505  *
3506  * It may be assumed that this function implies a write memory barrier before
3507  * changing the task state if and only if any tasks are woken up.
3508  */
3509 void complete_all(struct completion *x)
3510 {
3511 	unsigned long flags;
3512 
3513 	spin_lock_irqsave(&x->wait.lock, flags);
3514 	x->done += UINT_MAX/2;
3515 	__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3516 	spin_unlock_irqrestore(&x->wait.lock, flags);
3517 }
3518 EXPORT_SYMBOL(complete_all);
3519 
3520 static inline long __sched
3521 do_wait_for_common(struct completion *x, long timeout, int state)
3522 {
3523 	if (!x->done) {
3524 		DECLARE_WAITQUEUE(wait, current);
3525 
3526 		__add_wait_queue_tail_exclusive(&x->wait, &wait);
3527 		do {
3528 			if (signal_pending_state(state, current)) {
3529 				timeout = -ERESTARTSYS;
3530 				break;
3531 			}
3532 			__set_current_state(state);
3533 			spin_unlock_irq(&x->wait.lock);
3534 			timeout = schedule_timeout(timeout);
3535 			spin_lock_irq(&x->wait.lock);
3536 		} while (!x->done && timeout);
3537 		__remove_wait_queue(&x->wait, &wait);
3538 		if (!x->done)
3539 			return timeout;
3540 	}
3541 	x->done--;
3542 	return timeout ?: 1;
3543 }
3544 
3545 static long __sched
3546 wait_for_common(struct completion *x, long timeout, int state)
3547 {
3548 	might_sleep();
3549 
3550 	spin_lock_irq(&x->wait.lock);
3551 	timeout = do_wait_for_common(x, timeout, state);
3552 	spin_unlock_irq(&x->wait.lock);
3553 	return timeout;
3554 }
3555 
3556 /**
3557  * wait_for_completion: - waits for completion of a task
3558  * @x:  holds the state of this particular completion
3559  *
3560  * This waits to be signaled for completion of a specific task. It is NOT
3561  * interruptible and there is no timeout.
3562  *
3563  * See also similar routines (e.g. wait_for_completion_timeout()) with timeout
3564  * and interrupt capability. Also see complete().
3565  */
3566 void __sched wait_for_completion(struct completion *x)
3567 {
3568 	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3569 }
3570 EXPORT_SYMBOL(wait_for_completion);
3571 
3572 /**
3573  * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
3574  * @x:  holds the state of this particular completion
3575  * @timeout:  timeout value in jiffies
3576  *
3577  * This waits for either a completion of a specific task to be signaled or for a
3578  * specified timeout to expire. The timeout is in jiffies. It is not
3579  * interruptible.
3580  *
3581  * The return value is 0 if timed out, and positive (at least 1, or number of
3582  * jiffies left till timeout) if completed.
3583  */
3584 unsigned long __sched
3585 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3586 {
3587 	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3588 }
3589 EXPORT_SYMBOL(wait_for_completion_timeout);
3590 
3591 /**
3592  * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
3593  * @x:  holds the state of this particular completion
3594  *
3595  * This waits for completion of a specific task to be signaled. It is
3596  * interruptible.
3597  *
3598  * The return value is -ERESTARTSYS if interrupted, 0 if completed.
3599  */
3600 int __sched wait_for_completion_interruptible(struct completion *x)
3601 {
3602 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3603 	if (t == -ERESTARTSYS)
3604 		return t;
3605 	return 0;
3606 }
3607 EXPORT_SYMBOL(wait_for_completion_interruptible);
3608 
3609 /**
3610  * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
3611  * @x:  holds the state of this particular completion
3612  * @timeout:  timeout value in jiffies
3613  *
3614  * This waits for either a completion of a specific task to be signaled or for a
3615  * specified timeout to expire. It is interruptible. The timeout is in jiffies.
3616  *
3617  * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3618  * positive (at least 1, or number of jiffies left till timeout) if completed.
3619  */
3620 long __sched
3621 wait_for_completion_interruptible_timeout(struct completion *x,
3622 					  unsigned long timeout)
3623 {
3624 	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3625 }
3626 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3627 
3628 /**
3629  * wait_for_completion_killable: - waits for completion of a task (killable)
3630  * @x:  holds the state of this particular completion
3631  *
3632  * This waits to be signaled for completion of a specific task. It can be
3633  * interrupted by a kill signal.
3634  *
3635  * The return value is -ERESTARTSYS if interrupted, 0 if completed.
3636  */
3637 int __sched wait_for_completion_killable(struct completion *x)
3638 {
3639 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
3640 	if (t == -ERESTARTSYS)
3641 		return t;
3642 	return 0;
3643 }
3644 EXPORT_SYMBOL(wait_for_completion_killable);
3645 
3646 /**
3647  * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
3648  * @x:  holds the state of this particular completion
3649  * @timeout:  timeout value in jiffies
3650  *
3651  * This waits for either a completion of a specific task to be
3652  * signaled or for a specified timeout to expire. It can be
3653  * interrupted by a kill signal. The timeout is in jiffies.
3654  *
3655  * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3656  * positive (at least 1, or number of jiffies left till timeout) if completed.
3657  */
3658 long __sched
3659 wait_for_completion_killable_timeout(struct completion *x,
3660 				     unsigned long timeout)
3661 {
3662 	return wait_for_common(x, timeout, TASK_KILLABLE);
3663 }
3664 EXPORT_SYMBOL(wait_for_completion_killable_timeout);
3665 
3666 /**
3667  *	try_wait_for_completion - try to decrement a completion without blocking
3668  *	@x:	completion structure
3669  *
3670  *	Returns: 0 if a decrement cannot be done without blocking
3671  *		 1 if a decrement succeeded.
3672  *
3673  *	If a completion is being used as a counting completion,
3674  *	attempt to decrement the counter without blocking. This
3675  *	enables us to avoid waiting if the resource the completion
3676  *	is protecting is not available.
3677  */
3678 bool try_wait_for_completion(struct completion *x)
3679 {
3680 	unsigned long flags;
3681 	int ret = 1;
3682 
3683 	spin_lock_irqsave(&x->wait.lock, flags);
3684 	if (!x->done)
3685 		ret = 0;
3686 	else
3687 		x->done--;
3688 	spin_unlock_irqrestore(&x->wait.lock, flags);
3689 	return ret;
3690 }
3691 EXPORT_SYMBOL(try_wait_for_completion);
3692 
3693 /**
3694  *	completion_done - Test to see if a completion has any waiters
3695  *	@x:	completion structure
3696  *
3697  *	Returns: 0 if there are waiters (wait_for_completion() in progress)
3698  *		 1 if there are no waiters.
3699  *
3700  */
3701 bool completion_done(struct completion *x)
3702 {
3703 	unsigned long flags;
3704 	int ret = 1;
3705 
3706 	spin_lock_irqsave(&x->wait.lock, flags);
3707 	if (!x->done)
3708 		ret = 0;
3709 	spin_unlock_irqrestore(&x->wait.lock, flags);
3710 	return ret;
3711 }
3712 EXPORT_SYMBOL(completion_done);
3713 
3714 static long __sched
3715 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3716 {
3717 	unsigned long flags;
3718 	wait_queue_t wait;
3719 
3720 	init_waitqueue_entry(&wait, current);
3721 
3722 	__set_current_state(state);
3723 
3724 	spin_lock_irqsave(&q->lock, flags);
3725 	__add_wait_queue(q, &wait);
3726 	spin_unlock(&q->lock);
3727 	timeout = schedule_timeout(timeout);
3728 	spin_lock_irq(&q->lock);
3729 	__remove_wait_queue(q, &wait);
3730 	spin_unlock_irqrestore(&q->lock, flags);
3731 
3732 	return timeout;
3733 }
3734 
3735 void __sched interruptible_sleep_on(wait_queue_head_t *q)
3736 {
3737 	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3738 }
3739 EXPORT_SYMBOL(interruptible_sleep_on);
3740 
3741 long __sched
3742 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3743 {
3744 	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3745 }
3746 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3747 
3748 void __sched sleep_on(wait_queue_head_t *q)
3749 {
3750 	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3751 }
3752 EXPORT_SYMBOL(sleep_on);
3753 
3754 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3755 {
3756 	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3757 }
3758 EXPORT_SYMBOL(sleep_on_timeout);
3759 
3760 #ifdef CONFIG_RT_MUTEXES
3761 
3762 /*
3763  * rt_mutex_setprio - set the current priority of a task
3764  * @p: task
3765  * @prio: prio value (kernel-internal form)
3766  *
3767  * This function changes the 'effective' priority of a task. It does
3768  * not touch ->normal_prio like __setscheduler().
3769  *
3770  * Used by the rt_mutex code to implement priority inheritance logic.
3771  */
3772 void rt_mutex_setprio(struct task_struct *p, int prio)
3773 {
3774 	int oldprio, on_rq, running;
3775 	struct rq *rq;
3776 	const struct sched_class *prev_class;
3777 
3778 	BUG_ON(prio < 0 || prio > MAX_PRIO);
3779 
3780 	rq = __task_rq_lock(p);
3781 
3782 	/*
3783 	 * Idle task boosting is a no-no in general. There is one
3784 	 * exception, when PREEMPT_RT and NOHZ are active:
3785 	 *
3786 	 * The idle task calls get_next_timer_interrupt() and holds
3787 	 * the timer wheel base->lock on the CPU and another CPU wants
3788 	 * to access the timer (probably to cancel it). We can safely
3789 	 * ignore the boosting request, as the idle CPU runs this code
3790 	 * with interrupts disabled and will complete the lock
3791 	 * protected section without being interrupted. So there is no
3792 	 * real need to boost.
3793 	 */
3794 	if (unlikely(p == rq->idle)) {
3795 		WARN_ON(p != rq->curr);
3796 		WARN_ON(p->pi_blocked_on);
3797 		goto out_unlock;
3798 	}
3799 
3800 	trace_sched_pi_setprio(p, prio);
3801 	oldprio = p->prio;
3802 	prev_class = p->sched_class;
3803 	on_rq = p->on_rq;
3804 	running = task_current(rq, p);
3805 	if (on_rq)
3806 		dequeue_task(rq, p, 0);
3807 	if (running)
3808 		p->sched_class->put_prev_task(rq, p);
3809 
3810 	if (rt_prio(prio))
3811 		p->sched_class = &rt_sched_class;
3812 	else
3813 		p->sched_class = &fair_sched_class;
3814 
3815 	p->prio = prio;
3816 
3817 	if (running)
3818 		p->sched_class->set_curr_task(rq);
3819 	if (on_rq)
3820 		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
3821 
3822 	check_class_changed(rq, p, prev_class, oldprio);
3823 out_unlock:
3824 	__task_rq_unlock(rq);
3825 }
3826 #endif
3827 void set_user_nice(struct task_struct *p, long nice)
3828 {
3829 	int old_prio, delta, on_rq;
3830 	unsigned long flags;
3831 	struct rq *rq;
3832 
3833 	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3834 		return;
3835 	/*
3836 	 * We have to be careful, if called from sys_setpriority(),
3837 	 * the task might be in the middle of scheduling on another CPU.
3838 	 */
3839 	rq = task_rq_lock(p, &flags);
3840 	/*
3841 	 * The RT priorities are set via sched_setscheduler(), but we still
3842 	 * allow the 'normal' nice value to be set - but as expected
3843 	 * it won't have any effect on scheduling as long as the task
3844 	 * remains SCHED_FIFO/SCHED_RR:
3845 	 */
3846 	if (task_has_rt_policy(p)) {
3847 		p->static_prio = NICE_TO_PRIO(nice);
3848 		goto out_unlock;
3849 	}
3850 	on_rq = p->on_rq;
3851 	if (on_rq)
3852 		dequeue_task(rq, p, 0);
3853 
3854 	p->static_prio = NICE_TO_PRIO(nice);
3855 	set_load_weight(p);
3856 	old_prio = p->prio;
3857 	p->prio = effective_prio(p);
3858 	delta = p->prio - old_prio;
3859 
3860 	if (on_rq) {
3861 		enqueue_task(rq, p, 0);
3862 		/*
3863 		 * If the task increased its priority or is running and
3864 		 * lowered its priority, then reschedule its CPU:
3865 		 */
3866 		if (delta < 0 || (delta > 0 && task_running(rq, p)))
3867 			resched_task(rq->curr);
3868 	}
3869 out_unlock:
3870 	task_rq_unlock(rq, p, &flags);
3871 }
3872 EXPORT_SYMBOL(set_user_nice);
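
/*
 * A minimal sketch of a common set_user_nice() use: a kernel thread
 * demoting itself to the weakest nice level so it yields CPU time to
 * normal-priority work. Illustrative only; not part of this file.
 */
#if 0	/* usage sketch, not built */
static void example_background_kthread_setup(void)
{
	/* 19 is the weakest (lowest-priority) nice value. */
	set_user_nice(current, 19);
}
#endif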
3873 
3874 /*
3875  * can_nice - check if a task can reduce its nice value
3876  * @p: task
3877  * @nice: nice value
3878  */
3879 int can_nice(const struct task_struct *p, const int nice)
3880 {
3881 	/* convert nice value [19,-20] to rlimit style value [1,40] */
3882 	int nice_rlim = 20 - nice;
3883 
3884 	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3885 		capable(CAP_SYS_NICE));
3886 }
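
/*
 * Worked example of the conversion above, as a sketch (not part of the
 * original file): nice 19 maps to rlimit value 1 and nice -20 to 40, so
 * requesting nice -5 needs RLIMIT_NICE >= 25 unless CAP_SYS_NICE is held.
 */
#if 0	/* usage sketch, not built */
static bool example_may_request_nice_minus_five(const struct task_struct *p)
{
	return can_nice(p, -5);
}
#endif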
3887 
3888 #ifdef __ARCH_WANT_SYS_NICE
3889 
3890 /*
3891  * sys_nice - change the priority of the current process.
3892  * @increment: priority increment
3893  *
3894  * sys_setpriority is a more generic, but much slower function that
3895  * does similar things.
3896  */
3897 SYSCALL_DEFINE1(nice, int, increment)
3898 {
3899 	long nice, retval;
3900 
3901 	/*
3902 	 * Setpriority might change our priority at the same moment.
3903 	 * We don't have to worry. Conceptually one call occurs first
3904 	 * and we have a single winner.
3905 	 */
3906 	if (increment < -40)
3907 		increment = -40;
3908 	if (increment > 40)
3909 		increment = 40;
3910 
3911 	nice = TASK_NICE(current) + increment;
3912 	if (nice < -20)
3913 		nice = -20;
3914 	if (nice > 19)
3915 		nice = 19;
3916 
3917 	if (increment < 0 && !can_nice(current, nice))
3918 		return -EPERM;
3919 
3920 	retval = security_task_setnice(current, nice);
3921 	if (retval)
3922 		return retval;
3923 
3924 	set_user_nice(current, nice);
3925 	return 0;
3926 }
3927 
3928 #endif
3929 
3930 /**
3931  * task_prio - return the priority value of a given task.
3932  * @p: the task in question.
3933  *
3934  * This is the priority value as seen by users in /proc.
3935  * RT tasks are offset by -100, so they appear in the range
3936  * -100..-1; normal tasks map their nice value onto the range 0..39.
3937  */
3938 int task_prio(const struct task_struct *p)
3939 {
3940 	return p->prio - MAX_RT_PRIO;
3941 }
3942 
3943 /**
3944  * task_nice - return the nice value of a given task.
3945  * @p: the task in question.
3946  */
3947 int task_nice(const struct task_struct *p)
3948 {
3949 	return TASK_NICE(p);
3950 }
3951 EXPORT_SYMBOL(task_nice);
3952 
3953 /**
3954  * idle_cpu - is a given cpu idle currently?
3955  * @cpu: the processor in question.
3956  */
3957 int idle_cpu(int cpu)
3958 {
3959 	struct rq *rq = cpu_rq(cpu);
3960 
3961 	if (rq->curr != rq->idle)
3962 		return 0;
3963 
3964 	if (rq->nr_running)
3965 		return 0;
3966 
3967 #ifdef CONFIG_SMP
3968 	if (!llist_empty(&rq->wake_list))
3969 		return 0;
3970 #endif
3971 
3972 	return 1;
3973 }
3974 
3975 /**
3976  * idle_task - return the idle task for a given cpu.
3977  * @cpu: the processor in question.
3978  */
3979 struct task_struct *idle_task(int cpu)
3980 {
3981 	return cpu_rq(cpu)->idle;
3982 }
3983 
3984 /**
3985  * find_process_by_pid - find a process with a matching PID value.
3986  * @pid: the pid in question.
3987  */
3988 static struct task_struct *find_process_by_pid(pid_t pid)
3989 {
3990 	return pid ? find_task_by_vpid(pid) : current;
3991 }
3992 
3993 /* Actually do priority change: must hold rq lock. */
3994 static void
3995 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
3996 {
3997 	p->policy = policy;
3998 	p->rt_priority = prio;
3999 	p->normal_prio = normal_prio(p);
4000 	/* we are holding p->pi_lock already */
4001 	p->prio = rt_mutex_getprio(p);
4002 	if (rt_prio(p->prio))
4003 		p->sched_class = &rt_sched_class;
4004 	else
4005 		p->sched_class = &fair_sched_class;
4006 	set_load_weight(p);
4007 }
4008 
4009 /*
4010  * Check whether the target process has a UID that matches the current process's.
4011  */
4012 static bool check_same_owner(struct task_struct *p)
4013 {
4014 	const struct cred *cred = current_cred(), *pcred;
4015 	bool match;
4016 
4017 	rcu_read_lock();
4018 	pcred = __task_cred(p);
4019 	if (cred->user->user_ns == pcred->user->user_ns)
4020 		match = (cred->euid == pcred->euid ||
4021 			 cred->euid == pcred->uid);
4022 	else
4023 		match = false;
4024 	rcu_read_unlock();
4025 	return match;
4026 }
4027 
4028 static int __sched_setscheduler(struct task_struct *p, int policy,
4029 				const struct sched_param *param, bool user)
4030 {
4031 	int retval, oldprio, oldpolicy = -1, on_rq, running;
4032 	unsigned long flags;
4033 	const struct sched_class *prev_class;
4034 	struct rq *rq;
4035 	int reset_on_fork;
4036 
4037 	/* may grab non-irq protected spin_locks */
4038 	BUG_ON(in_interrupt());
4039 recheck:
4040 	/* double check policy once rq lock held */
4041 	if (policy < 0) {
4042 		reset_on_fork = p->sched_reset_on_fork;
4043 		policy = oldpolicy = p->policy;
4044 	} else {
4045 		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
4046 		policy &= ~SCHED_RESET_ON_FORK;
4047 
4048 		if (policy != SCHED_FIFO && policy != SCHED_RR &&
4049 				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4050 				policy != SCHED_IDLE)
4051 			return -EINVAL;
4052 	}
4053 
4054 	/*
4055 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
4056 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4057 	 * SCHED_BATCH and SCHED_IDLE is 0.
4058 	 */
4059 	if (param->sched_priority < 0 ||
4060 	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4061 	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4062 		return -EINVAL;
4063 	if (rt_policy(policy) != (param->sched_priority != 0))
4064 		return -EINVAL;
4065 
4066 	/*
4067 	 * Allow unprivileged RT tasks to decrease priority:
4068 	 */
4069 	if (user && !capable(CAP_SYS_NICE)) {
4070 		if (rt_policy(policy)) {
4071 			unsigned long rlim_rtprio =
4072 					task_rlimit(p, RLIMIT_RTPRIO);
4073 
4074 			/* can't set/change the rt policy */
4075 			if (policy != p->policy && !rlim_rtprio)
4076 				return -EPERM;
4077 
4078 			/* can't increase priority */
4079 			if (param->sched_priority > p->rt_priority &&
4080 			    param->sched_priority > rlim_rtprio)
4081 				return -EPERM;
4082 		}
4083 
4084 		/*
4085 		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4086 		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4087 		 */
4088 		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4089 			if (!can_nice(p, TASK_NICE(p)))
4090 				return -EPERM;
4091 		}
4092 
4093 		/* can't change other user's priorities */
4094 		if (!check_same_owner(p))
4095 			return -EPERM;
4096 
4097 		/* Normal users shall not reset the sched_reset_on_fork flag */
4098 		if (p->sched_reset_on_fork && !reset_on_fork)
4099 			return -EPERM;
4100 	}
4101 
4102 	if (user) {
4103 		retval = security_task_setscheduler(p);
4104 		if (retval)
4105 			return retval;
4106 	}
4107 
4108 	/*
4109 	 * make sure no PI-waiters arrive (or leave) while we are
4110 	 * changing the priority of the task:
4111 	 *
4112 	 * To be able to change p->policy safely, the appropriate
4113 	 * runqueue lock must be held.
4114 	 */
4115 	rq = task_rq_lock(p, &flags);
4116 
4117 	/*
4118 	 * Changing the policy of the stop threads is a very bad idea
4119 	 */
4120 	if (p == rq->stop) {
4121 		task_rq_unlock(rq, p, &flags);
4122 		return -EINVAL;
4123 	}
4124 
4125 	/*
4126 	 * If not changing anything there's no need to proceed further:
4127 	 */
4128 	if (unlikely(policy == p->policy && (!rt_policy(policy) ||
4129 			param->sched_priority == p->rt_priority))) {
4130 
4131 		__task_rq_unlock(rq);
4132 		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4133 		return 0;
4134 	}
4135 
4136 #ifdef CONFIG_RT_GROUP_SCHED
4137 	if (user) {
4138 		/*
4139 		 * Do not allow realtime tasks into groups that have no runtime
4140 		 * assigned.
4141 		 */
4142 		if (rt_bandwidth_enabled() && rt_policy(policy) &&
4143 				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4144 				!task_group_is_autogroup(task_group(p))) {
4145 			task_rq_unlock(rq, p, &flags);
4146 			return -EPERM;
4147 		}
4148 	}
4149 #endif
4150 
4151 	/* recheck policy now with rq lock held */
4152 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4153 		policy = oldpolicy = -1;
4154 		task_rq_unlock(rq, p, &flags);
4155 		goto recheck;
4156 	}
4157 	on_rq = p->on_rq;
4158 	running = task_current(rq, p);
4159 	if (on_rq)
4160 		dequeue_task(rq, p, 0);
4161 	if (running)
4162 		p->sched_class->put_prev_task(rq, p);
4163 
4164 	p->sched_reset_on_fork = reset_on_fork;
4165 
4166 	oldprio = p->prio;
4167 	prev_class = p->sched_class;
4168 	__setscheduler(rq, p, policy, param->sched_priority);
4169 
4170 	if (running)
4171 		p->sched_class->set_curr_task(rq);
4172 	if (on_rq)
4173 		enqueue_task(rq, p, 0);
4174 
4175 	check_class_changed(rq, p, prev_class, oldprio);
4176 	task_rq_unlock(rq, p, &flags);
4177 
4178 	rt_mutex_adjust_pi(p);
4179 
4180 	return 0;
4181 }
4182 
4183 /**
4184  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4185  * @p: the task in question.
4186  * @policy: new policy.
4187  * @param: structure containing the new RT priority.
4188  *
4189  * NOTE that the task may already be dead.
4190  */
4191 int sched_setscheduler(struct task_struct *p, int policy,
4192 		       const struct sched_param *param)
4193 {
4194 	return __sched_setscheduler(p, policy, param, true);
4195 }
4196 EXPORT_SYMBOL_GPL(sched_setscheduler);
4197 
4198 /**
4199  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
4200  * @p: the task in question.
4201  * @policy: new policy.
4202  * @param: structure containing the new RT priority.
4203  *
4204  * Just like sched_setscheduler, only don't bother checking if the
4205  * current context has permission.  For example, this is needed in
4206  * stop_machine(): we create temporary high priority worker threads,
4207  * but our caller might not have that capability.
4208  */
4209 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4210 			       const struct sched_param *param)
4211 {
4212 	return __sched_setscheduler(p, policy, param, false);
4213 }
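
/*
 * A minimal in-kernel sketch of the nocheck variant above: promote a
 * freshly created worker kthread to SCHED_FIFO. The thread pointer and
 * the priority of 50 are made-up example values, not from this file.
 */
#if 0	/* usage sketch, not built */
static void example_make_worker_rt(struct task_struct *worker)
{
	struct sched_param param = { .sched_priority = 50 };

	/* No capability/ownership checks: meant for kernel-internal threads. */
	sched_setscheduler_nocheck(worker, SCHED_FIFO, &param);
}
#endif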
4214 
4215 static int
4216 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4217 {
4218 	struct sched_param lparam;
4219 	struct task_struct *p;
4220 	int retval;
4221 
4222 	if (!param || pid < 0)
4223 		return -EINVAL;
4224 	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4225 		return -EFAULT;
4226 
4227 	rcu_read_lock();
4228 	retval = -ESRCH;
4229 	p = find_process_by_pid(pid);
4230 	if (p != NULL)
4231 		retval = sched_setscheduler(p, policy, &lparam);
4232 	rcu_read_unlock();
4233 
4234 	return retval;
4235 }
4236 
4237 /**
4238  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4239  * @pid: the pid in question.
4240  * @policy: new policy.
4241  * @param: structure containing the new RT priority.
4242  */
4243 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4244 		struct sched_param __user *, param)
4245 {
4246 	/* negative values for policy are not valid */
4247 	if (policy < 0)
4248 		return -EINVAL;
4249 
4250 	return do_sched_setscheduler(pid, policy, param);
4251 }
4252 
4253 /**
4254  * sys_sched_setparam - set/change the RT priority of a thread
4255  * @pid: the pid in question.
4256  * @param: structure containing the new RT priority.
4257  */
4258 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4259 {
4260 	return do_sched_setscheduler(pid, -1, param);
4261 }
4262 
4263 /**
4264  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4265  * @pid: the pid in question.
4266  */
4267 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4268 {
4269 	struct task_struct *p;
4270 	int retval;
4271 
4272 	if (pid < 0)
4273 		return -EINVAL;
4274 
4275 	retval = -ESRCH;
4276 	rcu_read_lock();
4277 	p = find_process_by_pid(pid);
4278 	if (p) {
4279 		retval = security_task_getscheduler(p);
4280 		if (!retval)
4281 			retval = p->policy
4282 				| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4283 	}
4284 	rcu_read_unlock();
4285 	return retval;
4286 }
4287 
4288 /**
4289  * sys_sched_getparam - get the RT priority of a thread
4290  * @pid: the pid in question.
4291  * @param: structure containing the RT priority.
4292  */
4293 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4294 {
4295 	struct sched_param lp;
4296 	struct task_struct *p;
4297 	int retval;
4298 
4299 	if (!param || pid < 0)
4300 		return -EINVAL;
4301 
4302 	rcu_read_lock();
4303 	p = find_process_by_pid(pid);
4304 	retval = -ESRCH;
4305 	if (!p)
4306 		goto out_unlock;
4307 
4308 	retval = security_task_getscheduler(p);
4309 	if (retval)
4310 		goto out_unlock;
4311 
4312 	lp.sched_priority = p->rt_priority;
4313 	rcu_read_unlock();
4314 
4315 	/*
4316 	 * This one might sleep, we cannot do it with a spinlock held ...
4317 	 */
4318 	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4319 
4320 	return retval;
4321 
4322 out_unlock:
4323 	rcu_read_unlock();
4324 	return retval;
4325 }
4326 
4327 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4328 {
4329 	cpumask_var_t cpus_allowed, new_mask;
4330 	struct task_struct *p;
4331 	int retval;
4332 
4333 	get_online_cpus();
4334 	rcu_read_lock();
4335 
4336 	p = find_process_by_pid(pid);
4337 	if (!p) {
4338 		rcu_read_unlock();
4339 		put_online_cpus();
4340 		return -ESRCH;
4341 	}
4342 
4343 	/* Prevent p going away */
4344 	get_task_struct(p);
4345 	rcu_read_unlock();
4346 
4347 	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4348 		retval = -ENOMEM;
4349 		goto out_put_task;
4350 	}
4351 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4352 		retval = -ENOMEM;
4353 		goto out_free_cpus_allowed;
4354 	}
4355 	retval = -EPERM;
4356 	if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
4357 		goto out_unlock;
4358 
4359 	retval = security_task_setscheduler(p);
4360 	if (retval)
4361 		goto out_unlock;
4362 
4363 	cpuset_cpus_allowed(p, cpus_allowed);
4364 	cpumask_and(new_mask, in_mask, cpus_allowed);
4365 again:
4366 	retval = set_cpus_allowed_ptr(p, new_mask);
4367 
4368 	if (!retval) {
4369 		cpuset_cpus_allowed(p, cpus_allowed);
4370 		if (!cpumask_subset(new_mask, cpus_allowed)) {
4371 			/*
4372 			 * We must have raced with a concurrent cpuset
4373 			 * update. Just reset the cpus_allowed to the
4374 			 * cpuset's cpus_allowed
4375 			 */
4376 			cpumask_copy(new_mask, cpus_allowed);
4377 			goto again;
4378 		}
4379 	}
4380 out_unlock:
4381 	free_cpumask_var(new_mask);
4382 out_free_cpus_allowed:
4383 	free_cpumask_var(cpus_allowed);
4384 out_put_task:
4385 	put_task_struct(p);
4386 	put_online_cpus();
4387 	return retval;
4388 }
4389 
4390 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4391 			     struct cpumask *new_mask)
4392 {
4393 	if (len < cpumask_size())
4394 		cpumask_clear(new_mask);
4395 	else if (len > cpumask_size())
4396 		len = cpumask_size();
4397 
4398 	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4399 }
4400 
4401 /**
4402  * sys_sched_setaffinity - set the cpu affinity of a process
4403  * @pid: pid of the process
4404  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4405  * @user_mask_ptr: user-space pointer to the new cpu mask
4406  */
4407 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4408 		unsigned long __user *, user_mask_ptr)
4409 {
4410 	cpumask_var_t new_mask;
4411 	int retval;
4412 
4413 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4414 		return -ENOMEM;
4415 
4416 	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4417 	if (retval == 0)
4418 		retval = sched_setaffinity(pid, new_mask);
4419 	free_cpumask_var(new_mask);
4420 	return retval;
4421 }
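
/*
 * Userspace-side sketch of the syscall above (not kernel code and not
 * part of this file), using the glibc cpu_set_t helpers: pin the calling
 * thread to CPU 2. Requires _GNU_SOURCE; error handling omitted.
 */
#if 0	/* usage sketch, not built */
#include <sched.h>

static int example_pin_self_to_cpu2(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(2, &set);
	return sched_setaffinity(0, sizeof(set), &set);	/* pid 0 == self */
}
#endif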
4422 
4423 long sched_getaffinity(pid_t pid, struct cpumask *mask)
4424 {
4425 	struct task_struct *p;
4426 	unsigned long flags;
4427 	int retval;
4428 
4429 	get_online_cpus();
4430 	rcu_read_lock();
4431 
4432 	retval = -ESRCH;
4433 	p = find_process_by_pid(pid);
4434 	if (!p)
4435 		goto out_unlock;
4436 
4437 	retval = security_task_getscheduler(p);
4438 	if (retval)
4439 		goto out_unlock;
4440 
4441 	raw_spin_lock_irqsave(&p->pi_lock, flags);
4442 	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4443 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4444 
4445 out_unlock:
4446 	rcu_read_unlock();
4447 	put_online_cpus();
4448 
4449 	return retval;
4450 }
4451 
4452 /**
4453  * sys_sched_getaffinity - get the cpu affinity of a process
4454  * @pid: pid of the process
4455  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4456  * @user_mask_ptr: user-space pointer to hold the current cpu mask
4457  */
4458 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4459 		unsigned long __user *, user_mask_ptr)
4460 {
4461 	int ret;
4462 	cpumask_var_t mask;
4463 
4464 	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4465 		return -EINVAL;
4466 	if (len & (sizeof(unsigned long)-1))
4467 		return -EINVAL;
4468 
4469 	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4470 		return -ENOMEM;
4471 
4472 	ret = sched_getaffinity(pid, mask);
4473 	if (ret == 0) {
4474 		size_t retlen = min_t(size_t, len, cpumask_size());
4475 
4476 		if (copy_to_user(user_mask_ptr, mask, retlen))
4477 			ret = -EFAULT;
4478 		else
4479 			ret = retlen;
4480 	}
4481 	free_cpumask_var(mask);
4482 
4483 	return ret;
4484 }
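
/*
 * Matching userspace-side sketch for the read path (again not kernel
 * code): glibc's cpu_set_t is a whole number of unsigned longs and, on
 * common systems, covers nr_cpu_ids bits, so it satisfies both length
 * checks above. Counts the CPUs the calling thread may run on.
 */
#if 0	/* usage sketch, not built */
#include <sched.h>

static int example_count_allowed_cpus(void)
{
	cpu_set_t set;

	if (sched_getaffinity(0, sizeof(set), &set))
		return -1;
	return CPU_COUNT(&set);
}
#endif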
4485 
4486 /**
4487  * sys_sched_yield - yield the current processor to other threads.
4488  *
4489  * This function yields the current CPU to other tasks. If there are no
4490  * other threads running on this CPU then this function will return.
4491  */
4492 SYSCALL_DEFINE0(sched_yield)
4493 {
4494 	struct rq *rq = this_rq_lock();
4495 
4496 	schedstat_inc(rq, yld_count);
4497 	current->sched_class->yield_task(rq);
4498 
4499 	/*
4500 	 * Since we are going to call schedule() anyway, there's
4501 	 * no need to preempt or enable interrupts:
4502 	 */
4503 	__release(rq->lock);
4504 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4505 	do_raw_spin_unlock(&rq->lock);
4506 	sched_preempt_enable_no_resched();
4507 
4508 	schedule();
4509 
4510 	return 0;
4511 }
4512 
4513 static inline int should_resched(void)
4514 {
4515 	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4516 }
4517 
4518 static void __cond_resched(void)
4519 {
4520 	add_preempt_count(PREEMPT_ACTIVE);
4521 	__schedule();
4522 	sub_preempt_count(PREEMPT_ACTIVE);
4523 }
4524 
4525 int __sched _cond_resched(void)
4526 {
4527 	if (should_resched()) {
4528 		__cond_resched();
4529 		return 1;
4530 	}
4531 	return 0;
4532 }
4533 EXPORT_SYMBOL(_cond_resched);
4534 
4535 /*
4536  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4537  * call schedule, and on return reacquire the lock.
4538  *
4539  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4540  * operations here to prevent schedule() from being called twice (once via
4541  * spin_unlock(), once by hand).
4542  */
4543 int __cond_resched_lock(spinlock_t *lock)
4544 {
4545 	int resched = should_resched();
4546 	int ret = 0;
4547 
4548 	lockdep_assert_held(lock);
4549 
4550 	if (spin_needbreak(lock) || resched) {
4551 		spin_unlock(lock);
4552 		if (resched)
4553 			__cond_resched();
4554 		else
4555 			cpu_relax();
4556 		ret = 1;
4557 		spin_lock(lock);
4558 	}
4559 	return ret;
4560 }
4561 EXPORT_SYMBOL(__cond_resched_lock);
4562 
4563 int __sched __cond_resched_softirq(void)
4564 {
4565 	BUG_ON(!in_softirq());
4566 
4567 	if (should_resched()) {
4568 		local_bh_enable();
4569 		__cond_resched();
4570 		local_bh_disable();
4571 		return 1;
4572 	}
4573 	return 0;
4574 }
4575 EXPORT_SYMBOL(__cond_resched_softirq);
4576 
4577 /**
4578  * yield - yield the current processor to other threads.
4579  *
4580  * Do not ever use this function; there's a 99% chance you're doing it wrong.
4581  *
4582  * The scheduler is at all times free to pick the calling task as the most
4583  * eligible task to run. If removing the yield() call from your code breaks
4584  * it, it's already broken.
4585  *
4586  * Typical broken usage is:
4587  *
4588  * while (!event)
4589  * 	yield();
4590  *
4591  * where one assumes that yield() will let 'the other' process run and
4592  * make event true. If the current task is a SCHED_FIFO task that will never
4593  * happen. Never use yield() as a progress guarantee!!
4594  *
4595  * If you want to use yield() to wait for something, use wait_event().
4596  * If you want to use yield() to be 'nice' for others, use cond_resched().
4597  * If you still want to use yield(), do not!
4598  */
4599 void __sched yield(void)
4600 {
4601 	set_current_state(TASK_RUNNING);
4602 	sys_sched_yield();
4603 }
4604 EXPORT_SYMBOL(yield);
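
/*
 * Sketch of the correct alternative to the broken yield() loop shown in
 * the comment above: sleep on a wait queue until the condition is true.
 * The wait queue and flag here are made-up examples, not part of this file.
 */
#if 0	/* usage sketch, not built */
static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static int example_event;

static void example_wait_for_event(void)
{
	/* Sleeps instead of spinning; woken by wake_up(&example_wq). */
	wait_event(example_wq, example_event != 0);
}
#endif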
4605 
4606 /**
4607  * yield_to - yield the current processor to another thread in
4608  * your thread group, or accelerate that thread toward the
4609  * processor it's on.
4610  * @p: target task
4611  * @preempt: whether task preemption is allowed or not
4612  *
4613  * It's the caller's job to ensure that the target task struct
4614  * can't go away on us before we can do any checks.
4615  *
4616  * Returns true if we indeed boosted the target task.
4617  */
4618 bool __sched yield_to(struct task_struct *p, bool preempt)
4619 {
4620 	struct task_struct *curr = current;
4621 	struct rq *rq, *p_rq;
4622 	unsigned long flags;
4623 	bool yielded = false;
4624 
4625 	local_irq_save(flags);
4626 	rq = this_rq();
4627 
4628 again:
4629 	p_rq = task_rq(p);
4630 	double_rq_lock(rq, p_rq);
4631 	while (task_rq(p) != p_rq) {
4632 		double_rq_unlock(rq, p_rq);
4633 		goto again;
4634 	}
4635 
4636 	if (!curr->sched_class->yield_to_task)
4637 		goto out;
4638 
4639 	if (curr->sched_class != p->sched_class)
4640 		goto out;
4641 
4642 	if (task_running(p_rq, p) || p->state)
4643 		goto out;
4644 
4645 	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4646 	if (yielded) {
4647 		schedstat_inc(rq, yld_count);
4648 		/*
4649 		 * Make p's CPU reschedule; pick_next_entity takes care of
4650 		 * fairness.
4651 		 */
4652 		if (preempt && rq != p_rq)
4653 			resched_task(p_rq->curr);
4654 	} else {
4655 		/*
4656 		 * We might have set it in task_yield_fair(), but are
4657 		 * not going to schedule(), so don't want to skip
4658 		 * the next update.
4659 		 */
4660 		rq->skip_clock_update = 0;
4661 	}
4662 
4663 out:
4664 	double_rq_unlock(rq, p_rq);
4665 	local_irq_restore(flags);
4666 
4667 	if (yielded)
4668 		schedule();
4669 
4670 	return yielded;
4671 }
4672 EXPORT_SYMBOL_GPL(yield_to);
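
/*
 * Sketch of the reference requirement documented above: the caller pins
 * the target with get_task_struct() so it cannot be freed across the
 * call. Purely illustrative; the helper name is made up.
 */
#if 0	/* usage sketch, not built */
static bool example_yield_to(struct task_struct *target)
{
	bool yielded;

	get_task_struct(target);
	yielded = yield_to(target, false);
	put_task_struct(target);

	return yielded;
}
#endif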
4673 
4674 /*
4675  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4676  * that process accounting knows that this is a task in IO wait state.
4677  */
4678 void __sched io_schedule(void)
4679 {
4680 	struct rq *rq = raw_rq();
4681 
4682 	delayacct_blkio_start();
4683 	atomic_inc(&rq->nr_iowait);
4684 	blk_flush_plug(current);
4685 	current->in_iowait = 1;
4686 	schedule();
4687 	current->in_iowait = 0;
4688 	atomic_dec(&rq->nr_iowait);
4689 	delayacct_blkio_end();
4690 }
4691 EXPORT_SYMBOL(io_schedule);
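
/*
 * A hedged sketch of how io_schedule() is typically used: a wait loop
 * that sleeps uninterruptibly while accounting the time as I/O wait.
 * The wait queue and 'done' flag are hypothetical, not from this file.
 */
#if 0	/* usage sketch, not built */
static void example_wait_for_io(wait_queue_head_t *wq, int *done)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		if (*done)
			break;
		io_schedule();
	}
	finish_wait(wq, &wait);
}
#endif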
4692 
4693 long __sched io_schedule_timeout(long timeout)
4694 {
4695 	struct rq *rq = raw_rq();
4696 	long ret;
4697 
4698 	delayacct_blkio_start();
4699 	atomic_inc(&rq->nr_iowait);
4700 	blk_flush_plug(current);
4701 	current->in_iowait = 1;
4702 	ret = schedule_timeout(timeout);
4703 	current->in_iowait = 0;
4704 	atomic_dec(&rq->nr_iowait);
4705 	delayacct_blkio_end();
4706 	return ret;
4707 }
4708 
4709 /**
4710  * sys_sched_get_priority_max - return maximum RT priority.
4711  * @policy: scheduling class.
4712  *
4713  * this syscall returns the maximum rt_priority that can be used
4714  * by a given scheduling class.
4715  */
4716 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4717 {
4718 	int ret = -EINVAL;
4719 
4720 	switch (policy) {
4721 	case SCHED_FIFO:
4722 	case SCHED_RR:
4723 		ret = MAX_USER_RT_PRIO-1;
4724 		break;
4725 	case SCHED_NORMAL:
4726 	case SCHED_BATCH:
4727 	case SCHED_IDLE:
4728 		ret = 0;
4729 		break;
4730 	}
4731 	return ret;
4732 }
4733 
4734 /**
4735  * sys_sched_get_priority_min - return minimum RT priority.
4736  * @policy: scheduling class.
4737  *
4738  * this syscall returns the minimum rt_priority that can be used
4739  * by a given scheduling class.
4740  */
4741 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4742 {
4743 	int ret = -EINVAL;
4744 
4745 	switch (policy) {
4746 	case SCHED_FIFO:
4747 	case SCHED_RR:
4748 		ret = 1;
4749 		break;
4750 	case SCHED_NORMAL:
4751 	case SCHED_BATCH:
4752 	case SCHED_IDLE:
4753 		ret = 0;
4754 	}
4755 	return ret;
4756 }
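
/*
 * Userspace-side sketch (not kernel code): querying the bounds returned
 * by the two syscalls above. With MAX_USER_RT_PRIO == 100 this prints
 * 1..99 for SCHED_FIFO and 0..0 for SCHED_OTHER (SCHED_NORMAL).
 */
#if 0	/* usage sketch, not built */
#include <sched.h>
#include <stdio.h>

static void example_print_priority_ranges(void)
{
	printf("SCHED_FIFO:  %d..%d\n",
	       sched_get_priority_min(SCHED_FIFO),
	       sched_get_priority_max(SCHED_FIFO));
	printf("SCHED_OTHER: %d..%d\n",
	       sched_get_priority_min(SCHED_OTHER),
	       sched_get_priority_max(SCHED_OTHER));
}
#endif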
4757 
4758 /**
4759  * sys_sched_rr_get_interval - return the default timeslice of a process.
4760  * @pid: pid of the process.
4761  * @interval: userspace pointer to the timeslice value.
4762  *
4763  * this syscall writes the default timeslice value of a given process
4764  * into the user-space timespec buffer. A value of '0' means infinity.
4765  */
4766 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4767 		struct timespec __user *, interval)
4768 {
4769 	struct task_struct *p;
4770 	unsigned int time_slice;
4771 	unsigned long flags;
4772 	struct rq *rq;
4773 	int retval;
4774 	struct timespec t;
4775 
4776 	if (pid < 0)
4777 		return -EINVAL;
4778 
4779 	retval = -ESRCH;
4780 	rcu_read_lock();
4781 	p = find_process_by_pid(pid);
4782 	if (!p)
4783 		goto out_unlock;
4784 
4785 	retval = security_task_getscheduler(p);
4786 	if (retval)
4787 		goto out_unlock;
4788 
4789 	rq = task_rq_lock(p, &flags);
4790 	time_slice = p->sched_class->get_rr_interval(rq, p);
4791 	task_rq_unlock(rq, p, &flags);
4792 
4793 	rcu_read_unlock();
4794 	jiffies_to_timespec(time_slice, &t);
4795 	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4796 	return retval;
4797 
4798 out_unlock:
4799 	rcu_read_unlock();
4800 	return retval;
4801 }
4802 
4803 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4804 
4805 void sched_show_task(struct task_struct *p)
4806 {
4807 	unsigned long free = 0;
4808 	unsigned state;
4809 
4810 	state = p->state ? __ffs(p->state) + 1 : 0;
4811 	printk(KERN_INFO "%-15.15s %c", p->comm,
4812 		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4813 #if BITS_PER_LONG == 32
4814 	if (state == TASK_RUNNING)
4815 		printk(KERN_CONT " running  ");
4816 	else
4817 		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4818 #else
4819 	if (state == TASK_RUNNING)
4820 		printk(KERN_CONT "  running task    ");
4821 	else
4822 		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4823 #endif
4824 #ifdef CONFIG_DEBUG_STACK_USAGE
4825 	free = stack_not_used(p);
4826 #endif
4827 	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4828 		task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
4829 		(unsigned long)task_thread_info(p)->flags);
4830 
4831 	show_stack(p, NULL);
4832 }
4833 
4834 void show_state_filter(unsigned long state_filter)
4835 {
4836 	struct task_struct *g, *p;
4837 
4838 #if BITS_PER_LONG == 32
4839 	printk(KERN_INFO
4840 		"  task                PC stack   pid father\n");
4841 #else
4842 	printk(KERN_INFO
4843 		"  task                        PC stack   pid father\n");
4844 #endif
4845 	rcu_read_lock();
4846 	do_each_thread(g, p) {
4847 		/*
4848 		 * reset the NMI-timeout, listing all tasks on a slow
4849 		 * console might take a lot of time:
4850 		 */
4851 		touch_nmi_watchdog();
4852 		if (!state_filter || (p->state & state_filter))
4853 			sched_show_task(p);
4854 	} while_each_thread(g, p);
4855 
4856 	touch_all_softlockup_watchdogs();
4857 
4858 #ifdef CONFIG_SCHED_DEBUG
4859 	sysrq_sched_debug_show();
4860 #endif
4861 	rcu_read_unlock();
4862 	/*
4863 	 * Only show locks if all tasks are dumped:
4864 	 */
4865 	if (!state_filter)
4866 		debug_show_all_locks();
4867 }
4868 
4869 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4870 {
4871 	idle->sched_class = &idle_sched_class;
4872 }
4873 
4874 /**
4875  * init_idle - set up an idle thread for a given CPU
4876  * @idle: task in question
4877  * @cpu: cpu the idle task belongs to
4878  *
4879  * NOTE: this function does not set the idle thread's NEED_RESCHED
4880  * flag, to make booting more robust.
4881  */
4882 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4883 {
4884 	struct rq *rq = cpu_rq(cpu);
4885 	unsigned long flags;
4886 
4887 	raw_spin_lock_irqsave(&rq->lock, flags);
4888 
4889 	__sched_fork(idle);
4890 	idle->state = TASK_RUNNING;
4891 	idle->se.exec_start = sched_clock();
4892 
4893 	do_set_cpus_allowed(idle, cpumask_of(cpu));
4894 	/*
4895 	 * We have a chicken-and-egg problem: even though we are
4896 	 * holding rq->lock, the task's cpu isn't yet set to this cpu, so
4897 	 * the lockdep check in task_group() will fail.
4898 	 *
4899 	 * This is a similar case to sched_fork(). Alternatively we could
4900 	 * use task_rq_lock() here and obtain the other rq->lock.
4901 	 *
4902 	 * Silence PROVE_RCU
4903 	 */
4904 	rcu_read_lock();
4905 	__set_task_cpu(idle, cpu);
4906 	rcu_read_unlock();
4907 
4908 	rq->curr = rq->idle = idle;
4909 #if defined(CONFIG_SMP)
4910 	idle->on_cpu = 1;
4911 #endif
4912 	raw_spin_unlock_irqrestore(&rq->lock, flags);
4913 
4914 	/* Set the preempt count _outside_ the spinlocks! */
4915 	task_thread_info(idle)->preempt_count = 0;
4916 
4917 	/*
4918 	 * The idle tasks have their own, simple scheduling class:
4919 	 */
4920 	idle->sched_class = &idle_sched_class;
4921 	ftrace_graph_init_idle_task(idle, cpu);
4922 #if defined(CONFIG_SMP)
4923 	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4924 #endif
4925 }
4926 
4927 #ifdef CONFIG_SMP
4928 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4929 {
4930 	if (p->sched_class && p->sched_class->set_cpus_allowed)
4931 		p->sched_class->set_cpus_allowed(p, new_mask);
4932 
4933 	cpumask_copy(&p->cpus_allowed, new_mask);
4934 	p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
4935 }
4936 
4937 /*
4938  * This is how migration works:
4939  *
4940  * 1) we invoke migration_cpu_stop() on the target CPU using
4941  *    stop_one_cpu().
4942  * 2) stopper starts to run (implicitly forcing the migrated thread
4943  *    off the CPU)
4944  * 3) it checks whether the migrated task is still in the wrong runqueue.
4945  * 4) if it's in the wrong runqueue then the migration thread removes
4946  *    it and puts it into the right queue.
4947  * 5) stopper completes and stop_one_cpu() returns and the migration
4948  *    is done.
4949  */
4950 
4951 /*
4952  * Change a given task's CPU affinity. Migrate the thread to a
4953  * proper CPU and schedule it away if the CPU it's executing on
4954  * is removed from the allowed bitmask.
4955  *
4956  * NOTE: the caller must have a valid reference to the task, the
4957  * task must not exit() & deallocate itself prematurely. The
4958  * call is not atomic; no spinlocks may be held.
4959  */
4960 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4961 {
4962 	unsigned long flags;
4963 	struct rq *rq;
4964 	unsigned int dest_cpu;
4965 	int ret = 0;
4966 
4967 	rq = task_rq_lock(p, &flags);
4968 
4969 	if (cpumask_equal(&p->cpus_allowed, new_mask))
4970 		goto out;
4971 
4972 	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4973 		ret = -EINVAL;
4974 		goto out;
4975 	}
4976 
4977 	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
4978 		ret = -EINVAL;
4979 		goto out;
4980 	}
4981 
4982 	do_set_cpus_allowed(p, new_mask);
4983 
4984 	/* Can the task run on the task's current CPU? If so, we're done */
4985 	if (cpumask_test_cpu(task_cpu(p), new_mask))
4986 		goto out;
4987 
4988 	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4989 	if (p->on_rq) {
4990 		struct migration_arg arg = { p, dest_cpu };
4991 		/* Need help from migration thread: drop lock and wait. */
4992 		task_rq_unlock(rq, p, &flags);
4993 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4994 		tlb_migrate_finish(p->mm);
4995 		return 0;
4996 	}
4997 out:
4998 	task_rq_unlock(rq, p, &flags);
4999 
5000 	return ret;
5001 }
5002 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
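
/*
 * Minimal in-kernel sketch of the affinity API above: restrict a task
 * (for example a kthread the caller created) to a single CPU. Purely
 * illustrative; the helper is not part of this file.
 */
#if 0	/* usage sketch, not built */
static int example_bind_to_cpu(struct task_struct *tsk, int cpu)
{
	return set_cpus_allowed_ptr(tsk, cpumask_of(cpu));
}
#endif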
5003 
5004 /*
5005  * Move a (non-current) task off this cpu, onto the dest cpu. We're doing
5006  * this because either it can't run here any more (set_cpus_allowed()
5007  * moved it away from this CPU, or the CPU is going down), or because we're
5008  * attempting to rebalance this task on exec (sched_exec).
5009  *
5010  * So we race with normal scheduler movements, but that's OK, as long
5011  * as the task is no longer on this CPU.
5012  *
5013  * Returns non-zero if task was successfully migrated.
5014  */
5015 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5016 {
5017 	struct rq *rq_dest, *rq_src;
5018 	int ret = 0;
5019 
5020 	if (unlikely(!cpu_active(dest_cpu)))
5021 		return ret;
5022 
5023 	rq_src = cpu_rq(src_cpu);
5024 	rq_dest = cpu_rq(dest_cpu);
5025 
5026 	raw_spin_lock(&p->pi_lock);
5027 	double_rq_lock(rq_src, rq_dest);
5028 	/* Already moved. */
5029 	if (task_cpu(p) != src_cpu)
5030 		goto done;
5031 	/* Affinity changed (again). */
5032 	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
5033 		goto fail;
5034 
5035 	/*
5036 	 * If we're not on a rq, the next wake-up will ensure we're
5037 	 * placed properly.
5038 	 */
5039 	if (p->on_rq) {
5040 		dequeue_task(rq_src, p, 0);
5041 		set_task_cpu(p, dest_cpu);
5042 		enqueue_task(rq_dest, p, 0);
5043 		check_preempt_curr(rq_dest, p, 0);
5044 	}
5045 done:
5046 	ret = 1;
5047 fail:
5048 	double_rq_unlock(rq_src, rq_dest);
5049 	raw_spin_unlock(&p->pi_lock);
5050 	return ret;
5051 }
5052 
5053 /*
5054  * migration_cpu_stop - this will be executed by a highprio stopper thread
5055  * and performs thread migration by bumping the thread off its CPU and then
5056  * 'pushing' it onto another runqueue.
5057  */
5058 static int migration_cpu_stop(void *data)
5059 {
5060 	struct migration_arg *arg = data;
5061 
5062 	/*
5063 	 * The original target cpu might have gone down and we might
5064 	 * be on another cpu but it doesn't matter.
5065 	 */
5066 	local_irq_disable();
5067 	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5068 	local_irq_enable();
5069 	return 0;
5070 }
5071 
5072 #ifdef CONFIG_HOTPLUG_CPU
5073 
5074 /*
5075  * Ensures that the idle task is using init_mm right before its cpu goes
5076  * offline.
5077  */
5078 void idle_task_exit(void)
5079 {
5080 	struct mm_struct *mm = current->active_mm;
5081 
5082 	BUG_ON(cpu_online(smp_processor_id()));
5083 
5084 	if (mm != &init_mm)
5085 		switch_mm(mm, &init_mm, current);
5086 	mmdrop(mm);
5087 }
5088 
5089 /*
5090  * While a dead CPU has no uninterruptible tasks queued at this point,
5091  * it might still have a nonzero ->nr_uninterruptible counter, because
5092  * for performance reasons the counter is not strictly tracking tasks to
5093  * their home CPUs. So we just add the counter to another CPU's counter,
5094  * to keep the global sum constant after CPU-down:
5095  */
5096 static void migrate_nr_uninterruptible(struct rq *rq_src)
5097 {
5098 	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5099 
5100 	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5101 	rq_src->nr_uninterruptible = 0;
5102 }
5103 
5104 /*
5105  * remove the tasks which were accounted by rq from calc_load_tasks.
5106  */
5107 static void calc_global_load_remove(struct rq *rq)
5108 {
5109 	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5110 	rq->calc_load_active = 0;
5111 }
5112 
5113 /*
5114  * Migrate all tasks from the rq, sleeping tasks will be migrated by
5115  * try_to_wake_up()->select_task_rq().
5116  *
5117  * Called with rq->lock held even though we're in stop_machine() and
5118  * there's no concurrency possible, we hold the required locks anyway
5119  * because of lock validation efforts.
5120  */
5121 static void migrate_tasks(unsigned int dead_cpu)
5122 {
5123 	struct rq *rq = cpu_rq(dead_cpu);
5124 	struct task_struct *next, *stop = rq->stop;
5125 	int dest_cpu;
5126 
5127 	/*
5128 	 * Fudge the rq selection such that the below task selection loop
5129 	 * doesn't get stuck on the currently eligible stop task.
5130 	 *
5131 	 * We're currently inside stop_machine() and the rq is either stuck
5132 	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5133 	 * either way we should never end up calling schedule() until we're
5134 	 * done here.
5135 	 */
5136 	rq->stop = NULL;
5137 
5138 	/* Ensure any throttled groups are reachable by pick_next_task */
5139 	unthrottle_offline_cfs_rqs(rq);
5140 
5141 	for ( ; ; ) {
5142 		/*
5143 		 * There's this thread running, bail when that's the only
5144 		 * remaining thread.
5145 		 */
5146 		if (rq->nr_running == 1)
5147 			break;
5148 
5149 		next = pick_next_task(rq);
5150 		BUG_ON(!next);
5151 		next->sched_class->put_prev_task(rq, next);
5152 
5153 		/* Find suitable destination for @next, with force if needed. */
5154 		dest_cpu = select_fallback_rq(dead_cpu, next);
5155 		raw_spin_unlock(&rq->lock);
5156 
5157 		__migrate_task(next, dead_cpu, dest_cpu);
5158 
5159 		raw_spin_lock(&rq->lock);
5160 	}
5161 
5162 	rq->stop = stop;
5163 }
5164 
5165 #endif /* CONFIG_HOTPLUG_CPU */
5166 
5167 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5168 
5169 static struct ctl_table sd_ctl_dir[] = {
5170 	{
5171 		.procname	= "sched_domain",
5172 		.mode		= 0555,
5173 	},
5174 	{}
5175 };
5176 
5177 static struct ctl_table sd_ctl_root[] = {
5178 	{
5179 		.procname	= "kernel",
5180 		.mode		= 0555,
5181 		.child		= sd_ctl_dir,
5182 	},
5183 	{}
5184 };
5185 
5186 static struct ctl_table *sd_alloc_ctl_entry(int n)
5187 {
5188 	struct ctl_table *entry =
5189 		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5190 
5191 	return entry;
5192 }
5193 
5194 static void sd_free_ctl_entry(struct ctl_table **tablep)
5195 {
5196 	struct ctl_table *entry;
5197 
5198 	/*
5199 	 * In the intermediate directories, both the child directory and
5200 	 * procname are dynamically allocated and could fail but the mode
5201 	 * will always be set. In the lowest directory the names are
5202 	 * static strings and all have proc handlers.
5203 	 */
5204 	for (entry = *tablep; entry->mode; entry++) {
5205 		if (entry->child)
5206 			sd_free_ctl_entry(&entry->child);
5207 		if (entry->proc_handler == NULL)
5208 			kfree(entry->procname);
5209 	}
5210 
5211 	kfree(*tablep);
5212 	*tablep = NULL;
5213 }
5214 
5215 static void
5216 set_table_entry(struct ctl_table *entry,
5217 		const char *procname, void *data, int maxlen,
5218 		umode_t mode, proc_handler *proc_handler)
5219 {
5220 	entry->procname = procname;
5221 	entry->data = data;
5222 	entry->maxlen = maxlen;
5223 	entry->mode = mode;
5224 	entry->proc_handler = proc_handler;
5225 }
5226 
5227 static struct ctl_table *
5228 sd_alloc_ctl_domain_table(struct sched_domain *sd)
5229 {
5230 	struct ctl_table *table = sd_alloc_ctl_entry(13);
5231 
5232 	if (table == NULL)
5233 		return NULL;
5234 
5235 	set_table_entry(&table[0], "min_interval", &sd->min_interval,
5236 		sizeof(long), 0644, proc_doulongvec_minmax);
5237 	set_table_entry(&table[1], "max_interval", &sd->max_interval,
5238 		sizeof(long), 0644, proc_doulongvec_minmax);
5239 	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5240 		sizeof(int), 0644, proc_dointvec_minmax);
5241 	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5242 		sizeof(int), 0644, proc_dointvec_minmax);
5243 	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5244 		sizeof(int), 0644, proc_dointvec_minmax);
5245 	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5246 		sizeof(int), 0644, proc_dointvec_minmax);
5247 	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5248 		sizeof(int), 0644, proc_dointvec_minmax);
5249 	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5250 		sizeof(int), 0644, proc_dointvec_minmax);
5251 	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5252 		sizeof(int), 0644, proc_dointvec_minmax);
5253 	set_table_entry(&table[9], "cache_nice_tries",
5254 		&sd->cache_nice_tries,
5255 		sizeof(int), 0644, proc_dointvec_minmax);
5256 	set_table_entry(&table[10], "flags", &sd->flags,
5257 		sizeof(int), 0644, proc_dointvec_minmax);
5258 	set_table_entry(&table[11], "name", sd->name,
5259 		CORENAME_MAX_SIZE, 0444, proc_dostring);
5260 	/* &table[12] is terminator */
5261 
5262 	return table;
5263 }
5264 
5265 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5266 {
5267 	struct ctl_table *entry, *table;
5268 	struct sched_domain *sd;
5269 	int domain_num = 0, i;
5270 	char buf[32];
5271 
5272 	for_each_domain(cpu, sd)
5273 		domain_num++;
5274 	entry = table = sd_alloc_ctl_entry(domain_num + 1);
5275 	if (table == NULL)
5276 		return NULL;
5277 
5278 	i = 0;
5279 	for_each_domain(cpu, sd) {
5280 		snprintf(buf, 32, "domain%d", i);
5281 		entry->procname = kstrdup(buf, GFP_KERNEL);
5282 		entry->mode = 0555;
5283 		entry->child = sd_alloc_ctl_domain_table(sd);
5284 		entry++;
5285 		i++;
5286 	}
5287 	return table;
5288 }
5289 
5290 static struct ctl_table_header *sd_sysctl_header;
5291 static void register_sched_domain_sysctl(void)
5292 {
5293 	int i, cpu_num = num_possible_cpus();
5294 	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5295 	char buf[32];
5296 
5297 	WARN_ON(sd_ctl_dir[0].child);
5298 	sd_ctl_dir[0].child = entry;
5299 
5300 	if (entry == NULL)
5301 		return;
5302 
5303 	for_each_possible_cpu(i) {
5304 		snprintf(buf, 32, "cpu%d", i);
5305 		entry->procname = kstrdup(buf, GFP_KERNEL);
5306 		entry->mode = 0555;
5307 		entry->child = sd_alloc_ctl_cpu_table(i);
5308 		entry++;
5309 	}
5310 
5311 	WARN_ON(sd_sysctl_header);
5312 	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5313 }
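
/*
 * The resulting tree (assuming the usual sysctl mount point) looks
 * roughly like:
 *
 *   /proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
 *   /proc/sys/kernel/sched_domain/cpu0/domain0/max_interval
 *   /proc/sys/kernel/sched_domain/cpu0/domain0/...   (see table above)
 *   /proc/sys/kernel/sched_domain/cpu0/domain1/...
 *   /proc/sys/kernel/sched_domain/cpu1/...
 *
 * with one cpuN directory per possible CPU and one domainN directory per
 * sched_domain level of that CPU.
 */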
5314 
5315 /* may be called multiple times per register */
5316 static void unregister_sched_domain_sysctl(void)
5317 {
5318 	if (sd_sysctl_header)
5319 		unregister_sysctl_table(sd_sysctl_header);
5320 	sd_sysctl_header = NULL;
5321 	if (sd_ctl_dir[0].child)
5322 		sd_free_ctl_entry(&sd_ctl_dir[0].child);
5323 }
5324 #else
5325 static void register_sched_domain_sysctl(void)
5326 {
5327 }
5328 static void unregister_sched_domain_sysctl(void)
5329 {
5330 }
5331 #endif
5332 
5333 static void set_rq_online(struct rq *rq)
5334 {
5335 	if (!rq->online) {
5336 		const struct sched_class *class;
5337 
5338 		cpumask_set_cpu(rq->cpu, rq->rd->online);
5339 		rq->online = 1;
5340 
5341 		for_each_class(class) {
5342 			if (class->rq_online)
5343 				class->rq_online(rq);
5344 		}
5345 	}
5346 }
5347 
5348 static void set_rq_offline(struct rq *rq)
5349 {
5350 	if (rq->online) {
5351 		const struct sched_class *class;
5352 
5353 		for_each_class(class) {
5354 			if (class->rq_offline)
5355 				class->rq_offline(rq);
5356 		}
5357 
5358 		cpumask_clear_cpu(rq->cpu, rq->rd->online);
5359 		rq->online = 0;
5360 	}
5361 }
5362 
5363 /*
5364  * migration_call - callback that gets triggered when a CPU is added.
5365  * Here we can start up the necessary migration thread for the new CPU.
5366  */
5367 static int __cpuinit
5368 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5369 {
5370 	int cpu = (long)hcpu;
5371 	unsigned long flags;
5372 	struct rq *rq = cpu_rq(cpu);
5373 
5374 	switch (action & ~CPU_TASKS_FROZEN) {
5375 
5376 	case CPU_UP_PREPARE:
5377 		rq->calc_load_update = calc_load_update;
5378 		break;
5379 
5380 	case CPU_ONLINE:
5381 		/* Update our root-domain */
5382 		raw_spin_lock_irqsave(&rq->lock, flags);
5383 		if (rq->rd) {
5384 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5385 
5386 			set_rq_online(rq);
5387 		}
5388 		raw_spin_unlock_irqrestore(&rq->lock, flags);
5389 		break;
5390 
5391 #ifdef CONFIG_HOTPLUG_CPU
5392 	case CPU_DYING:
5393 		sched_ttwu_pending();
5394 		/* Update our root-domain */
5395 		raw_spin_lock_irqsave(&rq->lock, flags);
5396 		if (rq->rd) {
5397 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5398 			set_rq_offline(rq);
5399 		}
5400 		migrate_tasks(cpu);
5401 		BUG_ON(rq->nr_running != 1); /* the migration thread */
5402 		raw_spin_unlock_irqrestore(&rq->lock, flags);
5403 
5404 		migrate_nr_uninterruptible(rq);
5405 		calc_global_load_remove(rq);
5406 		break;
5407 #endif
5408 	}
5409 
5410 	update_max_interval();
5411 
5412 	return NOTIFY_OK;
5413 }
5414 
5415 /*
5416  * Register at high priority so that task migration (migrate_all_tasks)
5417  * happens before everything else.  This has to be lower priority than
5418  * the notifier in the perf_event subsystem, though.
5419  */
5420 static struct notifier_block __cpuinitdata migration_notifier = {
5421 	.notifier_call = migration_call,
5422 	.priority = CPU_PRI_MIGRATION,
5423 };
5424 
5425 static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5426 				      unsigned long action, void *hcpu)
5427 {
5428 	switch (action & ~CPU_TASKS_FROZEN) {
5429 	case CPU_STARTING:
5430 	case CPU_DOWN_FAILED:
5431 		set_cpu_active((long)hcpu, true);
5432 		return NOTIFY_OK;
5433 	default:
5434 		return NOTIFY_DONE;
5435 	}
5436 }
5437 
5438 static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
5439 					unsigned long action, void *hcpu)
5440 {
5441 	switch (action & ~CPU_TASKS_FROZEN) {
5442 	case CPU_DOWN_PREPARE:
5443 		set_cpu_active((long)hcpu, false);
5444 		return NOTIFY_OK;
5445 	default:
5446 		return NOTIFY_DONE;
5447 	}
5448 }
5449 
5450 static int __init migration_init(void)
5451 {
5452 	void *cpu = (void *)(long)smp_processor_id();
5453 	int err;
5454 
5455 	/* Initialize migration for the boot CPU */
5456 	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5457 	BUG_ON(err == NOTIFY_BAD);
5458 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
5459 	register_cpu_notifier(&migration_notifier);
5460 
5461 	/* Register cpu active notifiers */
5462 	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5463 	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5464 
5465 	return 0;
5466 }
5467 early_initcall(migration_init);
5468 #endif
5469 
5470 #ifdef CONFIG_SMP
5471 
5472 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5473 
5474 #ifdef CONFIG_SCHED_DEBUG
5475 
5476 static __read_mostly int sched_domain_debug_enabled;
5477 
5478 static int __init sched_domain_debug_setup(char *str)
5479 {
5480 	sched_domain_debug_enabled = 1;
5481 
5482 	return 0;
5483 }
5484 early_param("sched_debug", sched_domain_debug_setup);
5485 
5486 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5487 				  struct cpumask *groupmask)
5488 {
5489 	struct sched_group *group = sd->groups;
5490 	char str[256];
5491 
5492 	cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5493 	cpumask_clear(groupmask);
5494 
5495 	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5496 
5497 	if (!(sd->flags & SD_LOAD_BALANCE)) {
5498 		printk("does not load-balance\n");
5499 		if (sd->parent)
5500 			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5501 					" has parent");
5502 		return -1;
5503 	}
5504 
5505 	printk(KERN_CONT "span %s level %s\n", str, sd->name);
5506 
5507 	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5508 		printk(KERN_ERR "ERROR: domain->span does not contain "
5509 				"CPU%d\n", cpu);
5510 	}
5511 	if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5512 		printk(KERN_ERR "ERROR: domain->groups does not contain"
5513 				" CPU%d\n", cpu);
5514 	}
5515 
5516 	printk(KERN_DEBUG "%*s groups:", level + 1, "");
5517 	do {
5518 		if (!group) {
5519 			printk("\n");
5520 			printk(KERN_ERR "ERROR: group is NULL\n");
5521 			break;
5522 		}
5523 
5524 		if (!group->sgp->power) {
5525 			printk(KERN_CONT "\n");
5526 			printk(KERN_ERR "ERROR: domain->cpu_power not "
5527 					"set\n");
5528 			break;
5529 		}
5530 
5531 		if (!cpumask_weight(sched_group_cpus(group))) {
5532 			printk(KERN_CONT "\n");
5533 			printk(KERN_ERR "ERROR: empty group\n");
5534 			break;
5535 		}
5536 
5537 		if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
5538 			printk(KERN_CONT "\n");
5539 			printk(KERN_ERR "ERROR: repeated CPUs\n");
5540 			break;
5541 		}
5542 
5543 		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5544 
5545 		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5546 
5547 		printk(KERN_CONT " %s", str);
5548 		if (group->sgp->power != SCHED_POWER_SCALE) {
5549 			printk(KERN_CONT " (cpu_power = %d)",
5550 				group->sgp->power);
5551 		}
5552 
5553 		group = group->next;
5554 	} while (group != sd->groups);
5555 	printk(KERN_CONT "\n");
5556 
5557 	if (!cpumask_equal(sched_domain_span(sd), groupmask))
5558 		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5559 
5560 	if (sd->parent &&
5561 	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5562 		printk(KERN_ERR "ERROR: parent span is not a superset "
5563 			"of domain->span\n");
5564 	return 0;
5565 }
5566 
5567 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5568 {
5569 	int level = 0;
5570 
5571 	if (!sched_domain_debug_enabled)
5572 		return;
5573 
5574 	if (!sd) {
5575 		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5576 		return;
5577 	}
5578 
5579 	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5580 
5581 	for (;;) {
5582 		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5583 			break;
5584 		level++;
5585 		sd = sd->parent;
5586 		if (!sd)
5587 			break;
5588 	}
5589 }
5590 #else /* !CONFIG_SCHED_DEBUG */
5591 # define sched_domain_debug(sd, cpu) do { } while (0)
5592 #endif /* CONFIG_SCHED_DEBUG */
5593 
5594 static int sd_degenerate(struct sched_domain *sd)
5595 {
5596 	if (cpumask_weight(sched_domain_span(sd)) == 1)
5597 		return 1;
5598 
5599 	/* Following flags need at least 2 groups */
5600 	if (sd->flags & (SD_LOAD_BALANCE |
5601 			 SD_BALANCE_NEWIDLE |
5602 			 SD_BALANCE_FORK |
5603 			 SD_BALANCE_EXEC |
5604 			 SD_SHARE_CPUPOWER |
5605 			 SD_SHARE_PKG_RESOURCES)) {
5606 		if (sd->groups != sd->groups->next)
5607 			return 0;
5608 	}
5609 
5610 	/* Following flags don't use groups */
5611 	if (sd->flags & (SD_WAKE_AFFINE))
5612 		return 0;
5613 
5614 	return 1;
5615 }
5616 
5617 static int
5618 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5619 {
5620 	unsigned long cflags = sd->flags, pflags = parent->flags;
5621 
5622 	if (sd_degenerate(parent))
5623 		return 1;
5624 
5625 	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5626 		return 0;
5627 
5628 	/* Flags needing groups don't count if only 1 group in parent */
5629 	if (parent->groups == parent->groups->next) {
5630 		pflags &= ~(SD_LOAD_BALANCE |
5631 				SD_BALANCE_NEWIDLE |
5632 				SD_BALANCE_FORK |
5633 				SD_BALANCE_EXEC |
5634 				SD_SHARE_CPUPOWER |
5635 				SD_SHARE_PKG_RESOURCES);
5636 		if (nr_node_ids == 1)
5637 			pflags &= ~SD_SERIALIZE;
5638 	}
5639 	if (~cflags & pflags)
5640 		return 0;
5641 
5642 	return 1;
5643 }
5644 
5645 static void free_rootdomain(struct rcu_head *rcu)
5646 {
5647 	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5648 
5649 	cpupri_cleanup(&rd->cpupri);
5650 	free_cpumask_var(rd->rto_mask);
5651 	free_cpumask_var(rd->online);
5652 	free_cpumask_var(rd->span);
5653 	kfree(rd);
5654 }
5655 
5656 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5657 {
5658 	struct root_domain *old_rd = NULL;
5659 	unsigned long flags;
5660 
5661 	raw_spin_lock_irqsave(&rq->lock, flags);
5662 
5663 	if (rq->rd) {
5664 		old_rd = rq->rd;
5665 
5666 		if (cpumask_test_cpu(rq->cpu, old_rd->online))
5667 			set_rq_offline(rq);
5668 
5669 		cpumask_clear_cpu(rq->cpu, old_rd->span);
5670 
5671 		/*
5672 		 * If we don't want to free the old_rd yet then
5673 		 * set old_rd to NULL to skip the freeing later
5674 		 * in this function:
5675 		 */
5676 		if (!atomic_dec_and_test(&old_rd->refcount))
5677 			old_rd = NULL;
5678 	}
5679 
5680 	atomic_inc(&rd->refcount);
5681 	rq->rd = rd;
5682 
5683 	cpumask_set_cpu(rq->cpu, rd->span);
5684 	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5685 		set_rq_online(rq);
5686 
5687 	raw_spin_unlock_irqrestore(&rq->lock, flags);
5688 
5689 	if (old_rd)
5690 		call_rcu_sched(&old_rd->rcu, free_rootdomain);
5691 }
5692 
5693 static int init_rootdomain(struct root_domain *rd)
5694 {
5695 	memset(rd, 0, sizeof(*rd));
5696 
5697 	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5698 		goto out;
5699 	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5700 		goto free_span;
5701 	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5702 		goto free_online;
5703 
5704 	if (cpupri_init(&rd->cpupri) != 0)
5705 		goto free_rto_mask;
5706 	return 0;
5707 
5708 free_rto_mask:
5709 	free_cpumask_var(rd->rto_mask);
5710 free_online:
5711 	free_cpumask_var(rd->online);
5712 free_span:
5713 	free_cpumask_var(rd->span);
5714 out:
5715 	return -ENOMEM;
5716 }
5717 
5718 /*
5719  * By default the system creates a single root-domain with all cpus as
5720  * members (mimicking the global state we have today).
5721  */
5722 struct root_domain def_root_domain;
5723 
5724 static void init_defrootdomain(void)
5725 {
5726 	init_rootdomain(&def_root_domain);
5727 
5728 	atomic_set(&def_root_domain.refcount, 1);
5729 }
5730 
5731 static struct root_domain *alloc_rootdomain(void)
5732 {
5733 	struct root_domain *rd;
5734 
5735 	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5736 	if (!rd)
5737 		return NULL;
5738 
5739 	if (init_rootdomain(rd) != 0) {
5740 		kfree(rd);
5741 		return NULL;
5742 	}
5743 
5744 	return rd;
5745 }
5746 
5747 static void free_sched_groups(struct sched_group *sg, int free_sgp)
5748 {
5749 	struct sched_group *tmp, *first;
5750 
5751 	if (!sg)
5752 		return;
5753 
5754 	first = sg;
5755 	do {
5756 		tmp = sg->next;
5757 
5758 		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5759 			kfree(sg->sgp);
5760 
5761 		kfree(sg);
5762 		sg = tmp;
5763 	} while (sg != first);
5764 }
5765 
5766 static void free_sched_domain(struct rcu_head *rcu)
5767 {
5768 	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5769 
5770 	/*
5771 	 * If it's an overlapping domain it has private groups; iterate and
5772 	 * nuke them all.
5773 	 */
5774 	if (sd->flags & SD_OVERLAP) {
5775 		free_sched_groups(sd->groups, 1);
5776 	} else if (atomic_dec_and_test(&sd->groups->ref)) {
5777 		kfree(sd->groups->sgp);
5778 		kfree(sd->groups);
5779 	}
5780 	kfree(sd);
5781 }
5782 
5783 static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5784 {
5785 	call_rcu(&sd->rcu, free_sched_domain);
5786 }
5787 
5788 static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5789 {
5790 	for (; sd; sd = sd->parent)
5791 		destroy_sched_domain(sd, cpu);
5792 }
5793 
5794 /*
5795  * Keep a special pointer to the highest sched_domain that has
5796  * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain) for this CPU;
5797  * this allows us to avoid some pointer chasing in select_idle_sibling().
5798  *
5799  * Also keep a unique ID per domain (we use the first cpu number in
5800  * the cpumask of the domain), this allows us to quickly tell if
5801  * two cpus are in the same cache domain, see cpus_share_cache().
5802  */
5803 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5804 DEFINE_PER_CPU(int, sd_llc_id);
5805 
5806 static void update_top_cache_domain(int cpu)
5807 {
5808 	struct sched_domain *sd;
5809 	int id = cpu;
5810 
5811 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5812 	if (sd)
5813 		id = cpumask_first(sched_domain_span(sd));
5814 
5815 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5816 	per_cpu(sd_llc_id, cpu) = id;
5817 }
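/*
 * Illustrative sketch (not part of the original source): with sd_llc_id
 * cached above, a "do these two cpus share a last-level cache?" test
 * reduces to an integer compare instead of walking the domain tree.  The
 * helper name below is hypothetical; the in-tree consumer of this idea is
 * cpus_share_cache().
 */
static inline bool example_cpus_share_llc(int this_cpu, int that_cpu)
{
	/* Equal LLC ids mean both cpus sit under the same cache domain. */
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}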
5818 
5819 /*
5820  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5821  * hold the hotplug lock.
5822  */
5823 static void
5824 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5825 {
5826 	struct rq *rq = cpu_rq(cpu);
5827 	struct sched_domain *tmp;
5828 
5829 	/* Remove the sched domains which do not contribute to scheduling. */
5830 	for (tmp = sd; tmp; ) {
5831 		struct sched_domain *parent = tmp->parent;
5832 		if (!parent)
5833 			break;
5834 
5835 		if (sd_parent_degenerate(tmp, parent)) {
5836 			tmp->parent = parent->parent;
5837 			if (parent->parent)
5838 				parent->parent->child = tmp;
5839 			destroy_sched_domain(parent, cpu);
5840 		} else
5841 			tmp = tmp->parent;
5842 	}
5843 
5844 	if (sd && sd_degenerate(sd)) {
5845 		tmp = sd;
5846 		sd = sd->parent;
5847 		destroy_sched_domain(tmp, cpu);
5848 		if (sd)
5849 			sd->child = NULL;
5850 	}
5851 
5852 	sched_domain_debug(sd, cpu);
5853 
5854 	rq_attach_root(rq, rd);
5855 	tmp = rq->sd;
5856 	rcu_assign_pointer(rq->sd, sd);
5857 	destroy_sched_domains(tmp, cpu);
5858 
5859 	update_top_cache_domain(cpu);
5860 }
5861 
5862 /* cpus with isolated domains */
5863 static cpumask_var_t cpu_isolated_map;
5864 
5865 /* Set up the mask of cpus configured for isolated domains */
5866 static int __init isolated_cpu_setup(char *str)
5867 {
5868 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
5869 	cpulist_parse(str, cpu_isolated_map);
5870 	return 1;
5871 }
5872 
5873 __setup("isolcpus=", isolated_cpu_setup);
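/*
 * Usage note (illustrative, not part of the original source): "isolcpus="
 * takes a cpulist on the kernel command line, e.g. "isolcpus=2,3" or
 * "isolcpus=4-7".  The listed cpus are excluded from the sched domains
 * built in init_sched_domains() below, so tasks only land on them through
 * explicit affinity settings.
 */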
5874 
5875 #ifdef CONFIG_NUMA
5876 
5877 /**
5878  * find_next_best_node - find the next node to include in a sched_domain
5879  * @node: node whose sched_domain we're building
5880  * @used_nodes: nodes already in the sched_domain
5881  *
5882  * Find the next node to include in a given scheduling domain. Simply
5883  * finds the closest node not already in the @used_nodes map.
5884  *
5885  * Should use nodemask_t.
5886  */
5887 static int find_next_best_node(int node, nodemask_t *used_nodes)
5888 {
5889 	int i, n, val, min_val, best_node = -1;
5890 
5891 	min_val = INT_MAX;
5892 
5893 	for (i = 0; i < nr_node_ids; i++) {
5894 		/* Start at @node */
5895 		n = (node + i) % nr_node_ids;
5896 
5897 		if (!nr_cpus_node(n))
5898 			continue;
5899 
5900 		/* Skip already used nodes */
5901 		if (node_isset(n, *used_nodes))
5902 			continue;
5903 
5904 		/* Simple min distance search */
5905 		val = node_distance(node, n);
5906 
5907 		if (val < min_val) {
5908 			min_val = val;
5909 			best_node = n;
5910 		}
5911 	}
5912 
5913 	if (best_node != -1)
5914 		node_set(best_node, *used_nodes);
5915 	return best_node;
5916 }
5917 
5918 /**
5919  * sched_domain_node_span - get a cpumask for a node's sched_domain
5920  * @node: node whose cpumask we're constructing
5921  * @span: resulting cpumask
5922  *
5923  * Given a node, construct a good cpumask for its sched_domain to span. It
5924  * should be one that prevents unnecessary balancing, but also spreads tasks
5925  * out optimally.
5926  */
5927 static void sched_domain_node_span(int node, struct cpumask *span)
5928 {
5929 	nodemask_t used_nodes;
5930 	int i;
5931 
5932 	cpumask_clear(span);
5933 	nodes_clear(used_nodes);
5934 
5935 	cpumask_or(span, span, cpumask_of_node(node));
5936 	node_set(node, used_nodes);
5937 
5938 	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5939 		int next_node = find_next_best_node(node, &used_nodes);
5940 		if (next_node < 0)
5941 			break;
5942 		cpumask_or(span, span, cpumask_of_node(next_node));
5943 	}
5944 }
5945 
5946 static const struct cpumask *cpu_node_mask(int cpu)
5947 {
5948 	lockdep_assert_held(&sched_domains_mutex);
5949 
5950 	sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
5951 
5952 	return sched_domains_tmpmask;
5953 }
5954 
5955 static const struct cpumask *cpu_allnodes_mask(int cpu)
5956 {
5957 	return cpu_possible_mask;
5958 }
5959 #endif /* CONFIG_NUMA */
5960 
5961 static const struct cpumask *cpu_cpu_mask(int cpu)
5962 {
5963 	return cpumask_of_node(cpu_to_node(cpu));
5964 }
5965 
5966 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5967 
5968 struct sd_data {
5969 	struct sched_domain **__percpu sd;
5970 	struct sched_group **__percpu sg;
5971 	struct sched_group_power **__percpu sgp;
5972 };
5973 
5974 struct s_data {
5975 	struct sched_domain ** __percpu sd;
5976 	struct root_domain	*rd;
5977 };
5978 
5979 enum s_alloc {
5980 	sa_rootdomain,
5981 	sa_sd,
5982 	sa_sd_storage,
5983 	sa_none,
5984 };
5985 
5986 struct sched_domain_topology_level;
5987 
5988 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5989 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5990 
5991 #define SDTL_OVERLAP	0x01
5992 
5993 struct sched_domain_topology_level {
5994 	sched_domain_init_f init;
5995 	sched_domain_mask_f mask;
5996 	int		    flags;
5997 	struct sd_data      data;
5998 };
5999 
6000 static int
6001 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6002 {
6003 	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
6004 	const struct cpumask *span = sched_domain_span(sd);
6005 	struct cpumask *covered = sched_domains_tmpmask;
6006 	struct sd_data *sdd = sd->private;
6007 	struct sched_domain *child;
6008 	int i;
6009 
6010 	cpumask_clear(covered);
6011 
6012 	for_each_cpu(i, span) {
6013 		struct cpumask *sg_span;
6014 
6015 		if (cpumask_test_cpu(i, covered))
6016 			continue;
6017 
6018 		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6019 				GFP_KERNEL, cpu_to_node(cpu));
6020 
6021 		if (!sg)
6022 			goto fail;
6023 
6024 		sg_span = sched_group_cpus(sg);
6025 
6026 		child = *per_cpu_ptr(sdd->sd, i);
6027 		if (child->child) {
6028 			child = child->child;
6029 			cpumask_copy(sg_span, sched_domain_span(child));
6030 		} else
6031 			cpumask_set_cpu(i, sg_span);
6032 
6033 		cpumask_or(covered, covered, sg_span);
6034 
6035 		sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
6036 		atomic_inc(&sg->sgp->ref);
6037 
6038 		if (cpumask_test_cpu(cpu, sg_span))
6039 			groups = sg;
6040 
6041 		if (!first)
6042 			first = sg;
6043 		if (last)
6044 			last->next = sg;
6045 		last = sg;
6046 		last->next = first;
6047 	}
6048 	sd->groups = groups;
6049 
6050 	return 0;
6051 
6052 fail:
6053 	free_sched_groups(first, 0);
6054 
6055 	return -ENOMEM;
6056 }
6057 
6058 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6059 {
6060 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6061 	struct sched_domain *child = sd->child;
6062 
6063 	if (child)
6064 		cpu = cpumask_first(sched_domain_span(child));
6065 
6066 	if (sg) {
6067 		*sg = *per_cpu_ptr(sdd->sg, cpu);
6068 		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
6069 		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
6070 	}
6071 
6072 	return cpu;
6073 }
6074 
6075 /*
6076  * build_sched_groups will build a circular linked list of the groups
6077  * covered by the given span, set each group's ->cpumask correctly, and
6078  * initialize each group's ->cpu_power to 0.
6079  *
6080  * Assumes the sched_domain tree is fully constructed
6081  */
6082 static int
6083 build_sched_groups(struct sched_domain *sd, int cpu)
6084 {
6085 	struct sched_group *first = NULL, *last = NULL;
6086 	struct sd_data *sdd = sd->private;
6087 	const struct cpumask *span = sched_domain_span(sd);
6088 	struct cpumask *covered;
6089 	int i;
6090 
6091 	get_group(cpu, sdd, &sd->groups);
6092 	atomic_inc(&sd->groups->ref);
6093 
6094 	if (cpu != cpumask_first(sched_domain_span(sd)))
6095 		return 0;
6096 
6097 	lockdep_assert_held(&sched_domains_mutex);
6098 	covered = sched_domains_tmpmask;
6099 
6100 	cpumask_clear(covered);
6101 
6102 	for_each_cpu(i, span) {
6103 		struct sched_group *sg;
6104 		int group = get_group(i, sdd, &sg);
6105 		int j;
6106 
6107 		if (cpumask_test_cpu(i, covered))
6108 			continue;
6109 
6110 		cpumask_clear(sched_group_cpus(sg));
6111 		sg->sgp->power = 0;
6112 
6113 		for_each_cpu(j, span) {
6114 			if (get_group(j, sdd, NULL) != group)
6115 				continue;
6116 
6117 			cpumask_set_cpu(j, covered);
6118 			cpumask_set_cpu(j, sched_group_cpus(sg));
6119 		}
6120 
6121 		if (!first)
6122 			first = sg;
6123 		if (last)
6124 			last->next = sg;
6125 		last = sg;
6126 	}
6127 	last->next = first;
6128 
6129 	return 0;
6130 }
6131 
6132 /*
6133  * Initialize sched groups cpu_power.
6134  *
6135  * cpu_power indicates the capacity of a sched group, which is used while
6136  * distributing the load between different sched groups in a sched domain.
6137  * Typically, cpu_power will be the same for all groups in a sched domain
6138  * unless there are asymmetries in the topology. If there are asymmetries,
6139  * the group with more cpu_power will pick up more load than the group with
6140  * less cpu_power.
6141  */
6142 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6143 {
6144 	struct sched_group *sg = sd->groups;
6145 
6146 	WARN_ON(!sd || !sg);
6147 
6148 	do {
6149 		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
6150 		sg = sg->next;
6151 	} while (sg != sd->groups);
6152 
6153 	if (cpu != group_first_cpu(sg))
6154 		return;
6155 
6156 	update_group_power(sd, cpu);
6157 	atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
6158 }
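/*
 * Worked example (illustrative, not from the original source): in a domain
 * with two groups where group A has cpu_power 2048 and group B has
 * cpu_power 1024, the load balancer aims to place roughly
 * 2048 / (2048 + 1024) = 2/3 of the load on A and the remaining 1/3 on B,
 * matching the asymmetry described above.
 */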
6159 
6160 int __weak arch_sd_sibling_asym_packing(void)
6161 {
6162 	return 0*SD_ASYM_PACKING;
6163 }
6164 
6165 /*
6166  * Initializers for schedule domains
6167  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
6168  */
6169 
6170 #ifdef CONFIG_SCHED_DEBUG
6171 # define SD_INIT_NAME(sd, type)		sd->name = #type
6172 #else
6173 # define SD_INIT_NAME(sd, type)		do { } while (0)
6174 #endif
6175 
6176 #define SD_INIT_FUNC(type)						\
6177 static noinline struct sched_domain *					\
6178 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\
6179 {									\
6180 	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\
6181 	*sd = SD_##type##_INIT;						\
6182 	SD_INIT_NAME(sd, type);						\
6183 	sd->private = &tl->data;					\
6184 	return sd;							\
6185 }
6186 
6187 SD_INIT_FUNC(CPU)
6188 #ifdef CONFIG_NUMA
6189  SD_INIT_FUNC(ALLNODES)
6190  SD_INIT_FUNC(NODE)
6191 #endif
6192 #ifdef CONFIG_SCHED_SMT
6193  SD_INIT_FUNC(SIBLING)
6194 #endif
6195 #ifdef CONFIG_SCHED_MC
6196  SD_INIT_FUNC(MC)
6197 #endif
6198 #ifdef CONFIG_SCHED_BOOK
6199  SD_INIT_FUNC(BOOK)
6200 #endif
6201 
6202 static int default_relax_domain_level = -1;
6203 int sched_domain_level_max;
6204 
6205 static int __init setup_relax_domain_level(char *str)
6206 {
6207 	unsigned long val;
6208 
6209 	val = simple_strtoul(str, NULL, 0);
6210 	if (val < sched_domain_level_max)
6211 		default_relax_domain_level = val;
6212 
6213 	return 1;
6214 }
6215 __setup("relax_domain_level=", setup_relax_domain_level);
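/*
 * Usage note (illustrative, not part of the original source): booting with
 * e.g. "relax_domain_level=2" requests that wake/newidle balancing only be
 * performed in the lower domain levels; set_domain_attribute() below clears
 * SD_BALANCE_WAKE and SD_BALANCE_NEWIDLE on domains above the requested
 * level and sets them on domains at or below it.
 */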
6216 
6217 static void set_domain_attribute(struct sched_domain *sd,
6218 				 struct sched_domain_attr *attr)
6219 {
6220 	int request;
6221 
6222 	if (!attr || attr->relax_domain_level < 0) {
6223 		if (default_relax_domain_level < 0)
6224 			return;
6225 		else
6226 			request = default_relax_domain_level;
6227 	} else
6228 		request = attr->relax_domain_level;
6229 	if (request < sd->level) {
6230 		/* turn off idle balance on this domain */
6231 		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6232 	} else {
6233 		/* turn on idle balance on this domain */
6234 		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6235 	}
6236 }
6237 
6238 static void __sdt_free(const struct cpumask *cpu_map);
6239 static int __sdt_alloc(const struct cpumask *cpu_map);
6240 
6241 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6242 				 const struct cpumask *cpu_map)
6243 {
6244 	switch (what) {
6245 	case sa_rootdomain:
6246 		if (!atomic_read(&d->rd->refcount))
6247 			free_rootdomain(&d->rd->rcu); /* fall through */
6248 	case sa_sd:
6249 		free_percpu(d->sd); /* fall through */
6250 	case sa_sd_storage:
6251 		__sdt_free(cpu_map); /* fall through */
6252 	case sa_none:
6253 		break;
6254 	}
6255 }
6256 
6257 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6258 						   const struct cpumask *cpu_map)
6259 {
6260 	memset(d, 0, sizeof(*d));
6261 
6262 	if (__sdt_alloc(cpu_map))
6263 		return sa_sd_storage;
6264 	d->sd = alloc_percpu(struct sched_domain *);
6265 	if (!d->sd)
6266 		return sa_sd_storage;
6267 	d->rd = alloc_rootdomain();
6268 	if (!d->rd)
6269 		return sa_sd;
6270 	return sa_rootdomain;
6271 }
6272 
6273 /*
6274  * NULL the sd_data elements we've used to build the sched_domain and
6275  * sched_group structure so that the subsequent __free_domain_allocs()
6276  * will not free the data we're using.
6277  */
6278 static void claim_allocations(int cpu, struct sched_domain *sd)
6279 {
6280 	struct sd_data *sdd = sd->private;
6281 
6282 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6283 	*per_cpu_ptr(sdd->sd, cpu) = NULL;
6284 
6285 	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6286 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
6287 
6288 	if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
6289 		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
6290 }
6291 
6292 #ifdef CONFIG_SCHED_SMT
6293 static const struct cpumask *cpu_smt_mask(int cpu)
6294 {
6295 	return topology_thread_cpumask(cpu);
6296 }
6297 #endif
6298 
6299 /*
6300  * Topology list, bottom-up.
6301  */
6302 static struct sched_domain_topology_level default_topology[] = {
6303 #ifdef CONFIG_SCHED_SMT
6304 	{ sd_init_SIBLING, cpu_smt_mask, },
6305 #endif
6306 #ifdef CONFIG_SCHED_MC
6307 	{ sd_init_MC, cpu_coregroup_mask, },
6308 #endif
6309 #ifdef CONFIG_SCHED_BOOK
6310 	{ sd_init_BOOK, cpu_book_mask, },
6311 #endif
6312 	{ sd_init_CPU, cpu_cpu_mask, },
6313 #ifdef CONFIG_NUMA
6314 	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
6315 	{ sd_init_ALLNODES, cpu_allnodes_mask, },
6316 #endif
6317 	{ NULL, },
6318 };
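/*
 * Illustration (not part of the original source): on a NUMA box with SMT,
 * MC and BOOK all configured, the table above produces a per-cpu domain
 * stack built bottom-up as
 *
 *	SIBLING -> MC -> BOOK -> CPU -> NODE -> ALLNODES
 *
 * with build_sched_domains() stopping early at whichever level already
 * spans the whole cpu_map.
 */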
6319 
6320 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6321 
6322 static int __sdt_alloc(const struct cpumask *cpu_map)
6323 {
6324 	struct sched_domain_topology_level *tl;
6325 	int j;
6326 
6327 	for (tl = sched_domain_topology; tl->init; tl++) {
6328 		struct sd_data *sdd = &tl->data;
6329 
6330 		sdd->sd = alloc_percpu(struct sched_domain *);
6331 		if (!sdd->sd)
6332 			return -ENOMEM;
6333 
6334 		sdd->sg = alloc_percpu(struct sched_group *);
6335 		if (!sdd->sg)
6336 			return -ENOMEM;
6337 
6338 		sdd->sgp = alloc_percpu(struct sched_group_power *);
6339 		if (!sdd->sgp)
6340 			return -ENOMEM;
6341 
6342 		for_each_cpu(j, cpu_map) {
6343 			struct sched_domain *sd;
6344 			struct sched_group *sg;
6345 			struct sched_group_power *sgp;
6346 
6347 			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6348 					GFP_KERNEL, cpu_to_node(j));
6349 			if (!sd)
6350 				return -ENOMEM;
6351 
6352 			*per_cpu_ptr(sdd->sd, j) = sd;
6353 
6354 			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6355 					GFP_KERNEL, cpu_to_node(j));
6356 			if (!sg)
6357 				return -ENOMEM;
6358 
6359 			*per_cpu_ptr(sdd->sg, j) = sg;
6360 
6361 			sgp = kzalloc_node(sizeof(struct sched_group_power),
6362 					GFP_KERNEL, cpu_to_node(j));
6363 			if (!sgp)
6364 				return -ENOMEM;
6365 
6366 			*per_cpu_ptr(sdd->sgp, j) = sgp;
6367 		}
6368 	}
6369 
6370 	return 0;
6371 }
6372 
6373 static void __sdt_free(const struct cpumask *cpu_map)
6374 {
6375 	struct sched_domain_topology_level *tl;
6376 	int j;
6377 
6378 	for (tl = sched_domain_topology; tl->init; tl++) {
6379 		struct sd_data *sdd = &tl->data;
6380 
6381 		for_each_cpu(j, cpu_map) {
6382 			struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
6383 			if (sd && (sd->flags & SD_OVERLAP))
6384 				free_sched_groups(sd->groups, 0);
6385 			kfree(*per_cpu_ptr(sdd->sd, j));
6386 			kfree(*per_cpu_ptr(sdd->sg, j));
6387 			kfree(*per_cpu_ptr(sdd->sgp, j));
6388 		}
6389 		free_percpu(sdd->sd);
6390 		free_percpu(sdd->sg);
6391 		free_percpu(sdd->sgp);
6392 	}
6393 }
6394 
6395 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6396 		struct s_data *d, const struct cpumask *cpu_map,
6397 		struct sched_domain_attr *attr, struct sched_domain *child,
6398 		int cpu)
6399 {
6400 	struct sched_domain *sd = tl->init(tl, cpu);
6401 	if (!sd)
6402 		return child;
6403 
6404 	set_domain_attribute(sd, attr);
6405 	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6406 	if (child) {
6407 		sd->level = child->level + 1;
6408 		sched_domain_level_max = max(sched_domain_level_max, sd->level);
6409 		child->parent = sd;
6410 	}
6411 	sd->child = child;
6412 
6413 	return sd;
6414 }
6415 
6416 /*
6417  * Build sched domains for a given set of cpus and attach the sched domains
6418  * to the individual cpus
6419  */
6420 static int build_sched_domains(const struct cpumask *cpu_map,
6421 			       struct sched_domain_attr *attr)
6422 {
6423 	enum s_alloc alloc_state = sa_none;
6424 	struct sched_domain *sd;
6425 	struct s_data d;
6426 	int i, ret = -ENOMEM;
6427 
6428 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6429 	if (alloc_state != sa_rootdomain)
6430 		goto error;
6431 
6432 	/* Set up domains for cpus specified by the cpu_map. */
6433 	for_each_cpu(i, cpu_map) {
6434 		struct sched_domain_topology_level *tl;
6435 
6436 		sd = NULL;
6437 		for (tl = sched_domain_topology; tl->init; tl++) {
6438 			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
6439 			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6440 				sd->flags |= SD_OVERLAP;
6441 			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6442 				break;
6443 		}
6444 
6445 		while (sd->child)
6446 			sd = sd->child;
6447 
6448 		*per_cpu_ptr(d.sd, i) = sd;
6449 	}
6450 
6451 	/* Build the groups for the domains */
6452 	for_each_cpu(i, cpu_map) {
6453 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6454 			sd->span_weight = cpumask_weight(sched_domain_span(sd));
6455 			if (sd->flags & SD_OVERLAP) {
6456 				if (build_overlap_sched_groups(sd, i))
6457 					goto error;
6458 			} else {
6459 				if (build_sched_groups(sd, i))
6460 					goto error;
6461 			}
6462 		}
6463 	}
6464 
6465 	/* Calculate CPU power for physical packages and nodes */
6466 	for (i = nr_cpumask_bits-1; i >= 0; i--) {
6467 		if (!cpumask_test_cpu(i, cpu_map))
6468 			continue;
6469 
6470 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6471 			claim_allocations(i, sd);
6472 			init_sched_groups_power(i, sd);
6473 		}
6474 	}
6475 
6476 	/* Attach the domains */
6477 	rcu_read_lock();
6478 	for_each_cpu(i, cpu_map) {
6479 		sd = *per_cpu_ptr(d.sd, i);
6480 		cpu_attach_domain(sd, d.rd, i);
6481 	}
6482 	rcu_read_unlock();
6483 
6484 	ret = 0;
6485 error:
6486 	__free_domain_allocs(&d, alloc_state, cpu_map);
6487 	return ret;
6488 }
6489 
6490 static cpumask_var_t *doms_cur;	/* current sched domains */
6491 static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
6492 static struct sched_domain_attr *dattr_cur;
6493 				/* attributes of custom domains in 'doms_cur' */
6494 
6495 /*
6496  * Special case: If a kmalloc of a doms_cur partition (array of
6497  * cpumask) fails, then fallback to a single sched domain,
6498  * as determined by the single cpumask fallback_doms.
6499  */
6500 static cpumask_var_t fallback_doms;
6501 
6502 /*
6503  * arch_update_cpu_topology lets virtualized architectures update the
6504  * cpu core maps. It is supposed to return 1 if the topology changed
6505  * or 0 if it stayed the same.
6506  */
6507 int __attribute__((weak)) arch_update_cpu_topology(void)
6508 {
6509 	return 0;
6510 }
6511 
6512 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6513 {
6514 	int i;
6515 	cpumask_var_t *doms;
6516 
6517 	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6518 	if (!doms)
6519 		return NULL;
6520 	for (i = 0; i < ndoms; i++) {
6521 		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6522 			free_sched_domains(doms, i);
6523 			return NULL;
6524 		}
6525 	}
6526 	return doms;
6527 }
6528 
6529 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6530 {
6531 	unsigned int i;
6532 	for (i = 0; i < ndoms; i++)
6533 		free_cpumask_var(doms[i]);
6534 	kfree(doms);
6535 }
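/*
 * Illustrative sketch (not part of the original source): callers allocate a
 * partition array with alloc_sched_domains(), fill in the masks, and free it
 * with free_sched_domains() using the same count if they do not hand it over
 * to partition_sched_domains().  The helper below is hypothetical.
 */
static int __maybe_unused example_alloc_doms(unsigned int ndoms)
{
	cpumask_var_t *doms = alloc_sched_domains(ndoms);

	if (!doms)
		return -ENOMEM;

	/* ... fill in doms[0..ndoms-1] here ... */

	free_sched_domains(doms, ndoms);
	return 0;
}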
6536 
6537 /*
6538  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6539  * For now this just excludes isolated cpus, but could be used to
6540  * exclude other special cases in the future.
6541  */
6542 static int init_sched_domains(const struct cpumask *cpu_map)
6543 {
6544 	int err;
6545 
6546 	arch_update_cpu_topology();
6547 	ndoms_cur = 1;
6548 	doms_cur = alloc_sched_domains(ndoms_cur);
6549 	if (!doms_cur)
6550 		doms_cur = &fallback_doms;
6551 	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6552 	dattr_cur = NULL;
6553 	err = build_sched_domains(doms_cur[0], NULL);
6554 	register_sched_domain_sysctl();
6555 
6556 	return err;
6557 }
6558 
6559 /*
6560  * Detach sched domains from a group of cpus specified in cpu_map.
6561  * These cpus will now be attached to the NULL domain.
6562  */
6563 static void detach_destroy_domains(const struct cpumask *cpu_map)
6564 {
6565 	int i;
6566 
6567 	rcu_read_lock();
6568 	for_each_cpu(i, cpu_map)
6569 		cpu_attach_domain(NULL, &def_root_domain, i);
6570 	rcu_read_unlock();
6571 }
6572 
6573 /* handle null as "default" */
6574 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6575 			struct sched_domain_attr *new, int idx_new)
6576 {
6577 	struct sched_domain_attr tmp;
6578 
6579 	/* fast path */
6580 	if (!new && !cur)
6581 		return 1;
6582 
6583 	tmp = SD_ATTR_INIT;
6584 	return !memcmp(cur ? (cur + idx_cur) : &tmp,
6585 			new ? (new + idx_new) : &tmp,
6586 			sizeof(struct sched_domain_attr));
6587 }
6588 
6589 /*
6590  * Partition sched domains as specified by the 'ndoms_new'
6591  * cpumasks in the array doms_new[] of cpumasks. This compares
6592  * doms_new[] to the current sched domain partitioning, doms_cur[].
6593  * It destroys each deleted domain and builds each new domain.
6594  *
6595  * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
6596  * The masks don't intersect (don't overlap); we set up one sched
6597  * domain for each mask. CPUs not in any of the cpumasks will
6598  * not be load balanced. If the same cpumask appears both in the
6599  * current 'doms_cur' domains and in the new 'doms_new', we can leave
6600  * it as it is.
6601  *
6602  * The passed in 'doms_new' should be allocated using
6603  * alloc_sched_domains.  This routine takes ownership of it and will
6604  * free_sched_domains it when done with it. If the caller failed the
6605  * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
6606  * and partition_sched_domains() will fall back to the single partition
6607  * 'fallback_doms'; this also forces the domains to be rebuilt.
6608  *
6609  * If doms_new == NULL it will be replaced with cpu_online_mask.
6610  * ndoms_new == 0 is a special case for destroying existing domains,
6611  * and it will not create the default domain.
6612  *
6613  * Call with hotplug lock held
6614  */
6615 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6616 			     struct sched_domain_attr *dattr_new)
6617 {
6618 	int i, j, n;
6619 	int new_topology;
6620 
6621 	mutex_lock(&sched_domains_mutex);
6622 
6623 	/* always unregister in case we don't destroy any domains */
6624 	unregister_sched_domain_sysctl();
6625 
6626 	/* Let architecture update cpu core mappings. */
6627 	new_topology = arch_update_cpu_topology();
6628 
6629 	n = doms_new ? ndoms_new : 0;
6630 
6631 	/* Destroy deleted domains */
6632 	for (i = 0; i < ndoms_cur; i++) {
6633 		for (j = 0; j < n && !new_topology; j++) {
6634 			if (cpumask_equal(doms_cur[i], doms_new[j])
6635 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
6636 				goto match1;
6637 		}
6638 		/* no match - a current sched domain not in new doms_new[] */
6639 		detach_destroy_domains(doms_cur[i]);
6640 match1:
6641 		;
6642 	}
6643 
6644 	if (doms_new == NULL) {
6645 		ndoms_cur = 0;
6646 		doms_new = &fallback_doms;
6647 		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6648 		WARN_ON_ONCE(dattr_new);
6649 	}
6650 
6651 	/* Build new domains */
6652 	for (i = 0; i < ndoms_new; i++) {
6653 		for (j = 0; j < ndoms_cur && !new_topology; j++) {
6654 			if (cpumask_equal(doms_new[i], doms_cur[j])
6655 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
6656 				goto match2;
6657 		}
6658 		/* no match - add a new doms_new */
6659 		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6660 match2:
6661 		;
6662 	}
6663 
6664 	/* Remember the new sched domains */
6665 	if (doms_cur != &fallback_doms)
6666 		free_sched_domains(doms_cur, ndoms_cur);
6667 	kfree(dattr_cur);	/* kfree(NULL) is safe */
6668 	doms_cur = doms_new;
6669 	dattr_cur = dattr_new;
6670 	ndoms_cur = ndoms_new;
6671 
6672 	register_sched_domain_sysctl();
6673 
6674 	mutex_unlock(&sched_domains_mutex);
6675 }
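/*
 * Illustrative sketch (not part of the original source): the cpuset code is
 * the main caller of partition_sched_domains().  A minimal caller, holding
 * the hotplug lock as required above, looks roughly like the hypothetical
 * helper below; passing doms_new == NULL with ndoms_new == 1 forces a
 * rebuild of the single 'fallback_doms' partition.
 */
static void __maybe_unused example_force_domain_rebuild(void)
{
	get_online_cpus();
	partition_sched_domains(1, NULL, NULL);
	put_online_cpus();
}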
6676 
6677 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6678 static void reinit_sched_domains(void)
6679 {
6680 	get_online_cpus();
6681 
6682 	/* Destroy domains first to force the rebuild */
6683 	partition_sched_domains(0, NULL, NULL);
6684 
6685 	rebuild_sched_domains();
6686 	put_online_cpus();
6687 }
6688 
6689 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6690 {
6691 	unsigned int level = 0;
6692 
6693 	if (sscanf(buf, "%u", &level) != 1)
6694 		return -EINVAL;
6695 
6696 	/*
6697 	 * level is always positive, so don't check for
6698 	 * level < POWERSAVINGS_BALANCE_NONE (which is 0).
6699 	 * What happens on a 0 or 1 byte write?
6700 	 * Do we need to check count as well?
6701 	 */
6702 
6703 	if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
6704 		return -EINVAL;
6705 
6706 	if (smt)
6707 		sched_smt_power_savings = level;
6708 	else
6709 		sched_mc_power_savings = level;
6710 
6711 	reinit_sched_domains();
6712 
6713 	return count;
6714 }
6715 
6716 #ifdef CONFIG_SCHED_MC
6717 static ssize_t sched_mc_power_savings_show(struct device *dev,
6718 					   struct device_attribute *attr,
6719 					   char *buf)
6720 {
6721 	return sprintf(buf, "%u\n", sched_mc_power_savings);
6722 }
6723 static ssize_t sched_mc_power_savings_store(struct device *dev,
6724 					    struct device_attribute *attr,
6725 					    const char *buf, size_t count)
6726 {
6727 	return sched_power_savings_store(buf, count, 0);
6728 }
6729 static DEVICE_ATTR(sched_mc_power_savings, 0644,
6730 		   sched_mc_power_savings_show,
6731 		   sched_mc_power_savings_store);
6732 #endif
6733 
6734 #ifdef CONFIG_SCHED_SMT
6735 static ssize_t sched_smt_power_savings_show(struct device *dev,
6736 					    struct device_attribute *attr,
6737 					    char *buf)
6738 {
6739 	return sprintf(buf, "%u\n", sched_smt_power_savings);
6740 }
6741 static ssize_t sched_smt_power_savings_store(struct device *dev,
6742 					    struct device_attribute *attr,
6743 					     const char *buf, size_t count)
6744 {
6745 	return sched_power_savings_store(buf, count, 1);
6746 }
6747 static DEVICE_ATTR(sched_smt_power_savings, 0644,
6748 		   sched_smt_power_savings_show,
6749 		   sched_smt_power_savings_store);
6750 #endif
6751 
6752 int __init sched_create_sysfs_power_savings_entries(struct device *dev)
6753 {
6754 	int err = 0;
6755 
6756 #ifdef CONFIG_SCHED_SMT
6757 	if (smt_capable())
6758 		err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
6759 #endif
6760 #ifdef CONFIG_SCHED_MC
6761 	if (!err && mc_capable())
6762 		err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
6763 #endif
6764 	return err;
6765 }
6766 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
6767 
6768 /*
6769  * Update cpusets according to cpu_active mask.  If cpusets are
6770  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
6771  * around partition_sched_domains().
6772  */
6773 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6774 			     void *hcpu)
6775 {
6776 	switch (action & ~CPU_TASKS_FROZEN) {
6777 	case CPU_ONLINE:
6778 	case CPU_DOWN_FAILED:
6779 		cpuset_update_active_cpus();
6780 		return NOTIFY_OK;
6781 	default:
6782 		return NOTIFY_DONE;
6783 	}
6784 }
6785 
6786 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6787 			       void *hcpu)
6788 {
6789 	switch (action & ~CPU_TASKS_FROZEN) {
6790 	case CPU_DOWN_PREPARE:
6791 		cpuset_update_active_cpus();
6792 		return NOTIFY_OK;
6793 	default:
6794 		return NOTIFY_DONE;
6795 	}
6796 }
6797 
6798 void __init sched_init_smp(void)
6799 {
6800 	cpumask_var_t non_isolated_cpus;
6801 
6802 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6803 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6804 
6805 	get_online_cpus();
6806 	mutex_lock(&sched_domains_mutex);
6807 	init_sched_domains(cpu_active_mask);
6808 	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6809 	if (cpumask_empty(non_isolated_cpus))
6810 		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6811 	mutex_unlock(&sched_domains_mutex);
6812 	put_online_cpus();
6813 
6814 	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6815 	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6816 
6817 	/* RT runtime code needs to handle some hotplug events */
6818 	hotcpu_notifier(update_runtime, 0);
6819 
6820 	init_hrtick();
6821 
6822 	/* Move init over to a non-isolated CPU */
6823 	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6824 		BUG();
6825 	sched_init_granularity();
6826 	free_cpumask_var(non_isolated_cpus);
6827 
6828 	init_sched_rt_class();
6829 }
6830 #else
6831 void __init sched_init_smp(void)
6832 {
6833 	sched_init_granularity();
6834 }
6835 #endif /* CONFIG_SMP */
6836 
6837 const_debug unsigned int sysctl_timer_migration = 1;
6838 
6839 int in_sched_functions(unsigned long addr)
6840 {
6841 	return in_lock_functions(addr) ||
6842 		(addr >= (unsigned long)__sched_text_start
6843 		&& addr < (unsigned long)__sched_text_end);
6844 }
6845 
6846 #ifdef CONFIG_CGROUP_SCHED
6847 struct task_group root_task_group;
6848 #endif
6849 
6850 DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
6851 
6852 void __init sched_init(void)
6853 {
6854 	int i, j;
6855 	unsigned long alloc_size = 0, ptr;
6856 
6857 #ifdef CONFIG_FAIR_GROUP_SCHED
6858 	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6859 #endif
6860 #ifdef CONFIG_RT_GROUP_SCHED
6861 	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6862 #endif
6863 #ifdef CONFIG_CPUMASK_OFFSTACK
6864 	alloc_size += num_possible_cpus() * cpumask_size();
6865 #endif
6866 	if (alloc_size) {
6867 		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6868 
6869 #ifdef CONFIG_FAIR_GROUP_SCHED
6870 		root_task_group.se = (struct sched_entity **)ptr;
6871 		ptr += nr_cpu_ids * sizeof(void **);
6872 
6873 		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6874 		ptr += nr_cpu_ids * sizeof(void **);
6875 
6876 #endif /* CONFIG_FAIR_GROUP_SCHED */
6877 #ifdef CONFIG_RT_GROUP_SCHED
6878 		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6879 		ptr += nr_cpu_ids * sizeof(void **);
6880 
6881 		root_task_group.rt_rq = (struct rt_rq **)ptr;
6882 		ptr += nr_cpu_ids * sizeof(void **);
6883 
6884 #endif /* CONFIG_RT_GROUP_SCHED */
6885 #ifdef CONFIG_CPUMASK_OFFSTACK
6886 		for_each_possible_cpu(i) {
6887 			per_cpu(load_balance_tmpmask, i) = (void *)ptr;
6888 			ptr += cpumask_size();
6889 		}
6890 #endif /* CONFIG_CPUMASK_OFFSTACK */
6891 	}
6892 
6893 #ifdef CONFIG_SMP
6894 	init_defrootdomain();
6895 #endif
6896 
6897 	init_rt_bandwidth(&def_rt_bandwidth,
6898 			global_rt_period(), global_rt_runtime());
6899 
6900 #ifdef CONFIG_RT_GROUP_SCHED
6901 	init_rt_bandwidth(&root_task_group.rt_bandwidth,
6902 			global_rt_period(), global_rt_runtime());
6903 #endif /* CONFIG_RT_GROUP_SCHED */
6904 
6905 #ifdef CONFIG_CGROUP_SCHED
6906 	list_add(&root_task_group.list, &task_groups);
6907 	INIT_LIST_HEAD(&root_task_group.children);
6908 	INIT_LIST_HEAD(&root_task_group.siblings);
6909 	autogroup_init(&init_task);
6910 
6911 #endif /* CONFIG_CGROUP_SCHED */
6912 
6913 #ifdef CONFIG_CGROUP_CPUACCT
6914 	root_cpuacct.cpustat = &kernel_cpustat;
6915 	root_cpuacct.cpuusage = alloc_percpu(u64);
6916 	/* Too early, not expected to fail */
6917 	BUG_ON(!root_cpuacct.cpuusage);
6918 #endif
6919 	for_each_possible_cpu(i) {
6920 		struct rq *rq;
6921 
6922 		rq = cpu_rq(i);
6923 		raw_spin_lock_init(&rq->lock);
6924 		rq->nr_running = 0;
6925 		rq->calc_load_active = 0;
6926 		rq->calc_load_update = jiffies + LOAD_FREQ;
6927 		init_cfs_rq(&rq->cfs);
6928 		init_rt_rq(&rq->rt, rq);
6929 #ifdef CONFIG_FAIR_GROUP_SCHED
6930 		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6931 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6932 		/*
6933 		 * How much cpu bandwidth does root_task_group get?
6934 		 *
6935 		 * In case of task-groups formed through the cgroup filesystem, it
6936 		 * gets 100% of the cpu resources in the system. This overall
6937 		 * system cpu resource is divided among the tasks of
6938 		 * root_task_group and its child task-groups in a fair manner,
6939 		 * based on each entity's (task or task-group's) weight
6940 		 * (se->load.weight).
6941 		 *
6942 		 * In other words, if root_task_group has 10 tasks (each of weight
6943 		 * 1024) and two child groups A0 and A1 (of weight 1024 each),
6944 		 * then A0's share of the cpu resource is:
6945 		 *
6946 		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
6947 		 *
6948 		 * We achieve this by letting root_task_group's tasks sit
6949 		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
6950 		 */
6951 		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6952 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6953 #endif /* CONFIG_FAIR_GROUP_SCHED */
6954 
6955 		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6956 #ifdef CONFIG_RT_GROUP_SCHED
6957 		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6958 		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6959 #endif
6960 
6961 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6962 			rq->cpu_load[j] = 0;
6963 
6964 		rq->last_load_update_tick = jiffies;
6965 
6966 #ifdef CONFIG_SMP
6967 		rq->sd = NULL;
6968 		rq->rd = NULL;
6969 		rq->cpu_power = SCHED_POWER_SCALE;
6970 		rq->post_schedule = 0;
6971 		rq->active_balance = 0;
6972 		rq->next_balance = jiffies;
6973 		rq->push_cpu = 0;
6974 		rq->cpu = i;
6975 		rq->online = 0;
6976 		rq->idle_stamp = 0;
6977 		rq->avg_idle = 2*sysctl_sched_migration_cost;
6978 
6979 		INIT_LIST_HEAD(&rq->cfs_tasks);
6980 
6981 		rq_attach_root(rq, &def_root_domain);
6982 #ifdef CONFIG_NO_HZ
6983 		rq->nohz_flags = 0;
6984 #endif
6985 #endif
6986 		init_rq_hrtick(rq);
6987 		atomic_set(&rq->nr_iowait, 0);
6988 	}
6989 
6990 	set_load_weight(&init_task);
6991 
6992 #ifdef CONFIG_PREEMPT_NOTIFIERS
6993 	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6994 #endif
6995 
6996 #ifdef CONFIG_RT_MUTEXES
6997 	plist_head_init(&init_task.pi_waiters);
6998 #endif
6999 
7000 	/*
7001 	 * The boot idle thread does lazy MMU switching as well:
7002 	 */
7003 	atomic_inc(&init_mm.mm_count);
7004 	enter_lazy_tlb(&init_mm, current);
7005 
7006 	/*
7007 	 * Make us the idle thread. Technically, schedule() should not be
7008 	 * called from this thread, however somewhere below it might be,
7009 	 * but because we are the idle thread, we just pick up running again
7010 	 * when this runqueue becomes "idle".
7011 	 */
7012 	init_idle(current, smp_processor_id());
7013 
7014 	calc_load_update = jiffies + LOAD_FREQ;
7015 
7016 	/*
7017 	 * During early bootup we pretend to be a normal task:
7018 	 */
7019 	current->sched_class = &fair_sched_class;
7020 
7021 #ifdef CONFIG_SMP
7022 	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7023 	/* May be allocated at isolcpus cmdline parse time */
7024 	if (cpu_isolated_map == NULL)
7025 		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7026 #endif
7027 	init_sched_fair_class();
7028 
7029 	scheduler_running = 1;
7030 }
7031 
7032 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7033 static inline int preempt_count_equals(int preempt_offset)
7034 {
7035 	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7036 
7037 	return (nested == preempt_offset);
7038 }
7039 
7040 void __might_sleep(const char *file, int line, int preempt_offset)
7041 {
7042 	static unsigned long prev_jiffy;	/* ratelimiting */
7043 
7044 	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
7045 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
7046 	    system_state != SYSTEM_RUNNING || oops_in_progress)
7047 		return;
7048 	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7049 		return;
7050 	prev_jiffy = jiffies;
7051 
7052 	printk(KERN_ERR
7053 		"BUG: sleeping function called from invalid context at %s:%d\n",
7054 			file, line);
7055 	printk(KERN_ERR
7056 		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7057 			in_atomic(), irqs_disabled(),
7058 			current->pid, current->comm);
7059 
7060 	debug_show_held_locks(current);
7061 	if (irqs_disabled())
7062 		print_irqtrace_events(current);
7063 	dump_stack();
7064 }
7065 EXPORT_SYMBOL(__might_sleep);
7066 #endif
7067 
7068 #ifdef CONFIG_MAGIC_SYSRQ
7069 static void normalize_task(struct rq *rq, struct task_struct *p)
7070 {
7071 	const struct sched_class *prev_class = p->sched_class;
7072 	int old_prio = p->prio;
7073 	int on_rq;
7074 
7075 	on_rq = p->on_rq;
7076 	if (on_rq)
7077 		dequeue_task(rq, p, 0);
7078 	__setscheduler(rq, p, SCHED_NORMAL, 0);
7079 	if (on_rq) {
7080 		enqueue_task(rq, p, 0);
7081 		resched_task(rq->curr);
7082 	}
7083 
7084 	check_class_changed(rq, p, prev_class, old_prio);
7085 }
7086 
7087 void normalize_rt_tasks(void)
7088 {
7089 	struct task_struct *g, *p;
7090 	unsigned long flags;
7091 	struct rq *rq;
7092 
7093 	read_lock_irqsave(&tasklist_lock, flags);
7094 	do_each_thread(g, p) {
7095 		/*
7096 		 * Only normalize user tasks:
7097 		 */
7098 		if (!p->mm)
7099 			continue;
7100 
7101 		p->se.exec_start		= 0;
7102 #ifdef CONFIG_SCHEDSTATS
7103 		p->se.statistics.wait_start	= 0;
7104 		p->se.statistics.sleep_start	= 0;
7105 		p->se.statistics.block_start	= 0;
7106 #endif
7107 
7108 		if (!rt_task(p)) {
7109 			/*
7110 			 * Renice negative nice level userspace
7111 			 * tasks back to 0:
7112 			 */
7113 			if (TASK_NICE(p) < 0 && p->mm)
7114 				set_user_nice(p, 0);
7115 			continue;
7116 		}
7117 
7118 		raw_spin_lock(&p->pi_lock);
7119 		rq = __task_rq_lock(p);
7120 
7121 		normalize_task(rq, p);
7122 
7123 		__task_rq_unlock(rq);
7124 		raw_spin_unlock(&p->pi_lock);
7125 	} while_each_thread(g, p);
7126 
7127 	read_unlock_irqrestore(&tasklist_lock, flags);
7128 }
7129 
7130 #endif /* CONFIG_MAGIC_SYSRQ */
7131 
7132 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7133 /*
7134  * These functions are only useful for the IA64 MCA handling, or kdb.
7135  *
7136  * They can only be called when the whole system has been
7137  * stopped - every CPU needs to be quiescent, and no scheduling
7138  * activity can take place. Using them for anything else would
7139  * be a serious bug, and as a result, they aren't even visible
7140  * under any other configuration.
7141  */
7142 
7143 /**
7144  * curr_task - return the current task for a given cpu.
7145  * @cpu: the processor in question.
7146  *
7147  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7148  */
7149 struct task_struct *curr_task(int cpu)
7150 {
7151 	return cpu_curr(cpu);
7152 }
7153 
7154 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7155 
7156 #ifdef CONFIG_IA64
7157 /**
7158  * set_curr_task - set the current task for a given cpu.
7159  * @cpu: the processor in question.
7160  * @p: the task pointer to set.
7161  *
7162  * Description: This function must only be used when non-maskable interrupts
7163  * are serviced on a separate stack. It allows the architecture to switch the
7164  * notion of the current task on a cpu in a non-blocking manner. This function
7165  * must be called with all CPUs synchronized and interrupts disabled, and
7166  * the caller must save the original value of the current task (see
7167  * curr_task() above) and restore that value before reenabling interrupts and
7168  * re-starting the system.
7169  *
7170  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7171  */
7172 void set_curr_task(int cpu, struct task_struct *p)
7173 {
7174 	cpu_curr(cpu) = p;
7175 }
7176 
7177 #endif
7178 
7179 #ifdef CONFIG_CGROUP_SCHED
7180 /* task_group_lock serializes the addition/removal of task groups */
7181 static DEFINE_SPINLOCK(task_group_lock);
7182 
7183 static void free_sched_group(struct task_group *tg)
7184 {
7185 	free_fair_sched_group(tg);
7186 	free_rt_sched_group(tg);
7187 	autogroup_free(tg);
7188 	kfree(tg);
7189 }
7190 
7191 /* allocate runqueue etc for a new task group */
7192 struct task_group *sched_create_group(struct task_group *parent)
7193 {
7194 	struct task_group *tg;
7195 	unsigned long flags;
7196 
7197 	tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7198 	if (!tg)
7199 		return ERR_PTR(-ENOMEM);
7200 
7201 	if (!alloc_fair_sched_group(tg, parent))
7202 		goto err;
7203 
7204 	if (!alloc_rt_sched_group(tg, parent))
7205 		goto err;
7206 
7207 	spin_lock_irqsave(&task_group_lock, flags);
7208 	list_add_rcu(&tg->list, &task_groups);
7209 
7210 	WARN_ON(!parent); /* root should already exist */
7211 
7212 	tg->parent = parent;
7213 	INIT_LIST_HEAD(&tg->children);
7214 	list_add_rcu(&tg->siblings, &parent->children);
7215 	spin_unlock_irqrestore(&task_group_lock, flags);
7216 
7217 	return tg;
7218 
7219 err:
7220 	free_sched_group(tg);
7221 	return ERR_PTR(-ENOMEM);
7222 }
7223 
7224 /* rcu callback to free various structures associated with a task group */
7225 static void free_sched_group_rcu(struct rcu_head *rhp)
7226 {
7227 	/* now it should be safe to free those cfs_rqs */
7228 	free_sched_group(container_of(rhp, struct task_group, rcu));
7229 }
7230 
7231 /* Destroy runqueue etc associated with a task group */
7232 void sched_destroy_group(struct task_group *tg)
7233 {
7234 	unsigned long flags;
7235 	int i;
7236 
7237 	/* end participation in shares distribution */
7238 	for_each_possible_cpu(i)
7239 		unregister_fair_sched_group(tg, i);
7240 
7241 	spin_lock_irqsave(&task_group_lock, flags);
7242 	list_del_rcu(&tg->list);
7243 	list_del_rcu(&tg->siblings);
7244 	spin_unlock_irqrestore(&task_group_lock, flags);
7245 
7246 	/* wait for possible concurrent references to cfs_rqs to complete */
7247 	call_rcu(&tg->rcu, free_sched_group_rcu);
7248 }
7249 
7250 /* Change a task's runqueue when it moves between groups.
7251  *	The caller of this function should have put the task in its new group
7252  *	by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
7253  *	reflect its new group.
7254  */
7255 void sched_move_task(struct task_struct *tsk)
7256 {
7257 	int on_rq, running;
7258 	unsigned long flags;
7259 	struct rq *rq;
7260 
7261 	rq = task_rq_lock(tsk, &flags);
7262 
7263 	running = task_current(rq, tsk);
7264 	on_rq = tsk->on_rq;
7265 
7266 	if (on_rq)
7267 		dequeue_task(rq, tsk, 0);
7268 	if (unlikely(running))
7269 		tsk->sched_class->put_prev_task(rq, tsk);
7270 
7271 #ifdef CONFIG_FAIR_GROUP_SCHED
7272 	if (tsk->sched_class->task_move_group)
7273 		tsk->sched_class->task_move_group(tsk, on_rq);
7274 	else
7275 #endif
7276 		set_task_rq(tsk, task_cpu(tsk));
7277 
7278 	if (unlikely(running))
7279 		tsk->sched_class->set_curr_task(rq);
7280 	if (on_rq)
7281 		enqueue_task(rq, tsk, 0);
7282 
7283 	task_rq_unlock(rq, tsk, &flags);
7284 }
7285 #endif /* CONFIG_CGROUP_SCHED */
7286 
7287 #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
7288 static unsigned long to_ratio(u64 period, u64 runtime)
7289 {
7290 	if (runtime == RUNTIME_INF)
7291 		return 1ULL << 20;
7292 
7293 	return div64_u64(runtime << 20, period);
7294 }
7295 #endif
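/*
 * Worked example (illustrative, not from the original source): to_ratio()
 * expresses runtime/period as a 20-bit fixed-point fraction.  The default
 * global RT limit of runtime = 950000us per period = 1000000us maps to
 * (950000 << 20) / 1000000 ~= 996147, i.e. roughly 0.95 * (1 << 20), while
 * RUNTIME_INF maps to exactly 1 << 20 (100%).
 */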
7296 
7297 #ifdef CONFIG_RT_GROUP_SCHED
7298 /*
7299  * Ensure that the real time constraints are schedulable.
7300  */
7301 static DEFINE_MUTEX(rt_constraints_mutex);
7302 
7303 /* Must be called with tasklist_lock held */
7304 static inline int tg_has_rt_tasks(struct task_group *tg)
7305 {
7306 	struct task_struct *g, *p;
7307 
7308 	do_each_thread(g, p) {
7309 		if (rt_task(p) && task_rq(p)->rt.tg == tg)
7310 			return 1;
7311 	} while_each_thread(g, p);
7312 
7313 	return 0;
7314 }
7315 
7316 struct rt_schedulable_data {
7317 	struct task_group *tg;
7318 	u64 rt_period;
7319 	u64 rt_runtime;
7320 };
7321 
7322 static int tg_rt_schedulable(struct task_group *tg, void *data)
7323 {
7324 	struct rt_schedulable_data *d = data;
7325 	struct task_group *child;
7326 	unsigned long total, sum = 0;
7327 	u64 period, runtime;
7328 
7329 	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7330 	runtime = tg->rt_bandwidth.rt_runtime;
7331 
7332 	if (tg == d->tg) {
7333 		period = d->rt_period;
7334 		runtime = d->rt_runtime;
7335 	}
7336 
7337 	/*
7338 	 * Cannot have more runtime than the period.
7339 	 */
7340 	if (runtime > period && runtime != RUNTIME_INF)
7341 		return -EINVAL;
7342 
7343 	/*
7344 	 * Ensure we don't starve existing RT tasks.
7345 	 */
7346 	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7347 		return -EBUSY;
7348 
7349 	total = to_ratio(period, runtime);
7350 
7351 	/*
7352 	 * Nobody can have more than the global setting allows.
7353 	 */
7354 	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7355 		return -EINVAL;
7356 
7357 	/*
7358 	 * The sum of our children's runtime should not exceed our own.
7359 	 */
7360 	list_for_each_entry_rcu(child, &tg->children, siblings) {
7361 		period = ktime_to_ns(child->rt_bandwidth.rt_period);
7362 		runtime = child->rt_bandwidth.rt_runtime;
7363 
7364 		if (child == d->tg) {
7365 			period = d->rt_period;
7366 			runtime = d->rt_runtime;
7367 		}
7368 
7369 		sum += to_ratio(period, runtime);
7370 	}
7371 
7372 	if (sum > total)
7373 		return -EINVAL;
7374 
7375 	return 0;
7376 }
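/*
 * Worked example (illustrative, not from the original source): with the
 * global limit at 0.95 (950000us / 1000000us), a group configured for
 * 400000us of runtime per 1000000us period has a ratio of 0.40.  Two of
 * its children asking for 250000us each would sum to 0.50 > 0.40, so
 * tg_rt_schedulable() rejects the change with -EINVAL.
 */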
7377 
7378 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7379 {
7380 	int ret;
7381 
7382 	struct rt_schedulable_data data = {
7383 		.tg = tg,
7384 		.rt_period = period,
7385 		.rt_runtime = runtime,
7386 	};
7387 
7388 	rcu_read_lock();
7389 	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7390 	rcu_read_unlock();
7391 
7392 	return ret;
7393 }
7394 
7395 static int tg_set_rt_bandwidth(struct task_group *tg,
7396 		u64 rt_period, u64 rt_runtime)
7397 {
7398 	int i, err = 0;
7399 
7400 	mutex_lock(&rt_constraints_mutex);
7401 	read_lock(&tasklist_lock);
7402 	err = __rt_schedulable(tg, rt_period, rt_runtime);
7403 	if (err)
7404 		goto unlock;
7405 
7406 	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7407 	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7408 	tg->rt_bandwidth.rt_runtime = rt_runtime;
7409 
7410 	for_each_possible_cpu(i) {
7411 		struct rt_rq *rt_rq = tg->rt_rq[i];
7412 
7413 		raw_spin_lock(&rt_rq->rt_runtime_lock);
7414 		rt_rq->rt_runtime = rt_runtime;
7415 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
7416 	}
7417 	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7418 unlock:
7419 	read_unlock(&tasklist_lock);
7420 	mutex_unlock(&rt_constraints_mutex);
7421 
7422 	return err;
7423 }
7424 
7425 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7426 {
7427 	u64 rt_runtime, rt_period;
7428 
7429 	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7430 	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7431 	if (rt_runtime_us < 0)
7432 		rt_runtime = RUNTIME_INF;
7433 
7434 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7435 }
7436 
7437 long sched_group_rt_runtime(struct task_group *tg)
7438 {
7439 	u64 rt_runtime_us;
7440 
7441 	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7442 		return -1;
7443 
7444 	rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7445 	do_div(rt_runtime_us, NSEC_PER_USEC);
7446 	return rt_runtime_us;
7447 }
7448 
7449 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7450 {
7451 	u64 rt_runtime, rt_period;
7452 
7453 	rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7454 	rt_runtime = tg->rt_bandwidth.rt_runtime;
7455 
7456 	if (rt_period == 0)
7457 		return -EINVAL;
7458 
7459 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7460 }
7461 
7462 long sched_group_rt_period(struct task_group *tg)
7463 {
7464 	u64 rt_period_us;
7465 
7466 	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7467 	do_div(rt_period_us, NSEC_PER_USEC);
7468 	return rt_period_us;
7469 }
7470 
7471 static int sched_rt_global_constraints(void)
7472 {
7473 	u64 runtime, period;
7474 	int ret = 0;
7475 
7476 	if (sysctl_sched_rt_period <= 0)
7477 		return -EINVAL;
7478 
7479 	runtime = global_rt_runtime();
7480 	period = global_rt_period();
7481 
7482 	/*
7483 	 * Sanity check on the sysctl variables.
7484 	 */
7485 	if (runtime > period && runtime != RUNTIME_INF)
7486 		return -EINVAL;
7487 
7488 	mutex_lock(&rt_constraints_mutex);
7489 	read_lock(&tasklist_lock);
7490 	ret = __rt_schedulable(NULL, 0, 0);
7491 	read_unlock(&tasklist_lock);
7492 	mutex_unlock(&rt_constraints_mutex);
7493 
7494 	return ret;
7495 }
7496 
7497 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7498 {
7499 	/* Don't accept realtime tasks when there is no way for them to run */
7500 	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7501 		return 0;
7502 
7503 	return 1;
7504 }
7505 
7506 #else /* !CONFIG_RT_GROUP_SCHED */
7507 static int sched_rt_global_constraints(void)
7508 {
7509 	unsigned long flags;
7510 	int i;
7511 
7512 	if (sysctl_sched_rt_period <= 0)
7513 		return -EINVAL;
7514 
7515 	/*
7516 	 * There's always some RT tasks in the root group
7517 	 * -- migration, kstopmachine etc..
7518 	 */
7519 	if (sysctl_sched_rt_runtime == 0)
7520 		return -EBUSY;
7521 
7522 	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7523 	for_each_possible_cpu(i) {
7524 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7525 
7526 		raw_spin_lock(&rt_rq->rt_runtime_lock);
7527 		rt_rq->rt_runtime = global_rt_runtime();
7528 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
7529 	}
7530 	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7531 
7532 	return 0;
7533 }
7534 #endif /* CONFIG_RT_GROUP_SCHED */
7535 
7536 int sched_rt_handler(struct ctl_table *table, int write,
7537 		void __user *buffer, size_t *lenp,
7538 		loff_t *ppos)
7539 {
7540 	int ret;
7541 	int old_period, old_runtime;
7542 	static DEFINE_MUTEX(mutex);
7543 
7544 	mutex_lock(&mutex);
7545 	old_period = sysctl_sched_rt_period;
7546 	old_runtime = sysctl_sched_rt_runtime;
7547 
7548 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
7549 
7550 	if (!ret && write) {
7551 		ret = sched_rt_global_constraints();
7552 		if (ret) {
7553 			sysctl_sched_rt_period = old_period;
7554 			sysctl_sched_rt_runtime = old_runtime;
7555 		} else {
7556 			def_rt_bandwidth.rt_runtime = global_rt_runtime();
7557 			def_rt_bandwidth.rt_period =
7558 				ns_to_ktime(global_rt_period());
7559 		}
7560 	}
7561 	mutex_unlock(&mutex);
7562 
7563 	return ret;
7564 }
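/*
 * Usage note (illustrative, not part of the original source): this handler
 * backs /proc/sys/kernel/sched_rt_period_us and sched_rt_runtime_us, e.g.
 *
 *	echo 900000 > /proc/sys/kernel/sched_rt_runtime_us
 *
 * lowers the global RT share to 90% of each 1s period; the write is rolled
 * back if sched_rt_global_constraints() rejects the new values.
 */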
7565 
7566 #ifdef CONFIG_CGROUP_SCHED
7567 
7568 /* return corresponding task_group object of a cgroup */
7569 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7570 {
7571 	return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
7572 			    struct task_group, css);
7573 }
7574 
7575 static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7576 {
7577 	struct task_group *tg, *parent;
7578 
7579 	if (!cgrp->parent) {
7580 		/* This is early initialization for the top cgroup */
7581 		return &root_task_group.css;
7582 	}
7583 
7584 	parent = cgroup_tg(cgrp->parent);
7585 	tg = sched_create_group(parent);
7586 	if (IS_ERR(tg))
7587 		return ERR_PTR(-ENOMEM);
7588 
7589 	return &tg->css;
7590 }
7591 
7592 static void cpu_cgroup_destroy(struct cgroup *cgrp)
7593 {
7594 	struct task_group *tg = cgroup_tg(cgrp);
7595 
7596 	sched_destroy_group(tg);
7597 }
7598 
7599 static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7600 				 struct cgroup_taskset *tset)
7601 {
7602 	struct task_struct *task;
7603 
7604 	cgroup_taskset_for_each(task, cgrp, tset) {
7605 #ifdef CONFIG_RT_GROUP_SCHED
7606 		if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
7607 			return -EINVAL;
7608 #else
7609 		/* We don't support RT-tasks being in separate groups */
7610 		if (task->sched_class != &fair_sched_class)
7611 			return -EINVAL;
7612 #endif
7613 	}
7614 	return 0;
7615 }
7616 
7617 static void cpu_cgroup_attach(struct cgroup *cgrp,
7618 			      struct cgroup_taskset *tset)
7619 {
7620 	struct task_struct *task;
7621 
7622 	cgroup_taskset_for_each(task, cgrp, tset)
7623 		sched_move_task(task);
7624 }
7625 
7626 static void
7627 cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7628 		struct task_struct *task)
7629 {
7630 	/*
7631 	 * cgroup_exit() is called in the copy_process() failure path.
7632 	 * Ignore this case since the task hasn't run yet; this avoids
7633 	 * trying to poke a half freed task state from generic code.
7634 	 */
7635 	if (!(task->flags & PF_EXITING))
7636 		return;
7637 
7638 	sched_move_task(task);
7639 }
7640 
7641 #ifdef CONFIG_FAIR_GROUP_SCHED
7642 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7643 				u64 shareval)
7644 {
7645 	return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
7646 }
7647 
7648 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
7649 {
7650 	struct task_group *tg = cgroup_tg(cgrp);
7651 
7652 	return (u64) scale_load_down(tg->shares);
7653 }
7654 
7655 #ifdef CONFIG_CFS_BANDWIDTH
7656 static DEFINE_MUTEX(cfs_constraints_mutex);
7657 
7658 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7659 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
7660 
7661 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7662 
7663 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7664 {
7665 	int i, ret = 0, runtime_enabled, runtime_was_enabled;
7666 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7667 
7668 	if (tg == &root_task_group)
7669 		return -EINVAL;
7670 
7671 	/*
7672 	 * Ensure we have at least some amount of bandwidth every period.  This is
7673 	 * to prevent reaching a state of large arrears when throttled via
7674 	 * entity_tick() resulting in prolonged exit starvation.
7675 	 */
7676 	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7677 		return -EINVAL;
7678 
7679 	/*
7680 	 * Likewise, bound things on the other side by preventing insane quota
7681 	 * periods.  This also allows us to normalize in computing quota
7682 	 * feasibility.
7683 	 */
7684 	if (period > max_cfs_quota_period)
7685 		return -EINVAL;
7686 
7687 	mutex_lock(&cfs_constraints_mutex);
7688 	ret = __cfs_schedulable(tg, period, quota);
7689 	if (ret)
7690 		goto out_unlock;
7691 
7692 	runtime_enabled = quota != RUNTIME_INF;
7693 	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7694 	account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
7695 	raw_spin_lock_irq(&cfs_b->lock);
7696 	cfs_b->period = ns_to_ktime(period);
7697 	cfs_b->quota = quota;
7698 
7699 	__refill_cfs_bandwidth_runtime(cfs_b);
7700 	/* restart the period timer (if active) to handle new period expiry */
7701 	if (runtime_enabled && cfs_b->timer_active) {
7702 		/* force a reprogram */
7703 		cfs_b->timer_active = 0;
7704 		__start_cfs_bandwidth(cfs_b);
7705 	}
7706 	raw_spin_unlock_irq(&cfs_b->lock);
7707 
7708 	for_each_possible_cpu(i) {
7709 		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7710 		struct rq *rq = cfs_rq->rq;
7711 
7712 		raw_spin_lock_irq(&rq->lock);
7713 		cfs_rq->runtime_enabled = runtime_enabled;
7714 		cfs_rq->runtime_remaining = 0;
7715 
7716 		if (cfs_rq->throttled)
7717 			unthrottle_cfs_rq(cfs_rq);
7718 		raw_spin_unlock_irq(&rq->lock);
7719 	}
7720 out_unlock:
7721 	mutex_unlock(&cfs_constraints_mutex);
7722 
7723 	return ret;
7724 }
7725 
7726 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7727 {
7728 	u64 quota, period;
7729 
7730 	period = ktime_to_ns(tg->cfs_bandwidth.period);
7731 	if (cfs_quota_us < 0)
7732 		quota = RUNTIME_INF;
7733 	else
7734 		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7735 
7736 	return tg_set_cfs_bandwidth(tg, period, quota);
7737 }
7738 
7739 long tg_get_cfs_quota(struct task_group *tg)
7740 {
7741 	u64 quota_us;
7742 
7743 	if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7744 		return -1;
7745 
7746 	quota_us = tg->cfs_bandwidth.quota;
7747 	do_div(quota_us, NSEC_PER_USEC);
7748 
7749 	return quota_us;
7750 }
7751 
7752 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7753 {
7754 	u64 quota, period;
7755 
7756 	period = (u64)cfs_period_us * NSEC_PER_USEC;
7757 	quota = tg->cfs_bandwidth.quota;
7758 
7759 	return tg_set_cfs_bandwidth(tg, period, quota);
7760 }
7761 
7762 long tg_get_cfs_period(struct task_group *tg)
7763 {
7764 	u64 cfs_period_us;
7765 
7766 	cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7767 	do_div(cfs_period_us, NSEC_PER_USEC);
7768 
7769 	return cfs_period_us;
7770 }
7771 
7772 static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
7773 {
7774 	return tg_get_cfs_quota(cgroup_tg(cgrp));
7775 }
7776 
7777 static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
7778 				s64 cfs_quota_us)
7779 {
7780 	return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
7781 }
7782 
7783 static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
7784 {
7785 	return tg_get_cfs_period(cgroup_tg(cgrp));
7786 }
7787 
7788 static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7789 				u64 cfs_period_us)
7790 {
7791 	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
7792 }
7793 
7794 struct cfs_schedulable_data {
7795 	struct task_group *tg;
7796 	u64 period, quota;
7797 };
7798 
7799 /*
7800  * normalize group quota/period to be quota/max_period
7801  * note: units are usecs
7802  */
7803 static u64 normalize_cfs_quota(struct task_group *tg,
7804 			       struct cfs_schedulable_data *d)
7805 {
7806 	u64 quota, period;
7807 
7808 	if (tg == d->tg) {
7809 		period = d->period;
7810 		quota = d->quota;
7811 	} else {
7812 		period = tg_get_cfs_period(tg);
7813 		quota = tg_get_cfs_quota(tg);
7814 	}
7815 
7816 	/* note: RUNTIME_INF and -1 (tg_get_cfs_quota()) are typically equivalent */
7817 	if (quota == RUNTIME_INF || quota == -1)
7818 		return RUNTIME_INF;
7819 
7820 	return to_ratio(period, quota);
7821 }
7822 
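/*
 * Walked top-down over the task-group tree: a child's quota/period ratio may
 * not exceed its parent's effective quota, and a child with no limit of its
 * own inherits the parent's value.
 */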
7823 static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7824 {
7825 	struct cfs_schedulable_data *d = data;
7826 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7827 	s64 quota = 0, parent_quota = -1;
7828 
7829 	if (!tg->parent) {
7830 		quota = RUNTIME_INF;
7831 	} else {
7832 		struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7833 
7834 		quota = normalize_cfs_quota(tg, d);
7835 		parent_quota = parent_b->hierarchal_quota;
7836 
7837 		/*
7838 		 * ensure max(child_quota) <= parent_quota, inherit when no
7839 		 * Ensure max(child_quota) <= parent_quota; inherit the parent's
7840 		 * quota when no limit is set on this group.
7841 		if (quota == RUNTIME_INF)
7842 			quota = parent_quota;
7843 		else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7844 			return -EINVAL;
7845 	}
7846 	cfs_b->hierarchal_quota = quota;
7847 
7848 	return 0;
7849 }
7850 
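/*
 * Validate a proposed (period, quota) for @tg against the whole hierarchy.
 * When a finite quota is given, both values are converted to microseconds
 * first so the ratios computed by normalize_cfs_quota() are comparable.
 */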
7851 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7852 {
7853 	int ret;
7854 	struct cfs_schedulable_data data = {
7855 		.tg = tg,
7856 		.period = period,
7857 		.quota = quota,
7858 	};
7859 
7860 	if (quota != RUNTIME_INF) {
7861 		do_div(data.period, NSEC_PER_USEC);
7862 		do_div(data.quota, NSEC_PER_USEC);
7863 	}
7864 
7865 	rcu_read_lock();
7866 	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7867 	rcu_read_unlock();
7868 
7869 	return ret;
7870 }
7871 
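/*
 * "cpu.stat": export bandwidth statistics, i.e. the number of enforcement
 * periods that have elapsed, how many of them throttled the group, and the
 * total time (in nanoseconds) the group spent throttled.
 */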
7872 static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7873 		struct cgroup_map_cb *cb)
7874 {
7875 	struct task_group *tg = cgroup_tg(cgrp);
7876 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7877 
7878 	cb->fill(cb, "nr_periods", cfs_b->nr_periods);
7879 	cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
7880 	cb->fill(cb, "throttled_time", cfs_b->throttled_time);
7881 
7882 	return 0;
7883 }
7884 #endif /* CONFIG_CFS_BANDWIDTH */
7885 #endif /* CONFIG_FAIR_GROUP_SCHED */
7886 
7887 #ifdef CONFIG_RT_GROUP_SCHED
7888 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
7889 				s64 val)
7890 {
7891 	return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
7892 }
7893 
7894 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
7895 {
7896 	return sched_group_rt_runtime(cgroup_tg(cgrp));
7897 }
7898 
7899 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7900 		u64 rt_period_us)
7901 {
7902 	return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
7903 }
7904 
7905 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
7906 {
7907 	return sched_group_rt_period(cgroup_tg(cgrp));
7908 }
7909 #endif /* CONFIG_RT_GROUP_SCHED */
7910 
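/*
 * Control files exported by the cpu cgroup subsystem.  Which entries are
 * present depends on CONFIG_FAIR_GROUP_SCHED, CONFIG_CFS_BANDWIDTH and
 * CONFIG_RT_GROUP_SCHED.
 */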
7911 static struct cftype cpu_files[] = {
7912 #ifdef CONFIG_FAIR_GROUP_SCHED
7913 	{
7914 		.name = "shares",
7915 		.read_u64 = cpu_shares_read_u64,
7916 		.write_u64 = cpu_shares_write_u64,
7917 	},
7918 #endif
7919 #ifdef CONFIG_CFS_BANDWIDTH
7920 	{
7921 		.name = "cfs_quota_us",
7922 		.read_s64 = cpu_cfs_quota_read_s64,
7923 		.write_s64 = cpu_cfs_quota_write_s64,
7924 	},
7925 	{
7926 		.name = "cfs_period_us",
7927 		.read_u64 = cpu_cfs_period_read_u64,
7928 		.write_u64 = cpu_cfs_period_write_u64,
7929 	},
7930 	{
7931 		.name = "stat",
7932 		.read_map = cpu_stats_show,
7933 	},
7934 #endif
7935 #ifdef CONFIG_RT_GROUP_SCHED
7936 	{
7937 		.name = "rt_runtime_us",
7938 		.read_s64 = cpu_rt_runtime_read,
7939 		.write_s64 = cpu_rt_runtime_write,
7940 	},
7941 	{
7942 		.name = "rt_period_us",
7943 		.read_u64 = cpu_rt_period_read_uint,
7944 		.write_u64 = cpu_rt_period_write_uint,
7945 	},
7946 #endif
7947 };
7948 
7949 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7950 {
7951 	return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
7952 }
7953 
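/*
 * Registration of the "cpu" cgroup subsystem.  early_init requests
 * initialization from cgroup_init_early() so the subsystem is available
 * during early boot.
 */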
7954 struct cgroup_subsys cpu_cgroup_subsys = {
7955 	.name		= "cpu",
7956 	.create		= cpu_cgroup_create,
7957 	.destroy	= cpu_cgroup_destroy,
7958 	.can_attach	= cpu_cgroup_can_attach,
7959 	.attach		= cpu_cgroup_attach,
7960 	.exit		= cpu_cgroup_exit,
7961 	.populate	= cpu_cgroup_populate,
7962 	.subsys_id	= cpu_cgroup_subsys_id,
7963 	.early_init	= 1,
7964 };
7965 
7966 #endif	/* CONFIG_CGROUP_SCHED */
7967 
7968 #ifdef CONFIG_CGROUP_CPUACCT
7969 
7970 /*
7971  * CPU accounting code for task groups.
7972  *
7973  * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
7974  * (balbir@in.ibm.com).
7975  */
7976 
7977 /* create a new cpu accounting group */
7978 static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
7979 {
7980 	struct cpuacct *ca;
7981 
7982 	if (!cgrp->parent)
7983 		return &root_cpuacct.css;
7984 
7985 	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
7986 	if (!ca)
7987 		goto out;
7988 
7989 	ca->cpuusage = alloc_percpu(u64);
7990 	if (!ca->cpuusage)
7991 		goto out_free_ca;
7992 
7993 	ca->cpustat = alloc_percpu(struct kernel_cpustat);
7994 	if (!ca->cpustat)
7995 		goto out_free_cpuusage;
7996 
7997 	return &ca->css;
7998 
7999 out_free_cpuusage:
8000 	free_percpu(ca->cpuusage);
8001 out_free_ca:
8002 	kfree(ca);
8003 out:
8004 	return ERR_PTR(-ENOMEM);
8005 }
8006 
8007 /* destroy an existing cpu accounting group */
8008 static void cpuacct_destroy(struct cgroup *cgrp)
8009 {
8010 	struct cpuacct *ca = cgroup_ca(cgrp);
8011 
8012 	free_percpu(ca->cpustat);
8013 	free_percpu(ca->cpuusage);
8014 	kfree(ca);
8015 }
8016 
8017 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
8018 {
8019 	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8020 	u64 data;
8021 
8022 #ifndef CONFIG_64BIT
8023 	/*
8024 	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
8025 	 */
8026 	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8027 	data = *cpuusage;
8028 	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8029 #else
8030 	data = *cpuusage;
8031 #endif
8032 
8033 	return data;
8034 }
8035 
8036 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8037 {
8038 	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8039 
8040 #ifndef CONFIG_64BIT
8041 	/*
8042 	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
8043 	 */
8044 	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8045 	*cpuusage = val;
8046 	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8047 #else
8048 	*cpuusage = val;
8049 #endif
8050 }
8051 
8052 /* return total cpu usage (in nanoseconds) of a group */
8053 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8054 {
8055 	struct cpuacct *ca = cgroup_ca(cgrp);
8056 	u64 totalcpuusage = 0;
8057 	int i;
8058 
8059 	for_each_present_cpu(i)
8060 		totalcpuusage += cpuacct_cpuusage_read(ca, i);
8061 
8062 	return totalcpuusage;
8063 }
8064 
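/*
 * Writing to "cpuacct.usage" only supports a reset: the value written must
 * be 0, in which case every present CPU's counter is cleared; any other
 * value is rejected with -EINVAL.
 */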
8065 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8066 								u64 reset)
8067 {
8068 	struct cpuacct *ca = cgroup_ca(cgrp);
8069 	int err = 0;
8070 	int i;
8071 
8072 	if (reset) {
8073 		err = -EINVAL;
8074 		goto out;
8075 	}
8076 
8077 	for_each_present_cpu(i)
8078 		cpuacct_cpuusage_write(ca, i, 0);
8079 
8080 out:
8081 	return err;
8082 }
8083 
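/*
 * "cpuacct.usage_percpu": print one space-separated usage value (in
 * nanoseconds) per present CPU on a single line.
 */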
8084 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8085 				   struct seq_file *m)
8086 {
8087 	struct cpuacct *ca = cgroup_ca(cgroup);
8088 	u64 percpu;
8089 	int i;
8090 
8091 	for_each_present_cpu(i) {
8092 		percpu = cpuacct_cpuusage_read(ca, i);
8093 		seq_printf(m, "%llu ", (unsigned long long) percpu);
8094 	}
8095 	seq_printf(m, "\n");
8096 	return 0;
8097 }
8098 
8099 static const char *cpuacct_stat_desc[] = {
8100 	[CPUACCT_STAT_USER] = "user",
8101 	[CPUACCT_STAT_SYSTEM] = "system",
8102 };
8103 
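/*
 * "cpuacct.stat": report accumulated user time (user + nice) and system
 * time (system + irq + softirq) summed over the online CPUs, converted
 * from cputime to clock ticks.
 */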
8104 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8105 			      struct cgroup_map_cb *cb)
8106 {
8107 	struct cpuacct *ca = cgroup_ca(cgrp);
8108 	int cpu;
8109 	s64 val = 0;
8110 
8111 	for_each_online_cpu(cpu) {
8112 		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8113 		val += kcpustat->cpustat[CPUTIME_USER];
8114 		val += kcpustat->cpustat[CPUTIME_NICE];
8115 	}
8116 	val = cputime64_to_clock_t(val);
8117 	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8118 
8119 	val = 0;
8120 	for_each_online_cpu(cpu) {
8121 		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8122 		val += kcpustat->cpustat[CPUTIME_SYSTEM];
8123 		val += kcpustat->cpustat[CPUTIME_IRQ];
8124 		val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8125 	}
8126 
8127 	val = cputime64_to_clock_t(val);
8128 	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8129 
8130 	return 0;
8131 }
8132 
8133 static struct cftype files[] = {
8134 	{
8135 		.name = "usage",
8136 		.read_u64 = cpuusage_read,
8137 		.write_u64 = cpuusage_write,
8138 	},
8139 	{
8140 		.name = "usage_percpu",
8141 		.read_seq_string = cpuacct_percpu_seq_read,
8142 	},
8143 	{
8144 		.name = "stat",
8145 		.read_map = cpuacct_stats_show,
8146 	},
8147 };
8148 
8149 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8150 {
8151 	return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8152 }
8153 
8154 /*
8155  * charge this task's execution time to its accounting group.
8156  *
8157  * called with rq->lock held.
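 *
 * Charges propagate up the cpuacct hierarchy: each ancestor group's
 * per-cpu counter is incremented as well.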
8158  */
8159 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8160 {
8161 	struct cpuacct *ca;
8162 	int cpu;
8163 
8164 	if (unlikely(!cpuacct_subsys.active))
8165 		return;
8166 
8167 	cpu = task_cpu(tsk);
8168 
8169 	rcu_read_lock();
8170 
8171 	ca = task_ca(tsk);
8172 
8173 	for (; ca; ca = parent_ca(ca)) {
8174 		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8175 		*cpuusage += cputime;
8176 	}
8177 
8178 	rcu_read_unlock();
8179 }
8180 
8181 struct cgroup_subsys cpuacct_subsys = {
8182 	.name = "cpuacct",
8183 	.create = cpuacct_create,
8184 	.destroy = cpuacct_destroy,
8185 	.populate = cpuacct_populate,
8186 	.subsys_id = cpuacct_subsys_id,
8187 };
8188 #endif	/* CONFIG_CGROUP_CPUACCT */
8189