xref: /linux/kernel/sched/rt.c (revision e3b2949e3fa2fd8c19cd5fbb0424d38f70a70e9c)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4   * policies)
5   */
6  
7  int sched_rr_timeslice = RR_TIMESLICE;
8  /* More than 4 hours if BW_SHIFT equals 20. */
9  static const u64 max_rt_runtime = MAX_BW;
10  
11  /*
12   * period over which we measure -rt task CPU usage in us.
13   * default: 1s
14   */
15  int sysctl_sched_rt_period = 1000000;
16  
17  /*
18   * part of the period that we allow rt tasks to run in us.
19   * default: 0.95s
20   */
21  int sysctl_sched_rt_runtime = 950000;
22  
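/*
 * Illustrative sketch, not part of the kernel build (the helper name is
 * made up): with the defaults above, RT tasks may consume
 * runtime/period = 950000/1000000 = 95% of every second, leaving 5% for
 * non-RT work; a runtime of -1 means RUNTIME_INF, i.e. no throttling.
 */
static inline int rt_bw_percent_sketch(int runtime_us, int period_us)
{
	/* rt_bw_percent_sketch(950000, 1000000) == 95 */
	if (runtime_us < 0)
		return 100;	/* RUNTIME_INF: cap disabled */
	return (int)((100LL * runtime_us) / period_us);
}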
23  #ifdef CONFIG_SYSCTL
24  static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
25  static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
26  		size_t *lenp, loff_t *ppos);
27  static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
28  		size_t *lenp, loff_t *ppos);
29  static struct ctl_table sched_rt_sysctls[] = {
30  	{
31  		.procname       = "sched_rt_period_us",
32  		.data           = &sysctl_sched_rt_period,
33  		.maxlen         = sizeof(int),
34  		.mode           = 0644,
35  		.proc_handler   = sched_rt_handler,
36  		.extra1         = SYSCTL_ONE,
37  		.extra2         = SYSCTL_INT_MAX,
38  	},
39  	{
40  		.procname       = "sched_rt_runtime_us",
41  		.data           = &sysctl_sched_rt_runtime,
42  		.maxlen         = sizeof(int),
43  		.mode           = 0644,
44  		.proc_handler   = sched_rt_handler,
45  		.extra1         = SYSCTL_NEG_ONE,
46  		.extra2         = (void *)&sysctl_sched_rt_period,
47  	},
48  	{
49  		.procname       = "sched_rr_timeslice_ms",
50  		.data           = &sysctl_sched_rr_timeslice,
51  		.maxlen         = sizeof(int),
52  		.mode           = 0644,
53  		.proc_handler   = sched_rr_handler,
54  	},
55  };
56  
57  static int __init sched_rt_sysctl_init(void)
58  {
59  	register_sysctl_init("kernel", sched_rt_sysctls);
60  	return 0;
61  }
62  late_initcall(sched_rt_sysctl_init);
63  #endif
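/*
 * Hedged userspace sketch (a standalone program, not kernel code; the
 * function name is made up): the table above surfaces these knobs as
 * /proc/sys/kernel/sched_rt_period_us, sched_rt_runtime_us and
 * sched_rr_timeslice_ms.  Reading the first two from C could look like:
 */
#include <stdio.h>

int print_rt_sysctls_sketch(void)
{
	int period = 0, runtime = 0;
	FILE *f;

	f = fopen("/proc/sys/kernel/sched_rt_period_us", "r");
	if (f) {
		fscanf(f, "%d", &period);
		fclose(f);
	}
	f = fopen("/proc/sys/kernel/sched_rt_runtime_us", "r");
	if (f) {
		fscanf(f, "%d", &runtime);
		fclose(f);
	}

	printf("RT tasks may use %d us out of every %d us\n", runtime, period);
	return 0;
}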
64  
65  void init_rt_rq(struct rt_rq *rt_rq)
66  {
67  	struct rt_prio_array *array;
68  	int i;
69  
70  	array = &rt_rq->active;
71  	for (i = 0; i < MAX_RT_PRIO; i++) {
72  		INIT_LIST_HEAD(array->queue + i);
73  		__clear_bit(i, array->bitmap);
74  	}
75  	/* delimiter for bitsearch: */
76  	__set_bit(MAX_RT_PRIO, array->bitmap);
77  
78  #if defined CONFIG_SMP
79  	rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
80  	rt_rq->highest_prio.next = MAX_RT_PRIO-1;
81  	rt_rq->overloaded = 0;
82  	plist_head_init(&rt_rq->pushable_tasks);
83  #endif /* CONFIG_SMP */
84  	/* We start in dequeued state, because no RT tasks are queued */
85  	rt_rq->rt_queued = 0;
86  
87  #ifdef CONFIG_RT_GROUP_SCHED
88  	rt_rq->rt_time = 0;
89  	rt_rq->rt_throttled = 0;
90  	rt_rq->rt_runtime = 0;
91  	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
92  #endif
93  }
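/*
 * Worked example for the delimiter bit above (a sketch; it relies on
 * rt_prio_array declaring MAX_RT_PRIO + 1 bitmap bits): with a single
 * prio-5 entity queued, bits 5 and MAX_RT_PRIO are set and
 * sched_find_first_bit() returns 5; with nothing queued it returns
 * MAX_RT_PRIO, which the BUG_ON(idx >= MAX_RT_PRIO) in
 * pick_next_rt_entity() relies on to catch a pick from an empty queue.
 */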
94  
95  #ifdef CONFIG_RT_GROUP_SCHED
96  
97  static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
98  
99  static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
100  {
101  	struct rt_bandwidth *rt_b =
102  		container_of(timer, struct rt_bandwidth, rt_period_timer);
103  	int idle = 0;
104  	int overrun;
105  
106  	raw_spin_lock(&rt_b->rt_runtime_lock);
107  	for (;;) {
108  		overrun = hrtimer_forward_now(timer, rt_b->rt_period);
109  		if (!overrun)
110  			break;
111  
112  		raw_spin_unlock(&rt_b->rt_runtime_lock);
113  		idle = do_sched_rt_period_timer(rt_b, overrun);
114  		raw_spin_lock(&rt_b->rt_runtime_lock);
115  	}
116  	if (idle)
117  		rt_b->rt_period_active = 0;
118  	raw_spin_unlock(&rt_b->rt_runtime_lock);
119  
120  	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
121  }
122  
123  void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
124  {
125  	rt_b->rt_period = ns_to_ktime(period);
126  	rt_b->rt_runtime = runtime;
127  
128  	raw_spin_lock_init(&rt_b->rt_runtime_lock);
129  
130  	hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
131  		     HRTIMER_MODE_REL_HARD);
132  	rt_b->rt_period_timer.function = sched_rt_period_timer;
133  }
134  
135  static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
136  {
137  	raw_spin_lock(&rt_b->rt_runtime_lock);
138  	if (!rt_b->rt_period_active) {
139  		rt_b->rt_period_active = 1;
140  		/*
141  		 * SCHED_DEADLINE updates the bandwidth, as a runaway RT
142  		 * task running alongside a DL task could hog a CPU. But DL does
143  		 * not reset the period. If a deadline task was running
144  		 * without an RT task running, it can cause RT tasks to
145  		 * throttle when they start up. Kick the timer right away
146  		 * to update the period.
147  		 */
148  		hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
149  		hrtimer_start_expires(&rt_b->rt_period_timer,
150  				      HRTIMER_MODE_ABS_PINNED_HARD);
151  	}
152  	raw_spin_unlock(&rt_b->rt_runtime_lock);
153  }
154  
155  static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
156  {
157  	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
158  		return;
159  
160  	do_start_rt_bandwidth(rt_b);
161  }
162  
163  static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
164  {
165  	hrtimer_cancel(&rt_b->rt_period_timer);
166  }
167  
168  #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
169  
170  static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
171  {
172  #ifdef CONFIG_SCHED_DEBUG
173  	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
174  #endif
175  	return container_of(rt_se, struct task_struct, rt);
176  }
177  
178  static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
179  {
180  	return rt_rq->rq;
181  }
182  
183  static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
184  {
185  	return rt_se->rt_rq;
186  }
187  
188  static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
189  {
190  	struct rt_rq *rt_rq = rt_se->rt_rq;
191  
192  	return rt_rq->rq;
193  }
194  
195  void unregister_rt_sched_group(struct task_group *tg)
196  {
197  	if (tg->rt_se)
198  		destroy_rt_bandwidth(&tg->rt_bandwidth);
199  }
200  
201  void free_rt_sched_group(struct task_group *tg)
202  {
203  	int i;
204  
205  	for_each_possible_cpu(i) {
206  		if (tg->rt_rq)
207  			kfree(tg->rt_rq[i]);
208  		if (tg->rt_se)
209  			kfree(tg->rt_se[i]);
210  	}
211  
212  	kfree(tg->rt_rq);
213  	kfree(tg->rt_se);
214  }
215  
216  void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
217  		struct sched_rt_entity *rt_se, int cpu,
218  		struct sched_rt_entity *parent)
219  {
220  	struct rq *rq = cpu_rq(cpu);
221  
222  	rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
223  	rt_rq->rt_nr_boosted = 0;
224  	rt_rq->rq = rq;
225  	rt_rq->tg = tg;
226  
227  	tg->rt_rq[cpu] = rt_rq;
228  	tg->rt_se[cpu] = rt_se;
229  
230  	if (!rt_se)
231  		return;
232  
233  	if (!parent)
234  		rt_se->rt_rq = &rq->rt;
235  	else
236  		rt_se->rt_rq = parent->my_q;
237  
238  	rt_se->my_q = rt_rq;
239  	rt_se->parent = parent;
240  	INIT_LIST_HEAD(&rt_se->run_list);
241  }
242  
243  int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
244  {
245  	struct rt_rq *rt_rq;
246  	struct sched_rt_entity *rt_se;
247  	int i;
248  
249  	tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
250  	if (!tg->rt_rq)
251  		goto err;
252  	tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
253  	if (!tg->rt_se)
254  		goto err;
255  
256  	init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0);
257  
258  	for_each_possible_cpu(i) {
259  		rt_rq = kzalloc_node(sizeof(struct rt_rq),
260  				     GFP_KERNEL, cpu_to_node(i));
261  		if (!rt_rq)
262  			goto err;
263  
264  		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
265  				     GFP_KERNEL, cpu_to_node(i));
266  		if (!rt_se)
267  			goto err_free_rq;
268  
269  		init_rt_rq(rt_rq);
270  		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
271  		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
272  	}
273  
274  	return 1;
275  
276  err_free_rq:
277  	kfree(rt_rq);
278  err:
279  	return 0;
280  }
281  
282  #else /* CONFIG_RT_GROUP_SCHED */
283  
284  #define rt_entity_is_task(rt_se) (1)
285  
286  static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
287  {
288  	return container_of(rt_se, struct task_struct, rt);
289  }
290  
291  static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
292  {
293  	return container_of(rt_rq, struct rq, rt);
294  }
295  
296  static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
297  {
298  	struct task_struct *p = rt_task_of(rt_se);
299  
300  	return task_rq(p);
301  }
302  
303  static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
304  {
305  	struct rq *rq = rq_of_rt_se(rt_se);
306  
307  	return &rq->rt;
308  }
309  
310  void unregister_rt_sched_group(struct task_group *tg) { }
311  
312  void free_rt_sched_group(struct task_group *tg) { }
313  
314  int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
315  {
316  	return 1;
317  }
318  #endif /* CONFIG_RT_GROUP_SCHED */
319  
320  #ifdef CONFIG_SMP
321  
322  static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
323  {
324  	/* Try to pull RT tasks here if we lower this rq's prio */
325  	return rq->online && rq->rt.highest_prio.curr > prev->prio;
326  }
327  
328  static inline int rt_overloaded(struct rq *rq)
329  {
330  	return atomic_read(&rq->rd->rto_count);
331  }
332  
333  static inline void rt_set_overload(struct rq *rq)
334  {
335  	if (!rq->online)
336  		return;
337  
338  	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
339  	/*
340  	 * Make sure the mask is visible before we set
341  	 * the overload count. That is checked to determine
342  	 * if we should look at the mask. It would be a shame
343  	 * if we looked at the mask, but the mask was not
344  	 * updated yet.
345  	 *
346  	 * Matched by the barrier in pull_rt_task().
347  	 */
348  	smp_wmb();
349  	atomic_inc(&rq->rd->rto_count);
350  }
351  
352  static inline void rt_clear_overload(struct rq *rq)
353  {
354  	if (!rq->online)
355  		return;
356  
357  	/* the order here really doesn't matter */
358  	atomic_dec(&rq->rd->rto_count);
359  	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
360  }
361  
362  static inline int has_pushable_tasks(struct rq *rq)
363  {
364  	return !plist_head_empty(&rq->rt.pushable_tasks);
365  }
366  
367  static DEFINE_PER_CPU(struct balance_callback, rt_push_head);
368  static DEFINE_PER_CPU(struct balance_callback, rt_pull_head);
369  
370  static void push_rt_tasks(struct rq *);
371  static void pull_rt_task(struct rq *);
372  
373  static inline void rt_queue_push_tasks(struct rq *rq)
374  {
375  	if (!has_pushable_tasks(rq))
376  		return;
377  
378  	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
379  }
380  
381  static inline void rt_queue_pull_task(struct rq *rq)
382  {
383  	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
384  }
385  
386  static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
387  {
388  	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
389  	plist_node_init(&p->pushable_tasks, p->prio);
390  	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
391  
392  	/* Update the highest prio pushable task */
393  	if (p->prio < rq->rt.highest_prio.next)
394  		rq->rt.highest_prio.next = p->prio;
395  
396  	if (!rq->rt.overloaded) {
397  		rt_set_overload(rq);
398  		rq->rt.overloaded = 1;
399  	}
400  }
401  
402  static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
403  {
404  	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
405  
406  	/* Update the new highest prio pushable task */
407  	if (has_pushable_tasks(rq)) {
408  		p = plist_first_entry(&rq->rt.pushable_tasks,
409  				      struct task_struct, pushable_tasks);
410  		rq->rt.highest_prio.next = p->prio;
411  	} else {
412  		rq->rt.highest_prio.next = MAX_RT_PRIO-1;
413  
414  		if (rq->rt.overloaded) {
415  			rt_clear_overload(rq);
416  			rq->rt.overloaded = 0;
417  		}
418  	}
419  }
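/*
 * Worked example for the two helpers above (prio numbers assumed): the
 * pushable list is a plist sorted by ascending p->prio, so the head is
 * always the numerically lowest, i.e. highest-priority, pushable task.
 * With tasks of prio 10, 40 and 98 queued, highest_prio.next is 10;
 * after the prio-10 task is dequeued, plist_first_entry() yields the
 * prio-40 task and highest_prio.next becomes 40.
 */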
420  
421  #else
422  
423  static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
424  {
425  }
426  
427  static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
428  {
429  }
430  
431  static inline void rt_queue_push_tasks(struct rq *rq)
432  {
433  }
434  #endif /* CONFIG_SMP */
435  
436  static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
437  static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
438  
439  static inline int on_rt_rq(struct sched_rt_entity *rt_se)
440  {
441  	return rt_se->on_rq;
442  }
443  
444  #ifdef CONFIG_UCLAMP_TASK
445  /*
446   * Verify the fitness of task @p to run on @cpu taking into account the uclamp
447   * settings.
448   *
449   * This check is only important for heterogeneous systems where the uclamp_min
450   * value is higher than the capacity of @cpu. For non-heterogeneous systems this
451   * function will always return true.
452   *
453   * The function will return true if the capacity of the @cpu is >= the
454   * uclamp_min and false otherwise.
455   *
456   * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
457   * > uclamp_max.
458   */
459  static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
460  {
461  	unsigned int min_cap;
462  	unsigned int max_cap;
463  	unsigned int cpu_cap;
464  
465  	/* Only heterogeneous systems can benefit from this check */
466  	if (!sched_asym_cpucap_active())
467  		return true;
468  
469  	min_cap = uclamp_eff_value(p, UCLAMP_MIN);
470  	max_cap = uclamp_eff_value(p, UCLAMP_MAX);
471  
472  	cpu_cap = arch_scale_cpu_capacity(cpu);
473  
474  	return cpu_cap >= min(min_cap, max_cap);
475  }
476  #else
477  static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
478  {
479  	return true;
480  }
481  #endif
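/*
 * A small standalone sketch of the fitness rule above (helper name and
 * capacity values are assumed, big.LITTLE style): a CPU fits the task
 * when its capacity covers the clamped minimum request.
 */
static inline bool rt_fits_capacity_sketch(unsigned int cpu_cap,
					   unsigned int min_cap,
					   unsigned int max_cap)
{
	/*
	 * rt_fits_capacity_sketch(512, 768, 1024)  == false (little CPU)
	 * rt_fits_capacity_sketch(1024, 768, 1024) == true  (big CPU)
	 * rt_fits_capacity_sketch(512, 768, 256)   == true  (min clamped to 256)
	 */
	return cpu_cap >= min(min_cap, max_cap);
}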
482  
483  #ifdef CONFIG_RT_GROUP_SCHED
484  
485  static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
486  {
487  	if (!rt_rq->tg)
488  		return RUNTIME_INF;
489  
490  	return rt_rq->rt_runtime;
491  }
492  
493  static inline u64 sched_rt_period(struct rt_rq *rt_rq)
494  {
495  	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
496  }
497  
498  typedef struct task_group *rt_rq_iter_t;
499  
500  static inline struct task_group *next_task_group(struct task_group *tg)
501  {
502  	do {
503  		tg = list_entry_rcu(tg->list.next,
504  			typeof(struct task_group), list);
505  	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));
506  
507  	if (&tg->list == &task_groups)
508  		tg = NULL;
509  
510  	return tg;
511  }
512  
513  #define for_each_rt_rq(rt_rq, iter, rq)					\
514  	for (iter = container_of(&task_groups, typeof(*iter), list);	\
515  		(iter = next_task_group(iter)) &&			\
516  		(rt_rq = iter->rt_rq[cpu_of(rq)]);)
517  
518  #define for_each_sched_rt_entity(rt_se) \
519  	for (; rt_se; rt_se = rt_se->parent)
520  
521  static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
522  {
523  	return rt_se->my_q;
524  }
525  
526  static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
527  static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
528  
529  static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
530  {
531  	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
532  	struct rq *rq = rq_of_rt_rq(rt_rq);
533  	struct sched_rt_entity *rt_se;
534  
535  	int cpu = cpu_of(rq);
536  
537  	rt_se = rt_rq->tg->rt_se[cpu];
538  
539  	if (rt_rq->rt_nr_running) {
540  		if (!rt_se)
541  			enqueue_top_rt_rq(rt_rq);
542  		else if (!on_rt_rq(rt_se))
543  			enqueue_rt_entity(rt_se, 0);
544  
545  		if (rt_rq->highest_prio.curr < curr->prio)
546  			resched_curr(rq);
547  	}
548  }
549  
550  static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
551  {
552  	struct sched_rt_entity *rt_se;
553  	int cpu = cpu_of(rq_of_rt_rq(rt_rq));
554  
555  	rt_se = rt_rq->tg->rt_se[cpu];
556  
557  	if (!rt_se) {
558  		dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
559  		/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
560  		cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
561  	}
562  	else if (on_rt_rq(rt_se))
563  		dequeue_rt_entity(rt_se, 0);
564  }
565  
566  static inline int rt_rq_throttled(struct rt_rq *rt_rq)
567  {
568  	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
569  }
570  
571  static int rt_se_boosted(struct sched_rt_entity *rt_se)
572  {
573  	struct rt_rq *rt_rq = group_rt_rq(rt_se);
574  	struct task_struct *p;
575  
576  	if (rt_rq)
577  		return !!rt_rq->rt_nr_boosted;
578  
579  	p = rt_task_of(rt_se);
580  	return p->prio != p->normal_prio;
581  }
582  
583  #ifdef CONFIG_SMP
584  static inline const struct cpumask *sched_rt_period_mask(void)
585  {
586  	return this_rq()->rd->span;
587  }
588  #else
589  static inline const struct cpumask *sched_rt_period_mask(void)
590  {
591  	return cpu_online_mask;
592  }
593  #endif
594  
595  static inline
596  struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
597  {
598  	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
599  }
600  
601  static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
602  {
603  	return &rt_rq->tg->rt_bandwidth;
604  }
605  
606  bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
607  {
608  	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
609  
610  	return (hrtimer_active(&rt_b->rt_period_timer) ||
611  		rt_rq->rt_time < rt_b->rt_runtime);
612  }
613  
614  #ifdef CONFIG_SMP
615  /*
616   * We ran out of runtime, see if we can borrow some from our neighbours.
617   */
618  static void do_balance_runtime(struct rt_rq *rt_rq)
619  {
620  	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
621  	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
622  	int i, weight;
623  	u64 rt_period;
624  
625  	weight = cpumask_weight(rd->span);
626  
627  	raw_spin_lock(&rt_b->rt_runtime_lock);
628  	rt_period = ktime_to_ns(rt_b->rt_period);
629  	for_each_cpu(i, rd->span) {
630  		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
631  		s64 diff;
632  
633  		if (iter == rt_rq)
634  			continue;
635  
636  		raw_spin_lock(&iter->rt_runtime_lock);
637  		/*
638  		 * Either all rqs have inf runtime and there's nothing to steal
639  		 * or __disable_runtime() below sets a specific rq to inf to
640  		 * indicate its been disabled and disallow stealing.
641  		 * indicate it's been disabled and disallow stealing.
642  		if (iter->rt_runtime == RUNTIME_INF)
643  			goto next;
644  
645  		/*
646  		 * From runqueues with spare time, take 1/n part of their
647  		 * spare time, but no more than our period.
648  		 */
649  		diff = iter->rt_runtime - iter->rt_time;
650  		if (diff > 0) {
651  			diff = div_u64((u64)diff, weight);
652  			if (rt_rq->rt_runtime + diff > rt_period)
653  				diff = rt_period - rt_rq->rt_runtime;
654  			iter->rt_runtime -= diff;
655  			rt_rq->rt_runtime += diff;
656  			if (rt_rq->rt_runtime == rt_period) {
657  				raw_spin_unlock(&iter->rt_runtime_lock);
658  				break;
659  			}
660  		}
661  next:
662  		raw_spin_unlock(&iter->rt_runtime_lock);
663  	}
664  	raw_spin_unlock(&rt_b->rt_runtime_lock);
665  }
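/*
 * Minimal sketch of the borrowing arithmetic above (standalone, helper
 * name made up; assumes weight >= 1): a starved runqueue takes 1/weight
 * of a neighbour's spare runtime, but its total runtime never exceeds
 * the period.
 */
static inline u64 rt_borrow_sketch(u64 my_runtime, u64 neighbour_spare,
				   int weight, u64 period)
{
	u64 diff = div_u64(neighbour_spare, weight);

	if (my_runtime + diff > period)
		diff = period - my_runtime;

	/* e.g. spare = 300000, weight = 4, period = 1000000 -> borrow 75000 */
	return my_runtime + diff;
}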
666  
667  /*
668   * Ensure this RQ takes back all the runtime it lent to its neighbours.
669   */
670  static void __disable_runtime(struct rq *rq)
671  {
672  	struct root_domain *rd = rq->rd;
673  	rt_rq_iter_t iter;
674  	struct rt_rq *rt_rq;
675  
676  	if (unlikely(!scheduler_running))
677  		return;
678  
679  	for_each_rt_rq(rt_rq, iter, rq) {
680  		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
681  		s64 want;
682  		int i;
683  
684  		raw_spin_lock(&rt_b->rt_runtime_lock);
685  		raw_spin_lock(&rt_rq->rt_runtime_lock);
686  		/*
687  		 * Either we're all inf and nobody needs to borrow, or we're
688  		 * already disabled and thus have nothing to do, or we have
689  		 * exactly the right amount of runtime to take out.
690  		 */
691  		if (rt_rq->rt_runtime == RUNTIME_INF ||
692  				rt_rq->rt_runtime == rt_b->rt_runtime)
693  			goto balanced;
694  		raw_spin_unlock(&rt_rq->rt_runtime_lock);
695  
696  		/*
697  		 * Calculate the difference between what we started out with
698  		 * and what we currently have; that's the amount of runtime
699  		 * we lent and now have to reclaim.
700  		 */
701  		want = rt_b->rt_runtime - rt_rq->rt_runtime;
702  
703  		/*
704  		 * Greedy reclaim, take back as much as we can.
705  		 */
706  		for_each_cpu(i, rd->span) {
707  			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
708  			s64 diff;
709  
710  			/*
711  			 * Can't reclaim from ourselves or disabled runqueues.
712  			 */
713  			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
714  				continue;
715  
716  			raw_spin_lock(&iter->rt_runtime_lock);
717  			if (want > 0) {
718  				diff = min_t(s64, iter->rt_runtime, want);
719  				iter->rt_runtime -= diff;
720  				want -= diff;
721  			} else {
722  				iter->rt_runtime -= want;
723  				want -= want;
724  			}
725  			raw_spin_unlock(&iter->rt_runtime_lock);
726  
727  			if (!want)
728  				break;
729  		}
730  
731  		raw_spin_lock(&rt_rq->rt_runtime_lock);
732  		/*
733  		 * We cannot be left wanting - that would mean some runtime
734  		 * leaked out of the system.
735  		 */
736  		WARN_ON_ONCE(want);
737  balanced:
738  		/*
739  		 * Disable all the borrow logic by pretending we have inf
740  		 * runtime - in which case borrowing doesn't make sense.
741  		 */
742  		rt_rq->rt_runtime = RUNTIME_INF;
743  		rt_rq->rt_throttled = 0;
744  		raw_spin_unlock(&rt_rq->rt_runtime_lock);
745  		raw_spin_unlock(&rt_b->rt_runtime_lock);
746  
747  		/* Make rt_rq available for pick_next_task() */
748  		sched_rt_rq_enqueue(rt_rq);
749  	}
750  }
751  
752  static void __enable_runtime(struct rq *rq)
753  {
754  	rt_rq_iter_t iter;
755  	struct rt_rq *rt_rq;
756  
757  	if (unlikely(!scheduler_running))
758  		return;
759  
760  	/*
761  	 * Reset each runqueue's bandwidth settings
762  	 */
763  	for_each_rt_rq(rt_rq, iter, rq) {
764  		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
765  
766  		raw_spin_lock(&rt_b->rt_runtime_lock);
767  		raw_spin_lock(&rt_rq->rt_runtime_lock);
768  		rt_rq->rt_runtime = rt_b->rt_runtime;
769  		rt_rq->rt_time = 0;
770  		rt_rq->rt_throttled = 0;
771  		raw_spin_unlock(&rt_rq->rt_runtime_lock);
772  		raw_spin_unlock(&rt_b->rt_runtime_lock);
773  	}
774  }
775  
776  static void balance_runtime(struct rt_rq *rt_rq)
777  {
778  	if (!sched_feat(RT_RUNTIME_SHARE))
779  		return;
780  
781  	if (rt_rq->rt_time > rt_rq->rt_runtime) {
782  		raw_spin_unlock(&rt_rq->rt_runtime_lock);
783  		do_balance_runtime(rt_rq);
784  		raw_spin_lock(&rt_rq->rt_runtime_lock);
785  	}
786  }
787  #else /* !CONFIG_SMP */
788  static inline void balance_runtime(struct rt_rq *rt_rq) {}
789  #endif /* CONFIG_SMP */
790  
791  static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
792  {
793  	int i, idle = 1, throttled = 0;
794  	const struct cpumask *span;
795  
796  	span = sched_rt_period_mask();
797  
798  	/*
799  	 * FIXME: isolated CPUs should really leave the root task group,
800  	 * whether they are isolcpus or were isolated via cpusets, lest
801  	 * the timer run on a CPU which does not service all runqueues,
802  	 * potentially leaving other CPUs indefinitely throttled.  If
803  	 * isolation is really required, the user will turn the throttle
804  	 * off to kill the perturbations it causes anyway.  Meanwhile,
805  	 * this maintains functionality for boot and/or troubleshooting.
806  	 */
807  	if (rt_b == &root_task_group.rt_bandwidth)
808  		span = cpu_online_mask;
809  
810  	for_each_cpu(i, span) {
811  		int enqueue = 0;
812  		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
813  		struct rq *rq = rq_of_rt_rq(rt_rq);
814  		struct rq_flags rf;
815  		int skip;
816  
817  		/*
818  		 * When span == cpu_online_mask, taking each rq->lock
819  		 * can be time-consuming. Try to avoid it when possible.
820  		 */
821  		raw_spin_lock(&rt_rq->rt_runtime_lock);
822  		if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
823  			rt_rq->rt_runtime = rt_b->rt_runtime;
824  		skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
825  		raw_spin_unlock(&rt_rq->rt_runtime_lock);
826  		if (skip)
827  			continue;
828  
829  		rq_lock(rq, &rf);
830  		update_rq_clock(rq);
831  
832  		if (rt_rq->rt_time) {
833  			u64 runtime;
834  
835  			raw_spin_lock(&rt_rq->rt_runtime_lock);
836  			if (rt_rq->rt_throttled)
837  				balance_runtime(rt_rq);
838  			runtime = rt_rq->rt_runtime;
839  			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
840  			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
841  				rt_rq->rt_throttled = 0;
842  				enqueue = 1;
843  
844  				/*
845  				 * When we're idle and a woken (rt) task is
846  				 * throttled, wakeup_preempt() will set
847  				 * skip_update and the time between the wakeup
848  				 * and this unthrottle will get accounted as
849  				 * 'runtime'.
850  				 */
851  				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
852  					rq_clock_cancel_skipupdate(rq);
853  			}
854  			if (rt_rq->rt_time || rt_rq->rt_nr_running)
855  				idle = 0;
856  			raw_spin_unlock(&rt_rq->rt_runtime_lock);
857  		} else if (rt_rq->rt_nr_running) {
858  			idle = 0;
859  			if (!rt_rq_throttled(rt_rq))
860  				enqueue = 1;
861  		}
862  		if (rt_rq->rt_throttled)
863  			throttled = 1;
864  
865  		if (enqueue)
866  			sched_rt_rq_enqueue(rt_rq);
867  		rq_unlock(rq, &rf);
868  	}
869  
870  	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
871  		return 1;
872  
873  	return idle;
874  }
875  
876  static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
877  {
878  	u64 runtime = sched_rt_runtime(rt_rq);
879  
880  	if (rt_rq->rt_throttled)
881  		return rt_rq_throttled(rt_rq);
882  
883  	if (runtime >= sched_rt_period(rt_rq))
884  		return 0;
885  
886  	balance_runtime(rt_rq);
887  	runtime = sched_rt_runtime(rt_rq);
888  	if (runtime == RUNTIME_INF)
889  		return 0;
890  
891  	if (rt_rq->rt_time > runtime) {
892  		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
893  
894  		/*
895  		 * Don't actually throttle groups that have no runtime assigned
896  		 * but accrue some time due to boosting.
897  		 */
898  		if (likely(rt_b->rt_runtime)) {
899  			rt_rq->rt_throttled = 1;
900  			printk_deferred_once("sched: RT throttling activated\n");
901  		} else {
902  			/*
903  			 * In case we did anyway, make it go away,
904  			 * replenishment is a joke, since it will replenish us
905  			 * with exactly 0 ns.
906  			 */
907  			rt_rq->rt_time = 0;
908  		}
909  
910  		if (rt_rq_throttled(rt_rq)) {
911  			sched_rt_rq_dequeue(rt_rq);
912  			return 1;
913  		}
914  	}
915  
916  	return 0;
917  }
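/*
 * Worked example for the check above (assuming the defaults at the top
 * of this file and RT_RUNTIME_SHARE disabled): with rt_runtime = 950 ms
 * per rt_period = 1000 ms, an RT task that never blocks pushes rt_time
 * past 950 ms, the rt_rq is marked throttled and dequeued, and it only
 * becomes runnable again once sched_rt_period_timer() replenishes the
 * budget, i.e. the hog is kept off the CPU for roughly 50 ms out of
 * every second.
 */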
918  
919  #else /* !CONFIG_RT_GROUP_SCHED */
920  
921  typedef struct rt_rq *rt_rq_iter_t;
922  
923  #define for_each_rt_rq(rt_rq, iter, rq) \
924  	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
925  
926  #define for_each_sched_rt_entity(rt_se) \
927  	for (; rt_se; rt_se = NULL)
928  
929  static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
930  {
931  	return NULL;
932  }
933  
934  static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
935  {
936  	struct rq *rq = rq_of_rt_rq(rt_rq);
937  
938  	if (!rt_rq->rt_nr_running)
939  		return;
940  
941  	enqueue_top_rt_rq(rt_rq);
942  	resched_curr(rq);
943  }
944  
945  static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
946  {
947  	dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
948  }
949  
950  static inline int rt_rq_throttled(struct rt_rq *rt_rq)
951  {
952  	return false;
953  }
954  
955  static inline const struct cpumask *sched_rt_period_mask(void)
956  {
957  	return cpu_online_mask;
958  }
959  
960  static inline
961  struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
962  {
963  	return &cpu_rq(cpu)->rt;
964  }
965  
966  #ifdef CONFIG_SMP
967  static void __enable_runtime(struct rq *rq) { }
968  static void __disable_runtime(struct rq *rq) { }
969  #endif
970  
971  #endif /* CONFIG_RT_GROUP_SCHED */
972  
973  static inline int rt_se_prio(struct sched_rt_entity *rt_se)
974  {
975  #ifdef CONFIG_RT_GROUP_SCHED
976  	struct rt_rq *rt_rq = group_rt_rq(rt_se);
977  
978  	if (rt_rq)
979  		return rt_rq->highest_prio.curr;
980  #endif
981  
982  	return rt_task_of(rt_se)->prio;
983  }
984  
985  /*
986   * Update the current task's runtime statistics. Skip current tasks that
987   * are not in our scheduling class.
988   */
989  static void update_curr_rt(struct rq *rq)
990  {
991  	struct task_struct *curr = rq->curr;
992  	s64 delta_exec;
993  
994  	if (curr->sched_class != &rt_sched_class)
995  		return;
996  
997  	delta_exec = update_curr_common(rq);
998  	if (unlikely(delta_exec <= 0))
999  		return;
1000  
1001  #ifdef CONFIG_RT_GROUP_SCHED
1002  	struct sched_rt_entity *rt_se = &curr->rt;
1003  
1004  	if (!rt_bandwidth_enabled())
1005  		return;
1006  
1007  	for_each_sched_rt_entity(rt_se) {
1008  		struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1009  		int exceeded;
1010  
1011  		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
1012  			raw_spin_lock(&rt_rq->rt_runtime_lock);
1013  			rt_rq->rt_time += delta_exec;
1014  			exceeded = sched_rt_runtime_exceeded(rt_rq);
1015  			if (exceeded)
1016  				resched_curr(rq);
1017  			raw_spin_unlock(&rt_rq->rt_runtime_lock);
1018  			if (exceeded)
1019  				do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
1020  		}
1021  	}
1022  #endif
1023  }
1024  
1025  static void
1026  dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
1027  {
1028  	struct rq *rq = rq_of_rt_rq(rt_rq);
1029  
1030  	BUG_ON(&rq->rt != rt_rq);
1031  
1032  	if (!rt_rq->rt_queued)
1033  		return;
1034  
1035  	BUG_ON(!rq->nr_running);
1036  
1037  	sub_nr_running(rq, count);
1038  	rt_rq->rt_queued = 0;
1039  
1040  }
1041  
1042  static void
1043  enqueue_top_rt_rq(struct rt_rq *rt_rq)
1044  {
1045  	struct rq *rq = rq_of_rt_rq(rt_rq);
1046  
1047  	BUG_ON(&rq->rt != rt_rq);
1048  
1049  	if (rt_rq->rt_queued)
1050  		return;
1051  
1052  	if (rt_rq_throttled(rt_rq))
1053  		return;
1054  
1055  	if (rt_rq->rt_nr_running) {
1056  		add_nr_running(rq, rt_rq->rt_nr_running);
1057  		rt_rq->rt_queued = 1;
1058  	}
1059  
1060  	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1061  	cpufreq_update_util(rq, 0);
1062  }
1063  
1064  #if defined CONFIG_SMP
1065  
1066  static void
1067  inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1068  {
1069  	struct rq *rq = rq_of_rt_rq(rt_rq);
1070  
1071  #ifdef CONFIG_RT_GROUP_SCHED
1072  	/*
1073  	 * Change rq's cpupri only if rt_rq is the top queue.
1074  	 */
1075  	if (&rq->rt != rt_rq)
1076  		return;
1077  #endif
1078  	if (rq->online && prio < prev_prio)
1079  		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1080  }
1081  
1082  static void
1083  dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1084  {
1085  	struct rq *rq = rq_of_rt_rq(rt_rq);
1086  
1087  #ifdef CONFIG_RT_GROUP_SCHED
1088  	/*
1089  	 * Change rq's cpupri only if rt_rq is the top queue.
1090  	 */
1091  	if (&rq->rt != rt_rq)
1092  		return;
1093  #endif
1094  	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1095  		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1096  }
1097  
1098  #else /* CONFIG_SMP */
1099  
1100  static inline
1101  void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1102  static inline
1103  void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1104  
1105  #endif /* CONFIG_SMP */
1106  
1107  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1108  static void
1109  inc_rt_prio(struct rt_rq *rt_rq, int prio)
1110  {
1111  	int prev_prio = rt_rq->highest_prio.curr;
1112  
1113  	if (prio < prev_prio)
1114  		rt_rq->highest_prio.curr = prio;
1115  
1116  	inc_rt_prio_smp(rt_rq, prio, prev_prio);
1117  }
1118  
1119  static void
1120  dec_rt_prio(struct rt_rq *rt_rq, int prio)
1121  {
1122  	int prev_prio = rt_rq->highest_prio.curr;
1123  
1124  	if (rt_rq->rt_nr_running) {
1125  
1126  		WARN_ON(prio < prev_prio);
1127  
1128  		/*
1129  		 * This may have been our highest task, and therefore
1130  		 * we may have some re-computation to do
1131  		 */
1132  		if (prio == prev_prio) {
1133  			struct rt_prio_array *array = &rt_rq->active;
1134  
1135  			rt_rq->highest_prio.curr =
1136  				sched_find_first_bit(array->bitmap);
1137  		}
1138  
1139  	} else {
1140  		rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
1141  	}
1142  
1143  	dec_rt_prio_smp(rt_rq, prio, prev_prio);
1144  }
1145  
1146  #else
1147  
1148  static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1149  static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1150  
1151  #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1152  
1153  #ifdef CONFIG_RT_GROUP_SCHED
1154  
1155  static void
1156  inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1157  {
1158  	if (rt_se_boosted(rt_se))
1159  		rt_rq->rt_nr_boosted++;
1160  
1161  	if (rt_rq->tg)
1162  		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1163  }
1164  
1165  static void
1166  dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1167  {
1168  	if (rt_se_boosted(rt_se))
1169  		rt_rq->rt_nr_boosted--;
1170  
1171  	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1172  }
1173  
1174  #else /* CONFIG_RT_GROUP_SCHED */
1175  
1176  static void
1177  inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1178  {
1179  }
1180  
1181  static inline
1182  void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1183  
1184  #endif /* CONFIG_RT_GROUP_SCHED */
1185  
1186  static inline
1187  unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1188  {
1189  	struct rt_rq *group_rq = group_rt_rq(rt_se);
1190  
1191  	if (group_rq)
1192  		return group_rq->rt_nr_running;
1193  	else
1194  		return 1;
1195  }
1196  
1197  static inline
1198  unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1199  {
1200  	struct rt_rq *group_rq = group_rt_rq(rt_se);
1201  	struct task_struct *tsk;
1202  
1203  	if (group_rq)
1204  		return group_rq->rr_nr_running;
1205  
1206  	tsk = rt_task_of(rt_se);
1207  
1208  	return (tsk->policy == SCHED_RR) ? 1 : 0;
1209  }
1210  
1211  static inline
1212  void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1213  {
1214  	int prio = rt_se_prio(rt_se);
1215  
1216  	WARN_ON(!rt_prio(prio));
1217  	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1218  	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1219  
1220  	inc_rt_prio(rt_rq, prio);
1221  	inc_rt_group(rt_se, rt_rq);
1222  }
1223  
1224  static inline
1225  void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1226  {
1227  	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1228  	WARN_ON(!rt_rq->rt_nr_running);
1229  	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1230  	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1231  
1232  	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1233  	dec_rt_group(rt_se, rt_rq);
1234  }
1235  
1236  /*
1237   * Change rt_se->run_list location unless SAVE && !MOVE
1238   *
1239   * assumes ENQUEUE/DEQUEUE flags match
1240   */
1241  static inline bool move_entity(unsigned int flags)
1242  {
1243  	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1244  		return false;
1245  
1246  	return true;
1247  }
1248  
1249  static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1250  {
1251  	list_del_init(&rt_se->run_list);
1252  
1253  	if (list_empty(array->queue + rt_se_prio(rt_se)))
1254  		__clear_bit(rt_se_prio(rt_se), array->bitmap);
1255  
1256  	rt_se->on_list = 0;
1257  }
1258  
1259  static inline struct sched_statistics *
1260  __schedstats_from_rt_se(struct sched_rt_entity *rt_se)
1261  {
1262  #ifdef CONFIG_RT_GROUP_SCHED
1263  	/* schedstats is not supported for RT groups. */
1264  	if (!rt_entity_is_task(rt_se))
1265  		return NULL;
1266  #endif
1267  
1268  	return &rt_task_of(rt_se)->stats;
1269  }
1270  
1271  static inline void
1272  update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1273  {
1274  	struct sched_statistics *stats;
1275  	struct task_struct *p = NULL;
1276  
1277  	if (!schedstat_enabled())
1278  		return;
1279  
1280  	if (rt_entity_is_task(rt_se))
1281  		p = rt_task_of(rt_se);
1282  
1283  	stats = __schedstats_from_rt_se(rt_se);
1284  	if (!stats)
1285  		return;
1286  
1287  	__update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats);
1288  }
1289  
1290  static inline void
1291  update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1292  {
1293  	struct sched_statistics *stats;
1294  	struct task_struct *p = NULL;
1295  
1296  	if (!schedstat_enabled())
1297  		return;
1298  
1299  	if (rt_entity_is_task(rt_se))
1300  		p = rt_task_of(rt_se);
1301  
1302  	stats = __schedstats_from_rt_se(rt_se);
1303  	if (!stats)
1304  		return;
1305  
1306  	__update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats);
1307  }
1308  
1309  static inline void
1310  update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
1311  			int flags)
1312  {
1313  	if (!schedstat_enabled())
1314  		return;
1315  
1316  	if (flags & ENQUEUE_WAKEUP)
1317  		update_stats_enqueue_sleeper_rt(rt_rq, rt_se);
1318  }
1319  
1320  static inline void
1321  update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1322  {
1323  	struct sched_statistics *stats;
1324  	struct task_struct *p = NULL;
1325  
1326  	if (!schedstat_enabled())
1327  		return;
1328  
1329  	if (rt_entity_is_task(rt_se))
1330  		p = rt_task_of(rt_se);
1331  
1332  	stats = __schedstats_from_rt_se(rt_se);
1333  	if (!stats)
1334  		return;
1335  
1336  	__update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats);
1337  }
1338  
1339  static inline void
1340  update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
1341  			int flags)
1342  {
1343  	struct task_struct *p = NULL;
1344  
1345  	if (!schedstat_enabled())
1346  		return;
1347  
1348  	if (rt_entity_is_task(rt_se))
1349  		p = rt_task_of(rt_se);
1350  
1351  	if ((flags & DEQUEUE_SLEEP) && p) {
1352  		unsigned int state;
1353  
1354  		state = READ_ONCE(p->__state);
1355  		if (state & TASK_INTERRUPTIBLE)
1356  			__schedstat_set(p->stats.sleep_start,
1357  					rq_clock(rq_of_rt_rq(rt_rq)));
1358  
1359  		if (state & TASK_UNINTERRUPTIBLE)
1360  			__schedstat_set(p->stats.block_start,
1361  					rq_clock(rq_of_rt_rq(rt_rq)));
1362  	}
1363  }
1364  
1365  static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1366  {
1367  	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1368  	struct rt_prio_array *array = &rt_rq->active;
1369  	struct rt_rq *group_rq = group_rt_rq(rt_se);
1370  	struct list_head *queue = array->queue + rt_se_prio(rt_se);
1371  
1372  	/*
1373  	 * Don't enqueue the group if it's throttled, or when empty.
1374  	 * The latter is a consequence of the former when a child group
1375  	 * gets throttled and the current group doesn't have any other
1376  	 * active members.
1377  	 */
1378  	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1379  		if (rt_se->on_list)
1380  			__delist_rt_entity(rt_se, array);
1381  		return;
1382  	}
1383  
1384  	if (move_entity(flags)) {
1385  		WARN_ON_ONCE(rt_se->on_list);
1386  		if (flags & ENQUEUE_HEAD)
1387  			list_add(&rt_se->run_list, queue);
1388  		else
1389  			list_add_tail(&rt_se->run_list, queue);
1390  
1391  		__set_bit(rt_se_prio(rt_se), array->bitmap);
1392  		rt_se->on_list = 1;
1393  	}
1394  	rt_se->on_rq = 1;
1395  
1396  	inc_rt_tasks(rt_se, rt_rq);
1397  }
1398  
1399  static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1400  {
1401  	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1402  	struct rt_prio_array *array = &rt_rq->active;
1403  
1404  	if (move_entity(flags)) {
1405  		WARN_ON_ONCE(!rt_se->on_list);
1406  		__delist_rt_entity(rt_se, array);
1407  	}
1408  	rt_se->on_rq = 0;
1409  
1410  	dec_rt_tasks(rt_se, rt_rq);
1411  }
1412  
1413  /*
1414   * Because the prio of an upper entry depends on the lower
1415   * entries, we must remove entries top-down.
1416   */
1417  static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1418  {
1419  	struct sched_rt_entity *back = NULL;
1420  	unsigned int rt_nr_running;
1421  
1422  	for_each_sched_rt_entity(rt_se) {
1423  		rt_se->back = back;
1424  		back = rt_se;
1425  	}
1426  
1427  	rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
1428  
1429  	for (rt_se = back; rt_se; rt_se = rt_se->back) {
1430  		if (on_rt_rq(rt_se))
1431  			__dequeue_rt_entity(rt_se, flags);
1432  	}
1433  
1434  	dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
1435  }
1436  
1437  static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1438  {
1439  	struct rq *rq = rq_of_rt_se(rt_se);
1440  
1441  	update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
1442  
1443  	dequeue_rt_stack(rt_se, flags);
1444  	for_each_sched_rt_entity(rt_se)
1445  		__enqueue_rt_entity(rt_se, flags);
1446  	enqueue_top_rt_rq(&rq->rt);
1447  }
1448  
1449  static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1450  {
1451  	struct rq *rq = rq_of_rt_se(rt_se);
1452  
1453  	update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
1454  
1455  	dequeue_rt_stack(rt_se, flags);
1456  
1457  	for_each_sched_rt_entity(rt_se) {
1458  		struct rt_rq *rt_rq = group_rt_rq(rt_se);
1459  
1460  		if (rt_rq && rt_rq->rt_nr_running)
1461  			__enqueue_rt_entity(rt_se, flags);
1462  	}
1463  	enqueue_top_rt_rq(&rq->rt);
1464  }
1465  
1466  /*
1467   * Adding/removing a task to/from a priority array:
1468   */
1469  static void
1470  enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1471  {
1472  	struct sched_rt_entity *rt_se = &p->rt;
1473  
1474  	if (flags & ENQUEUE_WAKEUP)
1475  		rt_se->timeout = 0;
1476  
1477  	check_schedstat_required();
1478  	update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);
1479  
1480  	enqueue_rt_entity(rt_se, flags);
1481  
1482  	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1483  		enqueue_pushable_task(rq, p);
1484  }
1485  
1486  static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1487  {
1488  	struct sched_rt_entity *rt_se = &p->rt;
1489  
1490  	update_curr_rt(rq);
1491  	dequeue_rt_entity(rt_se, flags);
1492  
1493  	dequeue_pushable_task(rq, p);
1494  
1495  	return true;
1496  }
1497  
1498  /*
1499   * Put a task at the head or the end of the run list without the overhead of
1500   * dequeue followed by enqueue.
1501   */
1502  static void
1503  requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1504  {
1505  	if (on_rt_rq(rt_se)) {
1506  		struct rt_prio_array *array = &rt_rq->active;
1507  		struct list_head *queue = array->queue + rt_se_prio(rt_se);
1508  
1509  		if (head)
1510  			list_move(&rt_se->run_list, queue);
1511  		else
1512  			list_move_tail(&rt_se->run_list, queue);
1513  	}
1514  }
1515  
1516  static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1517  {
1518  	struct sched_rt_entity *rt_se = &p->rt;
1519  	struct rt_rq *rt_rq;
1520  
1521  	for_each_sched_rt_entity(rt_se) {
1522  		rt_rq = rt_rq_of_se(rt_se);
1523  		requeue_rt_entity(rt_rq, rt_se, head);
1524  	}
1525  }
1526  
1527  static void yield_task_rt(struct rq *rq)
1528  {
1529  	requeue_task_rt(rq, rq->curr, 0);
1530  }
1531  
1532  #ifdef CONFIG_SMP
1533  static int find_lowest_rq(struct task_struct *task);
1534  
1535  static int
1536  select_task_rq_rt(struct task_struct *p, int cpu, int flags)
1537  {
1538  	struct task_struct *curr;
1539  	struct rq *rq;
1540  	bool test;
1541  
1542  	/* For anything but wake ups, just return the task_cpu */
1543  	if (!(flags & (WF_TTWU | WF_FORK)))
1544  		goto out;
1545  
1546  	rq = cpu_rq(cpu);
1547  
1548  	rcu_read_lock();
1549  	curr = READ_ONCE(rq->curr); /* unlocked access */
1550  
1551  	/*
1552  	 * If the current task on @p's runqueue is an RT task, then
1553  	 * try to see if we can wake this RT task up on another
1554  	 * runqueue. Otherwise simply start this RT task
1555  	 * on its current runqueue.
1556  	 *
1557  	 * We want to avoid overloading runqueues. If the woken
1558  	 * task is a higher priority, then it will stay on this CPU
1559  	 * and the lower prio task should be moved to another CPU.
1560  	 * Even though this will probably make the lower prio task
1561  	 * lose its cache, we do not want to bounce a higher task
1562  	 * around just because it gave up its CPU, perhaps for a
1563  	 * lock?
1564  	 *
1565  	 * For equal prio tasks, we just let the scheduler sort it out.
1566  	 *
1567  	 * Otherwise, just let it ride on the affine RQ and the
1568  	 * post-schedule router will push the preempted task away
1569  	 *
1570  	 * This test is optimistic; if we get it wrong the load-balancer
1571  	 * will have to sort it out.
1572  	 *
1573  	 * We take into account the capacity of the CPU to ensure it fits the
1574  	 * requirement of the task - which is only important on heterogeneous
1575  	 * systems like big.LITTLE.
1576  	 */
1577  	test = curr &&
1578  	       unlikely(rt_task(curr)) &&
1579  	       (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
1580  
1581  	if (test || !rt_task_fits_capacity(p, cpu)) {
1582  		int target = find_lowest_rq(p);
1583  
1584  		/*
1585  		 * Bail out if we were forcing a migration to find a better
1586  		 * fitting CPU but our search failed.
1587  		 */
1588  		if (!test && target != -1 && !rt_task_fits_capacity(p, target))
1589  			goto out_unlock;
1590  
1591  		/*
1592  		 * Don't bother moving it if the destination CPU is
1593  		 * not running a lower priority task.
1594  		 */
1595  		if (target != -1 &&
1596  		    p->prio < cpu_rq(target)->rt.highest_prio.curr)
1597  			cpu = target;
1598  	}
1599  
1600  out_unlock:
1601  	rcu_read_unlock();
1602  
1603  out:
1604  	return cpu;
1605  }
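/*
 * Worked example for the test above (prio values assumed; lower numbers
 * mean higher RT priority): waking a prio-90 task on a CPU whose current
 * task is an RT task at prio 10 makes the test true (10 <= 90), so we
 * look for a CPU running something of lower priority instead of queueing
 * behind curr.  Waking a prio-5 task on the same CPU leaves the test
 * false (10 <= 5 fails) provided curr can migrate and p fits the CPU's
 * capacity, so the task stays put and simply preempts curr.
 */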
1606  
1607  static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1608  {
1609  	/*
1610  	 * Current can't be migrated, useless to reschedule,
1611  	 * let's hope p can move out.
1612  	 */
1613  	if (rq->curr->nr_cpus_allowed == 1 ||
1614  	    !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1615  		return;
1616  
1617  	/*
1618  	 * p is migratable, so let's not schedule it and
1619  	 * see if it is pushed or pulled somewhere else.
1620  	 */
1621  	if (p->nr_cpus_allowed != 1 &&
1622  	    cpupri_find(&rq->rd->cpupri, p, NULL))
1623  		return;
1624  
1625  	/*
1626  	 * There appear to be other CPUs that can accept
1627  	 * the current task but none can run 'p', so let's reschedule
1628  	 * to try and push the current task away:
1629  	 */
1630  	requeue_task_rt(rq, p, 1);
1631  	resched_curr(rq);
1632  }
1633  
1634  static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1635  {
1636  	if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
1637  		/*
1638  		 * This is OK, because current is on_cpu, which avoids it being
1639  		 * picked for load-balance and preemption/IRQs are still
1640  		 * disabled avoiding further scheduler activity on it and we've
1641  		 * not yet started the picking loop.
1642  		 */
1643  		rq_unpin_lock(rq, rf);
1644  		pull_rt_task(rq);
1645  		rq_repin_lock(rq, rf);
1646  	}
1647  
1648  	return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
1649  }
1650  #endif /* CONFIG_SMP */
1651  
1652  /*
1653   * Preempt the current task with a newly woken task if needed:
1654   */
1655  static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
1656  {
1657  	if (p->prio < rq->curr->prio) {
1658  		resched_curr(rq);
1659  		return;
1660  	}
1661  
1662  #ifdef CONFIG_SMP
1663  	/*
1664  	 * If:
1665  	 *
1666  	 * - the newly woken task is of equal priority to the current task
1667  	 * - the newly woken task is non-migratable while current is migratable
1668  	 * - current will be preempted on the next reschedule
1669  	 *
1670  	 * we should check to see if current can readily move to a different
1671  	 * cpu.  If so, we will reschedule to allow the push logic to try
1672  	 * to move current somewhere else, making room for our non-migratable
1673  	 * task.
1674  	 */
1675  	if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1676  		check_preempt_equal_prio(rq, p);
1677  #endif
1678  }
1679  
1680  static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
1681  {
1682  	struct sched_rt_entity *rt_se = &p->rt;
1683  	struct rt_rq *rt_rq = &rq->rt;
1684  
1685  	p->se.exec_start = rq_clock_task(rq);
1686  	if (on_rt_rq(&p->rt))
1687  		update_stats_wait_end_rt(rt_rq, rt_se);
1688  
1689  	/* The running task is never eligible for pushing */
1690  	dequeue_pushable_task(rq, p);
1691  
1692  	if (!first)
1693  		return;
1694  
1695  	/*
1696  	 * If prev task was rt, put_prev_task() has already updated the
1697  	 * utilization. We only care about the case where we start to schedule an
1698  	 * RT task.
1699  	 */
1700  	if (rq->curr->sched_class != &rt_sched_class)
1701  		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1702  
1703  	rt_queue_push_tasks(rq);
1704  }
1705  
1706  static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
1707  {
1708  	struct rt_prio_array *array = &rt_rq->active;
1709  	struct sched_rt_entity *next = NULL;
1710  	struct list_head *queue;
1711  	int idx;
1712  
1713  	idx = sched_find_first_bit(array->bitmap);
1714  	BUG_ON(idx >= MAX_RT_PRIO);
1715  
1716  	queue = array->queue + idx;
1717  	if (SCHED_WARN_ON(list_empty(queue)))
1718  		return NULL;
1719  	next = list_entry(queue->next, struct sched_rt_entity, run_list);
1720  
1721  	return next;
1722  }
1723  
1724  static struct task_struct *_pick_next_task_rt(struct rq *rq)
1725  {
1726  	struct sched_rt_entity *rt_se;
1727  	struct rt_rq *rt_rq  = &rq->rt;
1728  
1729  	do {
1730  		rt_se = pick_next_rt_entity(rt_rq);
1731  		if (unlikely(!rt_se))
1732  			return NULL;
1733  		rt_rq = group_rt_rq(rt_se);
1734  	} while (rt_rq);
1735  
1736  	return rt_task_of(rt_se);
1737  }
1738  
1739  static struct task_struct *pick_task_rt(struct rq *rq)
1740  {
1741  	struct task_struct *p;
1742  
1743  	if (!sched_rt_runnable(rq))
1744  		return NULL;
1745  
1746  	p = _pick_next_task_rt(rq);
1747  
1748  	return p;
1749  }
1750  
1751  static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next)
1752  {
1753  	struct sched_rt_entity *rt_se = &p->rt;
1754  	struct rt_rq *rt_rq = &rq->rt;
1755  
1756  	if (on_rt_rq(&p->rt))
1757  		update_stats_wait_start_rt(rt_rq, rt_se);
1758  
1759  	update_curr_rt(rq);
1760  
1761  	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1762  
1763  	/*
1764  	 * The previous task needs to be made eligible for pushing
1765  	 * if it is still active
1766  	 */
1767  	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1768  		enqueue_pushable_task(rq, p);
1769  }
1770  
1771  #ifdef CONFIG_SMP
1772  
1773  /* Only try algorithms three times */
1774  #define RT_MAX_TRIES 3
1775  
1776  static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1777  {
1778  	if (!task_on_cpu(rq, p) &&
1779  	    cpumask_test_cpu(cpu, &p->cpus_mask))
1780  		return 1;
1781  
1782  	return 0;
1783  }
1784  
1785  /*
1786   * Return the highest-priority pushable task on this rq that can run on
1787   * the given CPU, or NULL if there is none.
1788   */
1789  static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1790  {
1791  	struct plist_head *head = &rq->rt.pushable_tasks;
1792  	struct task_struct *p;
1793  
1794  	if (!has_pushable_tasks(rq))
1795  		return NULL;
1796  
1797  	plist_for_each_entry(p, head, pushable_tasks) {
1798  		if (pick_rt_task(rq, p, cpu))
1799  			return p;
1800  	}
1801  
1802  	return NULL;
1803  }
1804  
1805  static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1806  
1807  static int find_lowest_rq(struct task_struct *task)
1808  {
1809  	struct sched_domain *sd;
1810  	struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1811  	int this_cpu = smp_processor_id();
1812  	int cpu      = task_cpu(task);
1813  	int ret;
1814  
1815  	/* Make sure the mask is initialized first */
1816  	if (unlikely(!lowest_mask))
1817  		return -1;
1818  
1819  	if (task->nr_cpus_allowed == 1)
1820  		return -1; /* No other targets possible */
1821  
1822  	/*
1823  	 * If we're on an asym system, ensure we consider the different capacities
1824  	 * of the CPUs when searching for the lowest_mask.
1825  	 */
1826  	if (sched_asym_cpucap_active()) {
1827  
1828  		ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
1829  					  task, lowest_mask,
1830  					  rt_task_fits_capacity);
1831  	} else {
1832  
1833  		ret = cpupri_find(&task_rq(task)->rd->cpupri,
1834  				  task, lowest_mask);
1835  	}
1836  
1837  	if (!ret)
1838  		return -1; /* No targets found */
1839  
1840  	/*
1841  	 * At this point we have built a mask of CPUs representing the
1842  	 * lowest priority tasks in the system.  Now we want to elect
1843  	 * the best one based on our affinity and topology.
1844  	 *
1845  	 * We prioritize the last CPU that the task executed on since
1846  	 * it is most likely cache-hot in that location.
1847  	 */
1848  	if (cpumask_test_cpu(cpu, lowest_mask))
1849  		return cpu;
1850  
1851  	/*
1852  	 * Otherwise, we consult the sched_domains span maps to figure
1853  	 * out which CPU is logically closest to our hot cache data.
1854  	 */
1855  	if (!cpumask_test_cpu(this_cpu, lowest_mask))
1856  		this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1857  
1858  	rcu_read_lock();
1859  	for_each_domain(cpu, sd) {
1860  		if (sd->flags & SD_WAKE_AFFINE) {
1861  			int best_cpu;
1862  
1863  			/*
1864  			 * "this_cpu" is cheaper to preempt than a
1865  			 * remote processor.
1866  			 */
1867  			if (this_cpu != -1 &&
1868  			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1869  				rcu_read_unlock();
1870  				return this_cpu;
1871  			}
1872  
1873  			best_cpu = cpumask_any_and_distribute(lowest_mask,
1874  							      sched_domain_span(sd));
1875  			if (best_cpu < nr_cpu_ids) {
1876  				rcu_read_unlock();
1877  				return best_cpu;
1878  			}
1879  		}
1880  	}
1881  	rcu_read_unlock();
1882  
1883  	/*
1884  	 * And finally, if there were no matches within the domains
1885  	 * just give the caller *something* to work with from the compatible
1886  	 * locations.
1887  	 */
1888  	if (this_cpu != -1)
1889  		return this_cpu;
1890  
1891  	cpu = cpumask_any_distribute(lowest_mask);
1892  	if (cpu < nr_cpu_ids)
1893  		return cpu;
1894  
1895  	return -1;
1896  }
1897  
1898  /* Will lock the rq it finds */
1899  static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1900  {
1901  	struct rq *lowest_rq = NULL;
1902  	int tries;
1903  	int cpu;
1904  
1905  	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1906  		cpu = find_lowest_rq(task);
1907  
1908  		if ((cpu == -1) || (cpu == rq->cpu))
1909  			break;
1910  
1911  		lowest_rq = cpu_rq(cpu);
1912  
1913  		if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1914  			/*
1915  			 * Target rq has tasks of equal or higher priority,
1916  			 * retrying does not release any lock and is unlikely
1917  			 * to yield a different result.
1918  			 */
1919  			lowest_rq = NULL;
1920  			break;
1921  		}
1922  
1923  		/* if the prio of this runqueue changed, try again */
1924  		if (double_lock_balance(rq, lowest_rq)) {
1925  			/*
1926  			 * We had to unlock the run queue. In the
1927  			 * meantime, the task could have migrated
1928  			 * already or had its affinity changed.
1929  			 * Also make sure that it wasn't scheduled on its rq.
1930  			 * It is possible the task was scheduled, set
1931  			 * "migrate_disabled" and then got preempted, so we must
1932  			 * check the task migration disable flag here too.
1933  			 */
1934  			if (unlikely(task_rq(task) != rq ||
1935  				     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
1936  				     task_on_cpu(rq, task) ||
1937  				     !rt_task(task) ||
1938  				     is_migration_disabled(task) ||
1939  				     !task_on_rq_queued(task))) {
1940  
1941  				double_unlock_balance(rq, lowest_rq);
1942  				lowest_rq = NULL;
1943  				break;
1944  			}
1945  		}
1946  
1947  		/* If this rq is still suitable use it. */
1948  		if (lowest_rq->rt.highest_prio.curr > task->prio)
1949  			break;
1950  
1951  		/* try again */
1952  		double_unlock_balance(rq, lowest_rq);
1953  		lowest_rq = NULL;
1954  	}
1955  
1956  	return lowest_rq;
1957  }
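
/*
 * Illustrative sketch (not part of the kernel source): find_lock_lowest_rq()
 * is an instance of the generic "lock both, then revalidate" idiom.  The
 * names below are invented; only the shape of the pattern is meant to match.
 *
 *	for (tries = 0; tries < MAX_TRIES; tries++) {
 *		target = pick_target();			// lock-free guess
 *		if (!target)
 *			break;
 *		if (lock_both_in_order(mine, target)) {	// may drop 'mine'
 *			if (!state_still_valid()) {	// world changed meanwhile
 *				unlock(target);
 *				break;
 *			}
 *		}
 *		if (target_still_suitable(target))
 *			return target;			// both locks held
 *		unlock(target);				// retry with a new guess
 *	}
 *	return NULL;
 */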
1958  
1959  static struct task_struct *pick_next_pushable_task(struct rq *rq)
1960  {
1961  	struct task_struct *p;
1962  
1963  	if (!has_pushable_tasks(rq))
1964  		return NULL;
1965  
1966  	p = plist_first_entry(&rq->rt.pushable_tasks,
1967  			      struct task_struct, pushable_tasks);
1968  
1969  	BUG_ON(rq->cpu != task_cpu(p));
1970  	BUG_ON(task_current(rq, p));
1971  	BUG_ON(p->nr_cpus_allowed <= 1);
1972  
1973  	BUG_ON(!task_on_rq_queued(p));
1974  	BUG_ON(!rt_task(p));
1975  
1976  	return p;
1977  }
1978  
1979  /*
1980   * If the current CPU has more than one RT task, see if the non
1981   * running task can migrate over to a CPU that is running a task
1982   * of lesser priority.
1983   */
1984  static int push_rt_task(struct rq *rq, bool pull)
1985  {
1986  	struct task_struct *next_task;
1987  	struct rq *lowest_rq;
1988  	int ret = 0;
1989  
1990  	if (!rq->rt.overloaded)
1991  		return 0;
1992  
1993  	next_task = pick_next_pushable_task(rq);
1994  	if (!next_task)
1995  		return 0;
1996  
1997  retry:
1998  	/*
1999  	 * It's possible that next_task slipped in with a
2000  	 * higher priority than current. If that's the case,
2001  	 * just reschedule current.
2002  	 */
2003  	if (unlikely(next_task->prio < rq->curr->prio)) {
2004  		resched_curr(rq);
2005  		return 0;
2006  	}
2007  
2008  	if (is_migration_disabled(next_task)) {
2009  		struct task_struct *push_task = NULL;
2010  		int cpu;
2011  
2012  		if (!pull || rq->push_busy)
2013  			return 0;
2014  
2015  		/*
2016  		 * Invoking find_lowest_rq() on anything but an RT task doesn't
2017  		 * make sense. Per the above priority check, curr has to
2018  		 * be of equal or higher priority than next_task, so no need to
2019  		 * reschedule when bailing out.
2020  		 *
2021  		 * Note that the stoppers are masqueraded as SCHED_FIFO
2022  		 * (cf. sched_set_stop_task()), so we can't rely on rt_task().
2023  		 */
2024  		if (rq->curr->sched_class != &rt_sched_class)
2025  			return 0;
2026  
2027  		cpu = find_lowest_rq(rq->curr);
2028  		if (cpu == -1 || cpu == rq->cpu)
2029  			return 0;
2030  
2031  		/*
2032  		 * We found a CPU with lower priority than @next_task, which
2033  		 * should therefore be running there. Since its migration is
2034  		 * disabled, we cannot migrate it to that CPU; instead, attempt
2035  		 * to push away the task currently running on this CPU.
2036  		 */
2037  		push_task = get_push_task(rq);
2038  		if (push_task) {
2039  			preempt_disable();
2040  			raw_spin_rq_unlock(rq);
2041  			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2042  					    push_task, &rq->push_work);
2043  			preempt_enable();
2044  			raw_spin_rq_lock(rq);
2045  		}
2046  
2047  		return 0;
2048  	}
2049  
2050  	if (WARN_ON(next_task == rq->curr))
2051  		return 0;
2052  
2053  	/* We might release rq lock */
2054  	get_task_struct(next_task);
2055  
2056  	/* find_lock_lowest_rq locks the rq if found */
2057  	lowest_rq = find_lock_lowest_rq(next_task, rq);
2058  	if (!lowest_rq) {
2059  		struct task_struct *task;
2060  		/*
2061  		 * find_lock_lowest_rq releases rq->lock
2062  		 * so it is possible that next_task has migrated.
2063  		 *
2064  		 * We need to make sure that the task is still on the same
2065  		 * run-queue and is also still the next task eligible for
2066  		 * pushing.
2067  		 */
2068  		task = pick_next_pushable_task(rq);
2069  		if (task == next_task) {
2070  			/*
2071  			 * The task hasn't migrated, and is still the next
2072  			 * eligible task, but we failed to find a run-queue
2073  			 * to push it to.  Do not retry in this case, since
2074  			 * other CPUs will pull from us when ready.
2075  			 */
2076  			goto out;
2077  		}
2078  
2079  		if (!task)
2080  			/* No more tasks, just exit */
2081  			goto out;
2082  
2083  		/*
2084  		 * Something has shifted, try again.
2085  		 */
2086  		put_task_struct(next_task);
2087  		next_task = task;
2088  		goto retry;
2089  	}
2090  
2091  	deactivate_task(rq, next_task, 0);
2092  	set_task_cpu(next_task, lowest_rq->cpu);
2093  	activate_task(lowest_rq, next_task, 0);
2094  	resched_curr(lowest_rq);
2095  	ret = 1;
2096  
2097  	double_unlock_balance(rq, lowest_rq);
2098  out:
2099  	put_task_struct(next_task);
2100  
2101  	return ret;
2102  }
2103  
2104  static void push_rt_tasks(struct rq *rq)
2105  {
2106  	/* push_rt_task() will return true if it moved an RT task */
2107  	while (push_rt_task(rq, false))
2108  		;
2109  }
2110  
2111  #ifdef HAVE_RT_PUSH_IPI
2112  
2113  /*
2114   * When a high priority task schedules out from a CPU and a lower priority
2115   * task is scheduled in, a check is made to see if there are any RT tasks
2116   * on other CPUs that are waiting to run because a higher priority RT task
2117   * is currently running on its CPU. In this case, the CPU with multiple RT
2118   * tasks queued on it (overloaded) needs to be notified that a CPU has opened
2119   * up that may be able to run one of its non-running queued RT tasks.
2120   *
2121   * All CPUs with overloaded RT tasks need to be notified as there is currently
2122   * no way to know which of these CPUs have the highest priority task waiting
2123   * to run. Instead of trying to take a spinlock on each of these CPUs,
2124   * which has been shown to cause large latency when done on machines with
2125   * many CPUs, an IPI is sent to the CPUs to have them push off their
2126   * overloaded RT tasks that are waiting to run.
2127   *
2128   * Just sending an IPI to each of the CPUs is also an issue, as on machines
2129   * with a large CPU count this can cause an IPI storm on one CPU, especially
2130   * if it is the only CPU with multiple RT tasks queued and a large number
2131   * of CPUs are scheduling a lower priority task at the same time.
2132   *
2133   * Each root domain has its own IRQ work function that can iterate over
2134   * all CPUs with RT overloaded tasks. Since every CPU with overloaded RT
2135   * tasks must be checked regardless of whether one or many CPUs are lowering
2136   * their priority, a single IRQ work iterator walks them and tries to
2137   * push off the RT tasks that are waiting to run.
2138   *
2139   * When a CPU schedules a lower priority task, it will kick off the
2140   * IRQ work iterator that will jump to each CPU with overloaded RT tasks.
2141   * As it only takes the first CPU that schedules a lower priority task
2142   * to start the process, the rto_loop_start variable acts as a try-lock:
2143   * only the CPU that acquires it goes on to take the rto_lock.
2144   * This prevents high contention on the lock as the process handles all
2145   * CPUs scheduling lower priority tasks.
2146   *
2147   * All CPUs that are scheduling a lower priority task will increment the
2148   * rto_loop_next variable. This will make sure that the IRQ work iterator
2149   * checks all RT overloaded CPUs whenever a CPU schedules a new lower
2150   * priority task, even if the iterator is in the middle of a scan. Incrementing
2151   * rto_loop_next will cause the iterator to perform another scan.
2152   *
2153   */
2154  static int rto_next_cpu(struct root_domain *rd)
2155  {
2156  	int next;
2157  	int cpu;
2158  
2159  	/*
2160  	 * When starting the IPI RT pushing, rto_cpu is set to -1, and
2161  	 * rto_next_cpu() will simply return the first CPU found in
2162  	 * the rto_mask.
2163  	 *
2164  	 * If rto_next_cpu() is called while rto_cpu is a valid CPU, it
2165  	 * will return the next CPU found in the rto_mask.
2166  	 *
2167  	 * If there are no more CPUs left in the rto_mask, then a check is made
2168  	 * against rto_loop and rto_loop_next. rto_loop is only updated with
2169  	 * the rto_lock held, but any CPU may increment the rto_loop_next
2170  	 * without any locking.
2171  	 */
2172  	for (;;) {
2173  
2174  		/* When rto_cpu is -1 this acts like cpumask_first() */
2175  		cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
2176  
2177  		rd->rto_cpu = cpu;
2178  
2179  		if (cpu < nr_cpu_ids)
2180  			return cpu;
2181  
2182  		rd->rto_cpu = -1;
2183  
2184  		/*
2185  		 * ACQUIRE ensures we see the @rto_mask changes
2186  		 * made prior to the @next value observed.
2187  		 *
2188  		 * Matches WMB in rt_set_overload().
2189  		 */
2190  		next = atomic_read_acquire(&rd->rto_loop_next);
2191  
2192  		if (rd->rto_loop == next)
2193  			break;
2194  
2195  		rd->rto_loop = next;
2196  	}
2197  
2198  	return -1;
2199  }
2200  
2201  static inline bool rto_start_trylock(atomic_t *v)
2202  {
2203  	return !atomic_cmpxchg_acquire(v, 0, 1);
2204  }
2205  
2206  static inline void rto_start_unlock(atomic_t *v)
2207  {
2208  	atomic_set_release(v, 0);
2209  }
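
/*
 * Illustrative usage sketch (not part of the kernel source): the two
 * helpers above form a simple acquire/release try-lock on an atomic_t.
 * Only the caller that wins the 0 -> 1 transition proceeds; everyone
 * else bails out instead of spinning.  The variable name is invented.
 *
 *	static atomic_t example_start = ATOMIC_INIT(0);
 *
 *	if (!rto_start_trylock(&example_start))
 *		return;				// someone else already runs it
 *	// ... single-initiator work ...
 *	rto_start_unlock(&example_start);
 *
 * The acquire on the cmpxchg and the release on the store order the
 * protected work against the lock word itself.
 */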
2210  
2211  static void tell_cpu_to_push(struct rq *rq)
2212  {
2213  	int cpu = -1;
2214  
2215  	/* Keep the loop going if the IPI is currently active */
2216  	atomic_inc(&rq->rd->rto_loop_next);
2217  
2218  	/* Only one CPU can initiate a loop at a time */
2219  	if (!rto_start_trylock(&rq->rd->rto_loop_start))
2220  		return;
2221  
2222  	raw_spin_lock(&rq->rd->rto_lock);
2223  
2224  	/*
2225  	 * The rto_cpu is updated under the lock. If it holds a valid CPU,
2226  	 * then the IPI is still running and will continue due to the
2227  	 * update to rto_loop_next, and nothing needs to be done here.
2228  	 * Otherwise it is finishing up and an IPI needs to be sent.
2229  	 */
2230  	if (rq->rd->rto_cpu < 0)
2231  		cpu = rto_next_cpu(rq->rd);
2232  
2233  	raw_spin_unlock(&rq->rd->rto_lock);
2234  
2235  	rto_start_unlock(&rq->rd->rto_loop_start);
2236  
2237  	if (cpu >= 0) {
2238  		/* Make sure the rd does not get freed while pushing */
2239  		sched_get_rd(rq->rd);
2240  		irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2241  	}
2242  }
2243  
2244  /* Called from hardirq context */
2245  void rto_push_irq_work_func(struct irq_work *work)
2246  {
2247  	struct root_domain *rd =
2248  		container_of(work, struct root_domain, rto_push_work);
2249  	struct rq *rq;
2250  	int cpu;
2251  
2252  	rq = this_rq();
2253  
2254  	/*
2255  	 * We do not need to grab the lock to check has_pushable_tasks():
2256  	 * whenever it changes, a check is made whether a push is possible.
2257  	 */
2258  	if (has_pushable_tasks(rq)) {
2259  		raw_spin_rq_lock(rq);
2260  		while (push_rt_task(rq, true))
2261  			;
2262  		raw_spin_rq_unlock(rq);
2263  	}
2264  
2265  	raw_spin_lock(&rd->rto_lock);
2266  
2267  	/* Pass the IPI to the next rt overloaded queue */
2268  	cpu = rto_next_cpu(rd);
2269  
2270  	raw_spin_unlock(&rd->rto_lock);
2271  
2272  	if (cpu < 0) {
2273  		sched_put_rd(rd);
2274  		return;
2275  	}
2276  
2277  	/* Try the next RT overloaded CPU */
2278  	irq_work_queue_on(&rd->rto_push_work, cpu);
2279  }
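
/*
 * Illustrative walk-through (not part of the kernel source), assuming
 * CPUs 1, 4 and 6 are RT overloaded and CPU 0 just scheduled a lower
 * priority task:
 *
 *	CPU 0: tell_cpu_to_push()   -> rto_next_cpu() returns 1,
 *	       irq_work queued on CPU 1
 *	CPU 1: rto_push_irq_work_func(): pushes what it can, then
 *	       rto_next_cpu() returns 4, irq_work re-queued on CPU 4
 *	CPU 4: same, then hands the irq_work off to CPU 6
 *	CPU 6: rto_next_cpu() returns -1 once rto_loop has caught up
 *	       with rto_loop_next -> sched_put_rd() and stop
 *
 * If yet another CPU lowers its priority while the chain is running, it
 * bumps rto_loop_next, which makes the final hop start another full
 * pass instead of stopping.
 */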
2280  #endif /* HAVE_RT_PUSH_IPI */
2281  
2282  static void pull_rt_task(struct rq *this_rq)
2283  {
2284  	int this_cpu = this_rq->cpu, cpu;
2285  	bool resched = false;
2286  	struct task_struct *p, *push_task;
2287  	struct rq *src_rq;
2288  	int rt_overload_count = rt_overloaded(this_rq);
2289  
2290  	if (likely(!rt_overload_count))
2291  		return;
2292  
2293  	/*
2294  	 * Match the barrier from rt_set_overload(); this guarantees that if we
2295  	 * see overloaded we must also see the rto_mask bit.
2296  	 */
2297  	smp_rmb();
2298  
2299  	/* If we are the only overloaded CPU, do nothing */
2300  	if (rt_overload_count == 1 &&
2301  	    cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2302  		return;
2303  
2304  #ifdef HAVE_RT_PUSH_IPI
2305  	if (sched_feat(RT_PUSH_IPI)) {
2306  		tell_cpu_to_push(this_rq);
2307  		return;
2308  	}
2309  #endif
2310  
2311  	for_each_cpu(cpu, this_rq->rd->rto_mask) {
2312  		if (this_cpu == cpu)
2313  			continue;
2314  
2315  		src_rq = cpu_rq(cpu);
2316  
2317  		/*
2318  		 * Don't bother taking the src_rq->lock if the next highest
2319  		 * task is known to be lower-priority than our current task.
2320  		 * This may look racy, but if this value is about to go
2321  		 * logically higher, the src_rq will push this task away.
2322  		 * And if it is going logically lower, we do not care.
2323  		 */
2324  		if (src_rq->rt.highest_prio.next >=
2325  		    this_rq->rt.highest_prio.curr)
2326  			continue;
2327  
2328  		/*
2329  		 * We can potentially drop this_rq's lock in
2330  		 * double_lock_balance, and another CPU could
2331  		 * alter this_rq
2332  		 */
2333  		push_task = NULL;
2334  		double_lock_balance(this_rq, src_rq);
2335  
2336  		/*
2337  		 * We can only pull a task that is pushable
2338  		 * on its rq, and no others.
2339  		 */
2340  		p = pick_highest_pushable_task(src_rq, this_cpu);
2341  
2342  		/*
2343  		 * Do we have an RT task that preempts
2344  		 * the to-be-scheduled task?
2345  		 */
2346  		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2347  			WARN_ON(p == src_rq->curr);
2348  			WARN_ON(!task_on_rq_queued(p));
2349  
2350  			/*
2351  			 * There's a chance that p is higher in priority
2352  			 * than what's currently running on its CPU.
2353  			 * This is likely because p is just waking up
2354  			 * and hasn't had a chance to schedule. We only
2355  			 * pull p if it is lower in priority than the
2356  			 * current task on the run queue.
2357  			 */
2358  			if (p->prio < src_rq->curr->prio)
2359  				goto skip;
2360  
2361  			if (is_migration_disabled(p)) {
2362  				push_task = get_push_task(src_rq);
2363  			} else {
2364  				deactivate_task(src_rq, p, 0);
2365  				set_task_cpu(p, this_cpu);
2366  				activate_task(this_rq, p, 0);
2367  				resched = true;
2368  			}
2369  			/*
2370  			 * We continue with the search, just in
2371  			 * case there's an even higher prio task
2372  			 * in another runqueue. (low likelihood
2373  			 * but possible)
2374  			 */
2375  		}
2376  skip:
2377  		double_unlock_balance(this_rq, src_rq);
2378  
2379  		if (push_task) {
2380  			preempt_disable();
2381  			raw_spin_rq_unlock(this_rq);
2382  			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
2383  					    push_task, &src_rq->push_work);
2384  			preempt_enable();
2385  			raw_spin_rq_lock(this_rq);
2386  		}
2387  	}
2388  
2389  	if (resched)
2390  		resched_curr(this_rq);
2391  }
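
/*
 * Illustrative example (not part of the kernel source) of the pull
 * filters above, remembering that a lower prio value means a higher
 * priority:
 *
 *	this_rq->rt.highest_prio.curr == 60
 *	src_rq->rt.highest_prio.next  == 40 -> 40 < 60, worth locking src_rq
 *	src_rq->rt.highest_prio.next  == 70 -> skipped without taking the lock
 *
 * After locking, a pushable task p with p->prio == 40 is pulled only if
 * it does not also preempt src_rq->curr (i.e. p->prio >= src_rq->curr->prio);
 * otherwise src_rq is about to run it anyway and we skip it.
 */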
2392  
2393  /*
2394   * If we are not running and we are not going to reschedule soon, we should
2395   * try to push tasks away now.
2396   */
2397  static void task_woken_rt(struct rq *rq, struct task_struct *p)
2398  {
2399  	bool need_to_push = !task_on_cpu(rq, p) &&
2400  			    !test_tsk_need_resched(rq->curr) &&
2401  			    p->nr_cpus_allowed > 1 &&
2402  			    (dl_task(rq->curr) || rt_task(rq->curr)) &&
2403  			    (rq->curr->nr_cpus_allowed < 2 ||
2404  			     rq->curr->prio <= p->prio);
2405  
2406  	if (need_to_push)
2407  		push_rt_tasks(rq);
2408  }
2409  
2410  /* Assumes rq->lock is held */
2411  static void rq_online_rt(struct rq *rq)
2412  {
2413  	if (rq->rt.overloaded)
2414  		rt_set_overload(rq);
2415  
2416  	__enable_runtime(rq);
2417  
2418  	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2419  }
2420  
2421  /* Assumes rq->lock is held */
2422  static void rq_offline_rt(struct rq *rq)
2423  {
2424  	if (rq->rt.overloaded)
2425  		rt_clear_overload(rq);
2426  
2427  	__disable_runtime(rq);
2428  
2429  	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2430  }
2431  
2432  /*
2433   * When switching from the rt queue, we bring ourselves to a position
2434   * where we might want to pull RT tasks from other runqueues.
2435   */
2436  static void switched_from_rt(struct rq *rq, struct task_struct *p)
2437  {
2438  	/*
2439  	 * If there are other RT tasks then we will reschedule
2440  	 * and the scheduling of the other RT tasks will handle
2441  	 * the balancing. But if we are the last RT task
2442  	 * we may need to handle the pulling of RT tasks
2443  	 * now.
2444  	 */
2445  	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2446  		return;
2447  
2448  	rt_queue_pull_task(rq);
2449  }
2450  
2451  void __init init_sched_rt_class(void)
2452  {
2453  	unsigned int i;
2454  
2455  	for_each_possible_cpu(i) {
2456  		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2457  					GFP_KERNEL, cpu_to_node(i));
2458  	}
2459  }
2460  #endif /* CONFIG_SMP */
2461  
2462  /*
2463   * When switching a task to RT, we may overload the runqueue
2464   * with RT tasks. In this case we try to push them off to
2465   * other runqueues.
2466   */
2467  static void switched_to_rt(struct rq *rq, struct task_struct *p)
2468  {
2469  	/*
2470  	 * If we are running, update the avg_rt tracking, as the running time
2471  	 * will from now on be accounted to it.
2472  	 */
2473  	if (task_current(rq, p)) {
2474  		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
2475  		return;
2476  	}
2477  
2478  	/*
2479  	 * If we are not running we may need to preempt the current
2480  	 * running task. If that current running task is also an RT task
2481  	 * then see if we can move to another run queue.
2482  	 */
2483  	if (task_on_rq_queued(p)) {
2484  #ifdef CONFIG_SMP
2485  		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2486  			rt_queue_push_tasks(rq);
2487  #endif /* CONFIG_SMP */
2488  		if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2489  			resched_curr(rq);
2490  	}
2491  }
2492  
2493  /*
2494   * Priority of the task has changed. This may cause
2495   * us to initiate a push or pull.
2496   */
2497  static void
2498  prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2499  {
2500  	if (!task_on_rq_queued(p))
2501  		return;
2502  
2503  	if (task_current(rq, p)) {
2504  #ifdef CONFIG_SMP
2505  		/*
2506  		 * If our priority decreases while running, we
2507  		 * may need to pull tasks to this runqueue.
2508  		 */
2509  		if (oldprio < p->prio)
2510  			rt_queue_pull_task(rq);
2511  
2512  		/*
2513  		 * If there's a higher priority task waiting to run
2514  		 * then reschedule.
2515  		 */
2516  		if (p->prio > rq->rt.highest_prio.curr)
2517  			resched_curr(rq);
2518  #else
2519  		/* For UP simply resched on drop of prio */
2520  		if (oldprio < p->prio)
2521  			resched_curr(rq);
2522  #endif /* CONFIG_SMP */
2523  	} else {
2524  		/*
2525  		 * This task is not running, but if its priority
2526  		 * is higher than that of the currently running task,
2527  		 * then reschedule.
2528  		 */
2529  		if (p->prio < rq->curr->prio)
2530  			resched_curr(rq);
2531  	}
2532  }
2533  
2534  #ifdef CONFIG_POSIX_TIMERS
2535  static void watchdog(struct rq *rq, struct task_struct *p)
2536  {
2537  	unsigned long soft, hard;
2538  
2539  	/* max may change after cur was read; this will be fixed next tick */
2540  	soft = task_rlimit(p, RLIMIT_RTTIME);
2541  	hard = task_rlimit_max(p, RLIMIT_RTTIME);
2542  
2543  	if (soft != RLIM_INFINITY) {
2544  		unsigned long next;
2545  
2546  		if (p->rt.watchdog_stamp != jiffies) {
2547  			p->rt.timeout++;
2548  			p->rt.watchdog_stamp = jiffies;
2549  		}
2550  
2551  		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
2552  		if (p->rt.timeout > next) {
2553  			posix_cputimers_rt_watchdog(&p->posix_cputimers,
2554  						    p->se.sum_exec_runtime);
2555  		}
2556  	}
2557  }
2558  #else
2559  static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2560  #endif
2561  
2562  /*
2563   * scheduler tick hitting a task of our scheduling class.
2564   *
2565   * NOTE: This function can be called remotely by the tick offload that
2566   * goes along full dynticks. Therefore no local assumption can be made
2567   * and everything must be accessed through the @rq and @curr passed in
2568   * parameters.
2569   */
2570  static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2571  {
2572  	struct sched_rt_entity *rt_se = &p->rt;
2573  
2574  	update_curr_rt(rq);
2575  	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
2576  
2577  	watchdog(rq, p);
2578  
2579  	/*
2580  	 * RR tasks need a special form of time-slice management.
2581  	 * FIFO tasks have no timeslices.
2582  	 */
2583  	if (p->policy != SCHED_RR)
2584  		return;
2585  
2586  	if (--p->rt.time_slice)
2587  		return;
2588  
2589  	p->rt.time_slice = sched_rr_timeslice;
2590  
2591  	/*
2592  	 * Requeue to the end of queue if we (and all of our ancestors) are not
2593  	 * the only element on the queue.
2594  	 */
2595  	for_each_sched_rt_entity(rt_se) {
2596  		if (rt_se->run_list.prev != rt_se->run_list.next) {
2597  			requeue_task_rt(rq, p, 0);
2598  			resched_curr(rq);
2599  			return;
2600  		}
2601  	}
2602  }
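
/*
 * Worked example (illustrative, assuming HZ == 250 and the default
 * RR_TIMESLICE of 100 ms): sched_rr_timeslice is kept in jiffies, so the
 * slice is 100 ms / (1000 ms / 250) = 25 ticks.  A SCHED_RR task then
 * sees p->rt.time_slice count down 25, 24, ..., 1 on successive ticks;
 * on the tick that takes it to 0 the slice is refilled to 25 and the
 * task is moved to the tail of its priority queue, provided it (or one
 * of its scheduling-entity ancestors) is not alone on that queue.
 */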
2603  
2604  static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2605  {
2606  	/*
2607  	 * Time slice is 0 for SCHED_FIFO tasks
2608  	 */
2609  	if (task->policy == SCHED_RR)
2610  		return sched_rr_timeslice;
2611  	else
2612  		return 0;
2613  }
2614  
2615  #ifdef CONFIG_SCHED_CORE
2616  static int task_is_throttled_rt(struct task_struct *p, int cpu)
2617  {
2618  	struct rt_rq *rt_rq;
2619  
2620  #ifdef CONFIG_RT_GROUP_SCHED
2621  	rt_rq = task_group(p)->rt_rq[cpu];
2622  #else
2623  	rt_rq = &cpu_rq(cpu)->rt;
2624  #endif
2625  
2626  	return rt_rq_throttled(rt_rq);
2627  }
2628  #endif
2629  
2630  DEFINE_SCHED_CLASS(rt) = {
2631  
2632  	.enqueue_task		= enqueue_task_rt,
2633  	.dequeue_task		= dequeue_task_rt,
2634  	.yield_task		= yield_task_rt,
2635  
2636  	.wakeup_preempt		= wakeup_preempt_rt,
2637  
2638  	.pick_task		= pick_task_rt,
2639  	.put_prev_task		= put_prev_task_rt,
2640  	.set_next_task          = set_next_task_rt,
2641  
2642  #ifdef CONFIG_SMP
2643  	.balance		= balance_rt,
2644  	.select_task_rq		= select_task_rq_rt,
2645  	.set_cpus_allowed       = set_cpus_allowed_common,
2646  	.rq_online              = rq_online_rt,
2647  	.rq_offline             = rq_offline_rt,
2648  	.task_woken		= task_woken_rt,
2649  	.switched_from		= switched_from_rt,
2650  	.find_lock_rq		= find_lock_lowest_rq,
2651  #endif
2652  
2653  	.task_tick		= task_tick_rt,
2654  
2655  	.get_rr_interval	= get_rr_interval_rt,
2656  
2657  	.prio_changed		= prio_changed_rt,
2658  	.switched_to		= switched_to_rt,
2659  
2660  	.update_curr		= update_curr_rt,
2661  
2662  #ifdef CONFIG_SCHED_CORE
2663  	.task_is_throttled	= task_is_throttled_rt,
2664  #endif
2665  
2666  #ifdef CONFIG_UCLAMP_TASK
2667  	.uclamp_enabled		= 1,
2668  #endif
2669  };
2670  
2671  #ifdef CONFIG_RT_GROUP_SCHED
2672  /*
2673   * Ensure that the real time constraints are schedulable.
2674   */
2675  static DEFINE_MUTEX(rt_constraints_mutex);
2676  
2677  static inline int tg_has_rt_tasks(struct task_group *tg)
2678  {
2679  	struct task_struct *task;
2680  	struct css_task_iter it;
2681  	int ret = 0;
2682  
2683  	/*
2684  	 * Autogroups do not have RT tasks; see autogroup_create().
2685  	 */
2686  	if (task_group_is_autogroup(tg))
2687  		return 0;
2688  
2689  	css_task_iter_start(&tg->css, 0, &it);
2690  	while (!ret && (task = css_task_iter_next(&it)))
2691  		ret |= rt_task(task);
2692  	css_task_iter_end(&it);
2693  
2694  	return ret;
2695  }
2696  
2697  struct rt_schedulable_data {
2698  	struct task_group *tg;
2699  	u64 rt_period;
2700  	u64 rt_runtime;
2701  };
2702  
2703  static int tg_rt_schedulable(struct task_group *tg, void *data)
2704  {
2705  	struct rt_schedulable_data *d = data;
2706  	struct task_group *child;
2707  	unsigned long total, sum = 0;
2708  	u64 period, runtime;
2709  
2710  	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2711  	runtime = tg->rt_bandwidth.rt_runtime;
2712  
2713  	if (tg == d->tg) {
2714  		period = d->rt_period;
2715  		runtime = d->rt_runtime;
2716  	}
2717  
2718  	/*
2719  	 * Cannot have more runtime than the period.
2720  	 */
2721  	if (runtime > period && runtime != RUNTIME_INF)
2722  		return -EINVAL;
2723  
2724  	/*
2725  	 * Ensure we don't starve existing RT tasks if runtime turns zero.
2726  	 */
2727  	if (rt_bandwidth_enabled() && !runtime &&
2728  	    tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
2729  		return -EBUSY;
2730  
2731  	total = to_ratio(period, runtime);
2732  
2733  	/*
2734  	 * Nobody can have more than the global setting allows.
2735  	 */
2736  	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2737  		return -EINVAL;
2738  
2739  	/*
2740  	 * The sum of our children's runtime should not exceed our own.
2741  	 */
2742  	list_for_each_entry_rcu(child, &tg->children, siblings) {
2743  		period = ktime_to_ns(child->rt_bandwidth.rt_period);
2744  		runtime = child->rt_bandwidth.rt_runtime;
2745  
2746  		if (child == d->tg) {
2747  			period = d->rt_period;
2748  			runtime = d->rt_runtime;
2749  		}
2750  
2751  		sum += to_ratio(period, runtime);
2752  	}
2753  
2754  	if (sum > total)
2755  		return -EINVAL;
2756  
2757  	return 0;
2758  }
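
/*
 * Worked example (illustrative, assuming BW_SHIFT == 20 so that
 * to_ratio() scales runtime/period by 2^20 == 1048576):
 *
 *	runtime = 0.95 s, period = 1 s -> total ~= 996147
 *	a child asking for 0.5 s / 1 s -> contributes ~524288, which fits
 *	two children asking for 0.6 s / 1 s each -> sum ~= 1258291, which
 *	exceeds the parent's 996147 and is rejected with -EINVAL by the
 *	sum check above.
 */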
2759  
2760  static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2761  {
2762  	int ret;
2763  
2764  	struct rt_schedulable_data data = {
2765  		.tg = tg,
2766  		.rt_period = period,
2767  		.rt_runtime = runtime,
2768  	};
2769  
2770  	rcu_read_lock();
2771  	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2772  	rcu_read_unlock();
2773  
2774  	return ret;
2775  }
2776  
2777  static int tg_set_rt_bandwidth(struct task_group *tg,
2778  		u64 rt_period, u64 rt_runtime)
2779  {
2780  	int i, err = 0;
2781  
2782  	/*
2783  	 * Disallowing the root group RT runtime is BAD; it would prevent the
2784  	 * kernel from creating (and/or operating) RT threads.
2785  	 */
2786  	if (tg == &root_task_group && rt_runtime == 0)
2787  		return -EINVAL;
2788  
2789  	/* A zero period doesn't make any sense. */
2790  	if (rt_period == 0)
2791  		return -EINVAL;
2792  
2793  	/*
2794  	 * Bound the quota to defend against overflow during the bandwidth shift.
2795  	 */
2796  	if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
2797  		return -EINVAL;
2798  
2799  	mutex_lock(&rt_constraints_mutex);
2800  	err = __rt_schedulable(tg, rt_period, rt_runtime);
2801  	if (err)
2802  		goto unlock;
2803  
2804  	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2805  	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2806  	tg->rt_bandwidth.rt_runtime = rt_runtime;
2807  
2808  	for_each_possible_cpu(i) {
2809  		struct rt_rq *rt_rq = tg->rt_rq[i];
2810  
2811  		raw_spin_lock(&rt_rq->rt_runtime_lock);
2812  		rt_rq->rt_runtime = rt_runtime;
2813  		raw_spin_unlock(&rt_rq->rt_runtime_lock);
2814  	}
2815  	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2816  unlock:
2817  	mutex_unlock(&rt_constraints_mutex);
2818  
2819  	return err;
2820  }
2821  
2822  int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2823  {
2824  	u64 rt_runtime, rt_period;
2825  
2826  	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2827  	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2828  	if (rt_runtime_us < 0)
2829  		rt_runtime = RUNTIME_INF;
2830  	else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
2831  		return -EINVAL;
2832  
2833  	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2834  }
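
/*
 * Illustrative examples (not part of the kernel source) of the
 * conversion above:
 *
 *	rt_runtime_us == 950000 -> rt_runtime = 950000 * NSEC_PER_USEC (0.95 s)
 *	rt_runtime_us == -1     -> rt_runtime = RUNTIME_INF (no limit)
 *	rt_runtime_us  > U64_MAX / NSEC_PER_USEC -> -EINVAL (would overflow)
 *
 * Only the runtime half of the bandwidth pair changes here; the period
 * is carried over unmodified.
 */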
2835  
2836  long sched_group_rt_runtime(struct task_group *tg)
2837  {
2838  	u64 rt_runtime_us;
2839  
2840  	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2841  		return -1;
2842  
2843  	rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2844  	do_div(rt_runtime_us, NSEC_PER_USEC);
2845  	return rt_runtime_us;
2846  }
2847  
2848  int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2849  {
2850  	u64 rt_runtime, rt_period;
2851  
2852  	if (rt_period_us > U64_MAX / NSEC_PER_USEC)
2853  		return -EINVAL;
2854  
2855  	rt_period = rt_period_us * NSEC_PER_USEC;
2856  	rt_runtime = tg->rt_bandwidth.rt_runtime;
2857  
2858  	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2859  }
2860  
2861  long sched_group_rt_period(struct task_group *tg)
2862  {
2863  	u64 rt_period_us;
2864  
2865  	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2866  	do_div(rt_period_us, NSEC_PER_USEC);
2867  	return rt_period_us;
2868  }
2869  
2870  #ifdef CONFIG_SYSCTL
2871  static int sched_rt_global_constraints(void)
2872  {
2873  	int ret = 0;
2874  
2875  	mutex_lock(&rt_constraints_mutex);
2876  	ret = __rt_schedulable(NULL, 0, 0);
2877  	mutex_unlock(&rt_constraints_mutex);
2878  
2879  	return ret;
2880  }
2881  #endif /* CONFIG_SYSCTL */
2882  
2883  int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2884  {
2885  	/* Don't accept real-time tasks when there is no way for them to run */
2886  	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2887  		return 0;
2888  
2889  	return 1;
2890  }
2891  
2892  #else /* !CONFIG_RT_GROUP_SCHED */
2893  
2894  #ifdef CONFIG_SYSCTL
2895  static int sched_rt_global_constraints(void)
2896  {
2897  	return 0;
2898  }
2899  #endif /* CONFIG_SYSCTL */
2900  #endif /* CONFIG_RT_GROUP_SCHED */
2901  
2902  #ifdef CONFIG_SYSCTL
2903  static int sched_rt_global_validate(void)
2904  {
2905  	if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2906  		((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
2907  		 ((u64)sysctl_sched_rt_runtime *
2908  			NSEC_PER_USEC > max_rt_runtime)))
2909  		return -EINVAL;
2910  
2911  	return 0;
2912  }
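
/*
 * Illustrative examples (not part of the kernel source) of what the
 * validation above accepts and rejects:
 *
 *	runtime =  950000 us, period = 1000000 us -> OK (runtime <= period)
 *	runtime = 1100000 us, period = 1000000 us -> -EINVAL (runtime > period)
 *	runtime = -1 (RUNTIME_INF)                -> OK, RT throttling disabled
 *	runtime so large that runtime * NSEC_PER_USEC > max_rt_runtime
 *						  -> -EINVAL (overflow guard)
 */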
2913  
2914  static void sched_rt_do_global(void)
2915  {
2916  }
2917  
2918  static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
2919  		size_t *lenp, loff_t *ppos)
2920  {
2921  	int old_period, old_runtime;
2922  	static DEFINE_MUTEX(mutex);
2923  	int ret;
2924  
2925  	mutex_lock(&mutex);
2926  	old_period = sysctl_sched_rt_period;
2927  	old_runtime = sysctl_sched_rt_runtime;
2928  
2929  	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2930  
2931  	if (!ret && write) {
2932  		ret = sched_rt_global_validate();
2933  		if (ret)
2934  			goto undo;
2935  
2936  		ret = sched_dl_global_validate();
2937  		if (ret)
2938  			goto undo;
2939  
2940  		ret = sched_rt_global_constraints();
2941  		if (ret)
2942  			goto undo;
2943  
2944  		sched_rt_do_global();
2945  		sched_dl_do_global();
2946  	}
2947  	if (0) {
2948  undo:
2949  		sysctl_sched_rt_period = old_period;
2950  		sysctl_sched_rt_runtime = old_runtime;
2951  	}
2952  	mutex_unlock(&mutex);
2953  
2954  	return ret;
2955  }
2956  
2957  static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
2958  		size_t *lenp, loff_t *ppos)
2959  {
2960  	int ret;
2961  	static DEFINE_MUTEX(mutex);
2962  
2963  	mutex_lock(&mutex);
2964  	ret = proc_dointvec(table, write, buffer, lenp, ppos);
2965  	/*
2966  	 * Make sure that internally we keep jiffies.
2967  	 * Also, writing zero resets the time-slice to the default:
2968  	 */
2969  	if (!ret && write) {
2970  		sched_rr_timeslice =
2971  			sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
2972  			msecs_to_jiffies(sysctl_sched_rr_timeslice);
2973  
2974  		if (sysctl_sched_rr_timeslice <= 0)
2975  			sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE);
2976  	}
2977  	mutex_unlock(&mutex);
2978  
2979  	return ret;
2980  }
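
/*
 * Worked example (illustrative, assuming HZ == 250): writing 100 to
 * /proc/sys/kernel/sched_rr_timeslice_ms stores 100 in
 * sysctl_sched_rr_timeslice and sets sched_rr_timeslice to
 * msecs_to_jiffies(100) == 25 jiffies.  Writing 0 or a negative value
 * restores the RR_TIMESLICE default and reports it back in milliseconds
 * via jiffies_to_msecs(RR_TIMESLICE).
 */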
2981  #endif /* CONFIG_SYSCTL */
2982  
2983  #ifdef CONFIG_SCHED_DEBUG
2984  void print_rt_stats(struct seq_file *m, int cpu)
2985  {
2986  	rt_rq_iter_t iter;
2987  	struct rt_rq *rt_rq;
2988  
2989  	rcu_read_lock();
2990  	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2991  		print_rt_rq(m, cpu, rt_rq);
2992  	rcu_read_unlock();
2993  }
2994  #endif /* CONFIG_SCHED_DEBUG */
2995