1b886d83cSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
2391e43daSPeter Zijlstra /*
3391e43daSPeter Zijlstra * kernel/sched/cpupri.c
4391e43daSPeter Zijlstra *
5391e43daSPeter Zijlstra * CPU priority management
6391e43daSPeter Zijlstra *
7391e43daSPeter Zijlstra * Copyright (C) 2007-2008 Novell
8391e43daSPeter Zijlstra *
9391e43daSPeter Zijlstra * Author: Gregory Haskins <ghaskins@novell.com>
10391e43daSPeter Zijlstra *
11391e43daSPeter Zijlstra * This code tracks the priority of each CPU so that global migration
12391e43daSPeter Zijlstra * decisions are easy to calculate. Each CPU can be in a state as follows:
13391e43daSPeter Zijlstra *
14b13772f8SPeter Zijlstra * (INVALID), NORMAL, RT1, ... RT99, HIGHER
15391e43daSPeter Zijlstra *
16391e43daSPeter Zijlstra * going from the lowest priority to the highest. CPUs in the INVALID state
17391e43daSPeter Zijlstra * are not eligible for routing. The system maintains this state with
1897fb7a0aSIngo Molnar * a 2 dimensional bitmap (the first for priority class, the second for CPUs
19391e43daSPeter Zijlstra * in that class). Therefore a typical application without affinity
20391e43daSPeter Zijlstra * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
21391e43daSPeter Zijlstra * searches). For tasks with affinity restrictions, the algorithm has a
22b13772f8SPeter Zijlstra * worst case complexity of O(min(101, nr_domcpus)), though the scenario that
23391e43daSPeter Zijlstra * yields the worst case search is fairly contrived.
24391e43daSPeter Zijlstra */
25391e43daSPeter Zijlstra
26934fc331SPeter Zijlstra /*
27934fc331SPeter Zijlstra * p->rt_priority p->prio newpri cpupri
28934fc331SPeter Zijlstra *
29934fc331SPeter Zijlstra * -1 -1 (CPUPRI_INVALID)
30934fc331SPeter Zijlstra *
31934fc331SPeter Zijlstra * 99 0 (CPUPRI_NORMAL)
32934fc331SPeter Zijlstra *
33934fc331SPeter Zijlstra * 1 98 98 1
34934fc331SPeter Zijlstra * ...
35934fc331SPeter Zijlstra * 49 50 50 49
36934fc331SPeter Zijlstra * 50 49 49 50
37934fc331SPeter Zijlstra * ...
38934fc331SPeter Zijlstra * 99 0 0 99
39b13772f8SPeter Zijlstra *
40b13772f8SPeter Zijlstra * 100 100 (CPUPRI_HIGHER)
41934fc331SPeter Zijlstra */
convert_prio(int prio)42391e43daSPeter Zijlstra static int convert_prio(int prio)
43391e43daSPeter Zijlstra {
44391e43daSPeter Zijlstra int cpupri;
45391e43daSPeter Zijlstra
46934fc331SPeter Zijlstra switch (prio) {
47934fc331SPeter Zijlstra case CPUPRI_INVALID:
48934fc331SPeter Zijlstra cpupri = CPUPRI_INVALID; /* -1 */
49934fc331SPeter Zijlstra break;
50934fc331SPeter Zijlstra
51934fc331SPeter Zijlstra case 0 ... 98:
52934fc331SPeter Zijlstra cpupri = MAX_RT_PRIO-1 - prio; /* 1 ... 99 */
53934fc331SPeter Zijlstra break;
54934fc331SPeter Zijlstra
55934fc331SPeter Zijlstra case MAX_RT_PRIO-1:
56934fc331SPeter Zijlstra cpupri = CPUPRI_NORMAL; /* 0 */
57934fc331SPeter Zijlstra break;
58b13772f8SPeter Zijlstra
59b13772f8SPeter Zijlstra case MAX_RT_PRIO:
60b13772f8SPeter Zijlstra cpupri = CPUPRI_HIGHER; /* 100 */
61b13772f8SPeter Zijlstra break;
62934fc331SPeter Zijlstra }
63391e43daSPeter Zijlstra
64391e43daSPeter Zijlstra return cpupri;
65391e43daSPeter Zijlstra }
66391e43daSPeter Zijlstra
/*
 * __cpupri_find - probe a single priority vector for CPUs that can take @p
 * @cp:          The cpupri context
 * @p:           The task looking for a CPU
 * @lowest_mask: If non-NULL, filled with the eligible CPUs at this level
 * @idx:         The priority level (vector index) to probe
 *
 * Returns 1 when at least one CPU in the vector's mask is also in
 * p->cpus_mask (and still active, when @lowest_mask is requested);
 * 0 when this level has nothing to offer and the caller should move on.
 */
static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
				struct cpumask *lowest_mask, int idx)
{
	struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
	int skip = 0;

	/* An empty vector is almost certainly not worth scanning ... */
	if (!atomic_read(&(vec)->count))
		skip = 1;
	/*
	 * When looking at the vector, we need to read the counter,
	 * do a memory barrier, then read the mask.
	 *
	 * Note: This is still all racy, but we can deal with it.
	 * Ideally, we only want to look at masks that are set.
	 *
	 * If a mask is not set, then the only thing wrong is that we
	 * did a little more work than necessary.
	 *
	 * If we read a zero count but the mask is set, because of the
	 * memory barriers, that can only happen when the highest prio
	 * task for a run queue has left the run queue, in which case,
	 * it will be followed by a pull. If the task we are processing
	 * fails to find a proper place to go, that pull request will
	 * pull this task if the run queue is running at a lower
	 * priority.
	 */
	smp_rmb();

	/* Need to do the rmb for every iteration */
	if (skip)
		return 0;

	/* No overlap between the task's allowed CPUs and this vector. */
	if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
		return 0;

	if (lowest_mask) {
		cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
		/* Never hand back CPUs that are going (or have gone) offline. */
		cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);

		/*
		 * We have to ensure that we have at least one bit
		 * still set in the array, since the map could have
		 * been concurrently emptied between the first and
		 * second reads of vec->mask. If we hit this
		 * condition, simply act as though we never hit this
		 * priority level and continue on.
		 */
		if (cpumask_empty(lowest_mask))
			return 0;
	}

	return 1;
}
120d9cb236bSQais Yousef
cpupri_find(struct cpupri * cp,struct task_struct * p,struct cpumask * lowest_mask)121a1bd02e1SQais Yousef int cpupri_find(struct cpupri *cp, struct task_struct *p,
122a1bd02e1SQais Yousef struct cpumask *lowest_mask)
123a1bd02e1SQais Yousef {
124a1bd02e1SQais Yousef return cpupri_find_fitness(cp, p, lowest_mask, NULL);
125a1bd02e1SQais Yousef }
126a1bd02e1SQais Yousef
127d9cb236bSQais Yousef /**
128a1bd02e1SQais Yousef * cpupri_find_fitness - find the best (lowest-pri) CPU in the system
129d9cb236bSQais Yousef * @cp: The cpupri context
130d9cb236bSQais Yousef * @p: The task
131d9cb236bSQais Yousef * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
132d9cb236bSQais Yousef * @fitness_fn: A pointer to a function to do custom checks whether the CPU
133d9cb236bSQais Yousef * fits a specific criteria so that we only return those CPUs.
134d9cb236bSQais Yousef *
135d9cb236bSQais Yousef * Note: This function returns the recommended CPUs as calculated during the
136d9cb236bSQais Yousef * current invocation. By the time the call returns, the CPUs may have in
137d9cb236bSQais Yousef * fact changed priorities any number of times. While not ideal, it is not
138d9cb236bSQais Yousef * an issue of correctness since the normal rebalancer logic will correct
139d9cb236bSQais Yousef * any discrepancies created by racing against the uncertainty of the current
140d9cb236bSQais Yousef * priority configuration.
141d9cb236bSQais Yousef *
142d9cb236bSQais Yousef * Return: (int)bool - CPUs were found
143d9cb236bSQais Yousef */
cpupri_find_fitness(struct cpupri * cp,struct task_struct * p,struct cpumask * lowest_mask,bool (* fitness_fn)(struct task_struct * p,int cpu))144a1bd02e1SQais Yousef int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
145d9cb236bSQais Yousef struct cpumask *lowest_mask,
146d9cb236bSQais Yousef bool (*fitness_fn)(struct task_struct *p, int cpu))
147d9cb236bSQais Yousef {
148d9cb236bSQais Yousef int task_pri = convert_prio(p->prio);
149e94f80f6SQais Yousef int idx, cpu;
150d9cb236bSQais Yousef
15109348d75SIngo Molnar WARN_ON_ONCE(task_pri >= CPUPRI_NR_PRIORITIES);
152d9cb236bSQais Yousef
153d9cb236bSQais Yousef for (idx = 0; idx < task_pri; idx++) {
154d9cb236bSQais Yousef
155d9cb236bSQais Yousef if (!__cpupri_find(cp, p, lowest_mask, idx))
156804d402fSQais Yousef continue;
157804d402fSQais Yousef
158d9cb236bSQais Yousef if (!lowest_mask || !fitness_fn)
159804d402fSQais Yousef return 1;
160804d402fSQais Yousef
161804d402fSQais Yousef /* Ensure the capacity of the CPUs fit the task */
162804d402fSQais Yousef for_each_cpu(cpu, lowest_mask) {
163804d402fSQais Yousef if (!fitness_fn(p, cpu))
164804d402fSQais Yousef cpumask_clear_cpu(cpu, lowest_mask);
165804d402fSQais Yousef }
166804d402fSQais Yousef
167804d402fSQais Yousef /*
168804d402fSQais Yousef * If no CPU at the current priority can fit the task
169804d402fSQais Yousef * continue looking
170804d402fSQais Yousef */
171e94f80f6SQais Yousef if (cpumask_empty(lowest_mask))
172391e43daSPeter Zijlstra continue;
173391e43daSPeter Zijlstra
174391e43daSPeter Zijlstra return 1;
175391e43daSPeter Zijlstra }
176391e43daSPeter Zijlstra
177d9cb236bSQais Yousef /*
178e94f80f6SQais Yousef * If we failed to find a fitting lowest_mask, kick off a new search
179e94f80f6SQais Yousef * but without taking into account any fitness criteria this time.
180d9cb236bSQais Yousef *
181d9cb236bSQais Yousef * This rule favours honouring priority over fitting the task in the
182d9cb236bSQais Yousef * correct CPU (Capacity Awareness being the only user now).
183d9cb236bSQais Yousef * The idea is that if a higher priority task can run, then it should
184d9cb236bSQais Yousef * run even if this ends up being on unfitting CPU.
185d9cb236bSQais Yousef *
186d9cb236bSQais Yousef * The cost of this trade-off is not entirely clear and will probably
187d9cb236bSQais Yousef * be good for some workloads and bad for others.
188d9cb236bSQais Yousef *
1893b03706fSIngo Molnar * The main idea here is that if some CPUs were over-committed, we try
190d9cb236bSQais Yousef * to spread which is what the scheduler traditionally did. Sys admins
191d9cb236bSQais Yousef * must do proper RT planning to avoid overloading the system if they
192d9cb236bSQais Yousef * really care.
193d9cb236bSQais Yousef */
194e94f80f6SQais Yousef if (fitness_fn)
195e94f80f6SQais Yousef return cpupri_find(cp, p, lowest_mask);
196d9cb236bSQais Yousef
197391e43daSPeter Zijlstra return 0;
198391e43daSPeter Zijlstra }
199391e43daSPeter Zijlstra
200391e43daSPeter Zijlstra /**
20197fb7a0aSIngo Molnar * cpupri_set - update the CPU priority setting
202391e43daSPeter Zijlstra * @cp: The cpupri context
20397fb7a0aSIngo Molnar * @cpu: The target CPU
204b13772f8SPeter Zijlstra * @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU
205391e43daSPeter Zijlstra *
206391e43daSPeter Zijlstra * Note: Assumes cpu_rq(cpu)->lock is locked
207391e43daSPeter Zijlstra *
208391e43daSPeter Zijlstra * Returns: (void)
209391e43daSPeter Zijlstra */
cpupri_set(struct cpupri * cp,int cpu,int newpri)210391e43daSPeter Zijlstra void cpupri_set(struct cpupri *cp, int cpu, int newpri)
211391e43daSPeter Zijlstra {
212391e43daSPeter Zijlstra int *currpri = &cp->cpu_to_pri[cpu];
213391e43daSPeter Zijlstra int oldpri = *currpri;
214391e43daSPeter Zijlstra int do_mb = 0;
215391e43daSPeter Zijlstra
216391e43daSPeter Zijlstra newpri = convert_prio(newpri);
217391e43daSPeter Zijlstra
218391e43daSPeter Zijlstra BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
219391e43daSPeter Zijlstra
220391e43daSPeter Zijlstra if (newpri == oldpri)
221391e43daSPeter Zijlstra return;
222391e43daSPeter Zijlstra
223391e43daSPeter Zijlstra /*
22497fb7a0aSIngo Molnar * If the CPU was currently mapped to a different value, we
225391e43daSPeter Zijlstra * need to map it to the new value then remove the old value.
226391e43daSPeter Zijlstra * Note, we must add the new value first, otherwise we risk the
227391e43daSPeter Zijlstra * cpu being missed by the priority loop in cpupri_find.
228391e43daSPeter Zijlstra */
229391e43daSPeter Zijlstra if (likely(newpri != CPUPRI_INVALID)) {
230391e43daSPeter Zijlstra struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
231391e43daSPeter Zijlstra
232391e43daSPeter Zijlstra cpumask_set_cpu(cpu, vec->mask);
233391e43daSPeter Zijlstra /*
234391e43daSPeter Zijlstra * When adding a new vector, we update the mask first,
235391e43daSPeter Zijlstra * do a write memory barrier, and then update the count, to
236391e43daSPeter Zijlstra * make sure the vector is visible when count is set.
237391e43daSPeter Zijlstra */
2384e857c58SPeter Zijlstra smp_mb__before_atomic();
239391e43daSPeter Zijlstra atomic_inc(&(vec)->count);
240391e43daSPeter Zijlstra do_mb = 1;
241391e43daSPeter Zijlstra }
242391e43daSPeter Zijlstra if (likely(oldpri != CPUPRI_INVALID)) {
243391e43daSPeter Zijlstra struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
244391e43daSPeter Zijlstra
245391e43daSPeter Zijlstra /*
246391e43daSPeter Zijlstra * Because the order of modification of the vec->count
247391e43daSPeter Zijlstra * is important, we must make sure that the update
248391e43daSPeter Zijlstra * of the new prio is seen before we decrement the
249391e43daSPeter Zijlstra * old prio. This makes sure that the loop sees
250391e43daSPeter Zijlstra * one or the other when we raise the priority of
251391e43daSPeter Zijlstra * the run queue. We don't care about when we lower the
252391e43daSPeter Zijlstra * priority, as that will trigger an rt pull anyway.
253391e43daSPeter Zijlstra *
254391e43daSPeter Zijlstra * We only need to do a memory barrier if we updated
255391e43daSPeter Zijlstra * the new priority vec.
256391e43daSPeter Zijlstra */
257391e43daSPeter Zijlstra if (do_mb)
2584e857c58SPeter Zijlstra smp_mb__after_atomic();
259391e43daSPeter Zijlstra
260391e43daSPeter Zijlstra /*
261391e43daSPeter Zijlstra * When removing from the vector, we decrement the counter first
262391e43daSPeter Zijlstra * do a memory barrier and then clear the mask.
263391e43daSPeter Zijlstra */
264391e43daSPeter Zijlstra atomic_dec(&(vec)->count);
2654e857c58SPeter Zijlstra smp_mb__after_atomic();
266391e43daSPeter Zijlstra cpumask_clear_cpu(cpu, vec->mask);
267391e43daSPeter Zijlstra }
268391e43daSPeter Zijlstra
269391e43daSPeter Zijlstra *currpri = newpri;
270391e43daSPeter Zijlstra }
271391e43daSPeter Zijlstra
272391e43daSPeter Zijlstra /**
273391e43daSPeter Zijlstra * cpupri_init - initialize the cpupri structure
274391e43daSPeter Zijlstra * @cp: The cpupri context
275391e43daSPeter Zijlstra *
276e69f6186SYacine Belkadi * Return: -ENOMEM on memory allocation failure.
277391e43daSPeter Zijlstra */
cpupri_init(struct cpupri * cp)278391e43daSPeter Zijlstra int cpupri_init(struct cpupri *cp)
279391e43daSPeter Zijlstra {
280391e43daSPeter Zijlstra int i;
281391e43daSPeter Zijlstra
282391e43daSPeter Zijlstra for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
283391e43daSPeter Zijlstra struct cpupri_vec *vec = &cp->pri_to_cpu[i];
284391e43daSPeter Zijlstra
285391e43daSPeter Zijlstra atomic_set(&vec->count, 0);
286391e43daSPeter Zijlstra if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
287391e43daSPeter Zijlstra goto cleanup;
288391e43daSPeter Zijlstra }
289391e43daSPeter Zijlstra
2904dac0b63SPeter Zijlstra cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
2914dac0b63SPeter Zijlstra if (!cp->cpu_to_pri)
2924dac0b63SPeter Zijlstra goto cleanup;
2934dac0b63SPeter Zijlstra
294391e43daSPeter Zijlstra for_each_possible_cpu(i)
295391e43daSPeter Zijlstra cp->cpu_to_pri[i] = CPUPRI_INVALID;
2964dac0b63SPeter Zijlstra
297391e43daSPeter Zijlstra return 0;
298391e43daSPeter Zijlstra
299391e43daSPeter Zijlstra cleanup:
300391e43daSPeter Zijlstra for (i--; i >= 0; i--)
301391e43daSPeter Zijlstra free_cpumask_var(cp->pri_to_cpu[i].mask);
302391e43daSPeter Zijlstra return -ENOMEM;
303391e43daSPeter Zijlstra }
304391e43daSPeter Zijlstra
305391e43daSPeter Zijlstra /**
306391e43daSPeter Zijlstra * cpupri_cleanup - clean up the cpupri structure
307391e43daSPeter Zijlstra * @cp: The cpupri context
308391e43daSPeter Zijlstra */
cpupri_cleanup(struct cpupri * cp)309391e43daSPeter Zijlstra void cpupri_cleanup(struct cpupri *cp)
310391e43daSPeter Zijlstra {
311391e43daSPeter Zijlstra int i;
312391e43daSPeter Zijlstra
3134dac0b63SPeter Zijlstra kfree(cp->cpu_to_pri);
314391e43daSPeter Zijlstra for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
315391e43daSPeter Zijlstra free_cpumask_var(cp->pri_to_cpu[i].mask);
316391e43daSPeter Zijlstra }
317