xref: /linux/kernel/sched/syscalls.c (revision b1b9a9d0b5c875decbc129c16c6e827fb50489a5)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  kernel/sched/syscalls.c
4  *
5  *  Core kernel scheduler syscalls related code
6  *
7  *  Copyright (C) 1991-2002  Linus Torvalds
8  *  Copyright (C) 1998-2024  Ingo Molnar, Red Hat
9  */
10 #include <linux/sched.h>
11 #include <linux/cpuset.h>
12 #include <linux/sched/debug.h>
13 
14 #include <uapi/linux/sched/types.h>
15 
16 #include "sched.h"
17 #include "autogroup.h"
18 
__normal_prio(int policy,int rt_prio,int nice)19 static inline int __normal_prio(int policy, int rt_prio, int nice)
20 {
21 	int prio;
22 
23 	if (dl_policy(policy))
24 		prio = MAX_DL_PRIO - 1;
25 	else if (rt_policy(policy))
26 		prio = MAX_RT_PRIO - 1 - rt_prio;
27 	else
28 		prio = NICE_TO_PRIO(nice);
29 
30 	return prio;
31 }
32 
33 /*
34  * Calculate the expected normal priority: i.e. priority
35  * without taking RT-inheritance into account. Might be
36  * boosted by interactivity modifiers. Changes upon fork,
37  * setprio syscalls, and whenever the interactivity
38  * estimator recalculates.
39  */
static inline int normal_prio(struct task_struct *p)
{
	/* Recompute the un-boosted priority from the task's current policy. */
	return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}
44 
45 /*
46  * Calculate the current priority, i.e. the priority
47  * taken into account by the scheduler. This value might
48  * be boosted by RT tasks, or might be boosted by
49  * interactivity modifiers. Will be RT if the task got
50  * RT-boosted. If not then it returns p->normal_prio.
51  */
effective_prio(struct task_struct * p)52 static int effective_prio(struct task_struct *p)
53 {
54 	p->normal_prio = normal_prio(p);
55 	/*
56 	 * If we are RT tasks or we were boosted to RT priority,
57 	 * keep the priority unchanged. Otherwise, update priority
58 	 * to the normal priority:
59 	 */
60 	if (!rt_or_dl_prio(p->prio))
61 		return p->normal_prio;
62 	return p->prio;
63 }
64 
/*
 * set_user_nice - apply a new nice value to @p.
 *
 * Silently ignores out-of-range values and no-op changes. For RT/DL tasks
 * only the stored static_prio is updated; for fair tasks the task is
 * dequeued, re-weighted and re-enqueued under the rq lock.
 */
void set_user_nice(struct task_struct *p, long nice)
{
	int old_prio;

	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
		return;
	/*
	 * We have to be careful, if called from sys_setpriority(),
	 * the task might be in the middle of scheduling on another CPU.
	 */
	guard(task_rq_lock)(p);

	/*
	 * The RT priorities are set via sched_setscheduler(), but we still
	 * allow the 'normal' nice value to be set - but as expected
	 * it won't have any effect on scheduling until the task is
	 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
	 */
	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
		p->static_prio = NICE_TO_PRIO(nice);
		return;
	}

	/* Dequeue, update weight and priority, then re-enqueue on scope exit. */
	scoped_guard (sched_change, p, DEQUEUE_SAVE) {
		p->static_prio = NICE_TO_PRIO(nice);
		set_load_weight(p, true);
		/* NOTE(review): old_prio looks unused after this scope — confirm. */
		old_prio = p->prio;
		p->prio = effective_prio(p);
	}
}
EXPORT_SYMBOL(set_user_nice);
96 
97 /*
98  * is_nice_reduction - check if nice value is an actual reduction
99  *
100  * Similar to can_nice() but does not perform a capability check.
101  *
102  * @p: task
103  * @nice: nice value
104  */
is_nice_reduction(const struct task_struct * p,const int nice)105 static bool is_nice_reduction(const struct task_struct *p, const int nice)
106 {
107 	/* Convert nice value [19,-20] to rlimit style value [1,40]: */
108 	int nice_rlim = nice_to_rlimit(nice);
109 
110 	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
111 }
112 
113 /*
114  * can_nice - check if a task can reduce its nice value
115  * @p: task
116  * @nice: nice value
117  */
int can_nice(const struct task_struct *p, const int nice)
{
	/* Short-circuit: capable() is only consulted when the rlimit check fails. */
	return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
}
122 
123 #ifdef __ARCH_WANT_SYS_NICE
124 
125 /*
126  * sys_nice - change the priority of the current process.
127  * @increment: priority increment
128  *
129  * sys_setpriority is a more generic, but much slower function that
130  * does similar things.
131  */
SYSCALL_DEFINE1(nice, int, increment)
{
	long nice, retval;

	/*
	 * Setpriority might change our priority at the same moment.
	 * We don't have to worry. Conceptually one call occurs first
	 * and we have a single winner.
	 */
	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
	nice = task_nice(current) + increment;

	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
	/*
	 * A negative increment raises priority (lower nice value), which
	 * needs the can_nice() rlimit/capability check.
	 */
	if (increment < 0 && !can_nice(current, nice))
		return -EPERM;

	/* Let the security module veto the change. */
	retval = security_task_setnice(current, nice);
	if (retval)
		return retval;

	set_user_nice(current, nice);
	return 0;
}
155 
156 #endif /* __ARCH_WANT_SYS_NICE */
157 
158 /**
159  * task_prio - return the priority value of a given task.
160  * @p: the task in question.
161  *
162  * Return: The priority value as seen by users in /proc.
163  *
164  * sched policy         return value   kernel prio    user prio/nice
165  *
166  * normal, batch, idle     [0 ... 39]  [100 ... 139]          0/[-20 ... 19]
167  * fifo, rr             [-2 ... -100]     [98 ... 0]  [1 ... 99]
168  * deadline                     -101             -1           0
169  */
int task_prio(const struct task_struct *p)
{
	/* Shift the kernel prio scale so user space sees 0..39 (negative for RT/DL). */
	return p->prio - MAX_RT_PRIO;
}
174 
175 /**
176  * idle_cpu - is a given CPU idle currently?
177  * @cpu: the processor in question.
178  *
179  * Return: 1 if the CPU is currently idle. 0 otherwise.
180  */
int idle_cpu(int cpu)
{
	/* Delegate to the runqueue-level idle check. */
	return idle_rq(cpu_rq(cpu));
}
185 
186 /**
187  * idle_task - return the idle task for a given CPU.
188  * @cpu: the processor in question.
189  *
190  * Return: The idle task for the CPU @cpu.
191  */
struct task_struct *idle_task(int cpu)
{
	/* Every runqueue owns a dedicated idle task. */
	return cpu_rq(cpu)->idle;
}
196 
197 #ifdef CONFIG_SCHED_CORE
sched_core_idle_cpu(int cpu)198 int sched_core_idle_cpu(int cpu)
199 {
200 	struct rq *rq = cpu_rq(cpu);
201 
202 	if (sched_core_enabled(rq) && rq->curr == rq->idle)
203 		return 1;
204 
205 	return idle_cpu(cpu);
206 }
207 #endif /* CONFIG_SCHED_CORE */
208 
209 /**
210  * find_process_by_pid - find a process with a matching PID value.
211  * @pid: the pid in question.
212  *
213  * The task of @pid, if found. %NULL otherwise.
214  */
static struct task_struct *find_process_by_pid(pid_t pid)
{
	/* PID 0 is shorthand for the calling task. */
	if (!pid)
		return current;

	return find_task_by_vpid(pid);
}
219 
/*
 * Look up @pid and take a reference on the result; the caller must drop it
 * with put_task_struct(). Returns NULL if no such task exists.
 */
static struct task_struct *find_get_task(pid_t pid)
{
	struct task_struct *p;
	/* RCU keeps the task alive between lookup and get_task_struct(). */
	guard(rcu)();

	p = find_process_by_pid(pid);
	if (likely(p))
		get_task_struct(p);

	return p;
}
231 
/*
 * Scope-based task reference: CLASS(find_get_task, p)(pid) looks up @pid
 * with a reference held and drops it automatically when @p leaves scope.
 */
DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
	     find_get_task(pid), pid_t pid)
234 
235 /*
236  * sched_setparam() passes in -1 for its policy, to let the functions
237  * it calls know not to change it.
238  */
239 #define SETPARAM_POLICY	-1
240 
/*
 * Apply the policy and class-specific parameters from @attr to @p.
 * SETPARAM_POLICY means "keep the current policy". Caller holds the
 * locks required to modify p->policy (see __sched_setscheduler()).
 */
static void __setscheduler_params(struct task_struct *p,
		const struct sched_attr *attr)
{
	int policy = attr->sched_policy;

	if (policy == SETPARAM_POLICY)
		policy = p->policy;

	p->policy = policy;

	if (dl_policy(policy))
		__setparam_dl(p, attr);
	else if (fair_policy(policy))
		__setparam_fair(p, attr);

	/* rt-policy tasks do not have a timerslack */
	if (rt_or_dl_task_policy(p)) {
		p->timer_slack_ns = 0;
	} else if (p->timer_slack_ns == 0) {
		/* when switching back to non-rt policy, restore timerslack */
		p->timer_slack_ns = p->default_timer_slack_ns;
	}

	/*
	 * __sched_setscheduler() ensures attr->sched_priority == 0 when
	 * !rt_policy. Always setting this ensures that things like
	 * getparam()/getattr() don't report silly values for !rt tasks.
	 */
	p->rt_priority = attr->sched_priority;
	p->normal_prio = normal_prio(p);
	set_load_weight(p, true);
}
273 
274 /*
275  * Check the target process has a UID that matches the current process's:
276  */
check_same_owner(struct task_struct * p)277 static bool check_same_owner(struct task_struct *p)
278 {
279 	const struct cred *cred = current_cred(), *pcred;
280 	guard(rcu)();
281 
282 	pcred = __task_cred(p);
283 	return (uid_eq(cred->euid, pcred->euid) ||
284 		uid_eq(cred->euid, pcred->uid));
285 }
286 
287 #ifdef CONFIG_RT_MUTEXES
static inline void __setscheduler_dl_pi(int newprio, int policy,
			      struct task_struct *p,
			      struct sched_change_ctx *scope)
{
	/*
	 * In case a DEADLINE task (either proper or boosted) gets
	 * setscheduled to a lower priority class, check if it needs to
	 * inherit parameters from a potential pi_task. In that case make
	 * sure replenishment happens with the next enqueue.
	 */

	if (dl_prio(newprio) && !dl_policy(policy)) {
		struct task_struct *pi_task = rt_mutex_get_top_task(p);

		if (pi_task) {
			/* Inherit the boosting task's DL parameters. */
			p->dl.pi_se = pi_task->dl.pi_se;
			scope->flags |= ENQUEUE_REPLENISH;
		}
	}
}
308 #else /* !CONFIG_RT_MUTEXES */
static inline void __setscheduler_dl_pi(int newprio, int policy,
			      struct task_struct *p,
			      struct sched_change_ctx *scope)
{
	/* Without RT mutexes there is no PI boosting to account for. */
}
314 #endif /* !CONFIG_RT_MUTEXES */
315 
316 #ifdef CONFIG_UCLAMP_TASK
317 
/*
 * Validate the requested utilization clamps in @attr against
 * [0, SCHED_CAPACITY_SCALE], treating -1 as the "reset" sentinel.
 * Returns 0 if acceptable, -EINVAL otherwise.
 */
static int uclamp_validate(struct task_struct *p,
			   const struct sched_attr *attr)
{
	int util_min = p->uclamp_req[UCLAMP_MIN].value;
	int util_max = p->uclamp_req[UCLAMP_MAX].value;

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
		util_min = attr->sched_util_min;

		/* The '+ 1' lets -1 (reset) pass while rejecting > SCHED_CAPACITY_SCALE. */
		if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
			return -EINVAL;
	}

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
		util_max = attr->sched_util_max;

		if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
			return -EINVAL;
	}

	/* min must not exceed max unless either one is being reset. */
	if (util_min != -1 && util_max != -1 && util_min > util_max)
		return -EINVAL;

	/*
	 * We have valid uclamp attributes; make sure uclamp is enabled.
	 *
	 * We need to do that here, because enabling static branches is a
	 * blocking operation which obviously cannot be done while holding
	 * scheduler locks.
	 */
	sched_uclamp_enable();

	return 0;
}
352 
uclamp_reset(const struct sched_attr * attr,enum uclamp_id clamp_id,struct uclamp_se * uc_se)353 static bool uclamp_reset(const struct sched_attr *attr,
354 			 enum uclamp_id clamp_id,
355 			 struct uclamp_se *uc_se)
356 {
357 	/* Reset on sched class change for a non user-defined clamp value. */
358 	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
359 	    !uc_se->user_defined)
360 		return true;
361 
362 	/* Reset on sched_util_{min,max} == -1. */
363 	if (clamp_id == UCLAMP_MIN &&
364 	    attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
365 	    attr->sched_util_min == -1) {
366 		return true;
367 	}
368 
369 	if (clamp_id == UCLAMP_MAX &&
370 	    attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
371 	    attr->sched_util_max == -1) {
372 		return true;
373 	}
374 
375 	return false;
376 }
377 
/*
 * Apply the uclamp changes requested via @attr: first reset any clamp
 * that uclamp_reset() selects, then install explicit user-defined values.
 */
static void __setscheduler_uclamp(struct task_struct *p,
				  const struct sched_attr *attr)
{
	enum uclamp_id clamp_id;

	for_each_clamp_id(clamp_id) {
		struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
		unsigned int value;

		if (!uclamp_reset(attr, clamp_id, uc_se))
			continue;

		/*
		 * RT by default have a 100% boost value that could be modified
		 * at runtime.
		 */
		if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
			value = sysctl_sched_uclamp_util_min_rt_default;
		else
			value = uclamp_none(clamp_id);

		/* Reset to the default: not user-defined. */
		uclamp_se_set(uc_se, value, false);

	}

	/* Nothing further to do unless explicit clamp values were passed. */
	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
		return;

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
	    attr->sched_util_min != -1) {
		uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
			      attr->sched_util_min, true);
	}

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
	    attr->sched_util_max != -1) {
		uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
			      attr->sched_util_max, true);
	}
}
418 
419 #else /* !CONFIG_UCLAMP_TASK: */
420 
static inline int uclamp_validate(struct task_struct *p,
				  const struct sched_attr *attr)
{
	/* Utilization clamping not compiled in: reject any uclamp request. */
	return -EOPNOTSUPP;
}
/* No-op when utilization clamping is not compiled in. */
static void __setscheduler_uclamp(struct task_struct *p,
				  const struct sched_attr *attr) { }
428 #endif /* !CONFIG_UCLAMP_TASK */
429 
430 /*
431  * Allow unprivileged RT tasks to decrease priority.
432  * Only issue a capable test if needed and only once to avoid an audit
433  * event on permitted non-privileged operations:
434  */
static int user_check_sched_setscheduler(struct task_struct *p,
					 const struct sched_attr *attr,
					 int policy, int reset_on_fork)
{
	/* Fair tasks: raising priority (lowering nice) is rlimit-gated. */
	if (fair_policy(policy)) {
		if (attr->sched_nice < task_nice(p) &&
		    !is_nice_reduction(p, attr->sched_nice))
			goto req_priv;
	}

	if (rt_policy(policy)) {
		unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);

		/* Can't set/change the rt policy: */
		if (policy != p->policy && !rlim_rtprio)
			goto req_priv;

		/* Can't increase priority: */
		if (attr->sched_priority > p->rt_priority &&
		    attr->sched_priority > rlim_rtprio)
			goto req_priv;
	}

	/*
	 * Can't set/change SCHED_DEADLINE policy at all for now
	 * (safest behavior); in the future we would like to allow
	 * unprivileged DL tasks to increase their relative deadline
	 * or reduce their runtime (both ways reducing utilization)
	 */
	if (dl_policy(policy))
		goto req_priv;

	/*
	 * Treat SCHED_IDLE as nice 20. Only allow a switch to
	 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
	 */
	if (task_has_idle_policy(p) && !idle_policy(policy)) {
		if (!is_nice_reduction(p, task_nice(p)))
			goto req_priv;
	}

	/* Can't change other user's priorities: */
	if (!check_same_owner(p))
		goto req_priv;

	/* Normal users shall not reset the sched_reset_on_fork flag: */
	if (p->sched_reset_on_fork && !reset_on_fork)
		goto req_priv;

	return 0;

req_priv:
	/* Single capability check at the end avoids spurious audit events. */
	if (!capable(CAP_SYS_NICE))
		return -EPERM;

	return 0;
}
492 
/*
 * Core policy/priority change: validate @attr, take the rq (and, for
 * SCHED_DEADLINE, cpuset) locks, then switch @p to the new scheduling
 * class and parameters.
 *
 * @user: perform permission/security checks for a user-space caller.
 * @pi:   honour PI boosting; requires interrupts enabled.
 *
 * Returns 0 on success or a negative error code.
 */
int __sched_setscheduler(struct task_struct *p,
			 const struct sched_attr *attr,
			 bool user, bool pi)
{
	int oldpolicy = -1, policy = attr->sched_policy;
	int retval, oldprio, newprio;
	const struct sched_class *prev_class, *next_class;
	struct balance_callback *head;
	struct rq_flags rf;
	int reset_on_fork;
	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
	struct rq *rq;
	bool cpuset_locked = false;

	/* The pi code expects interrupts enabled */
	BUG_ON(pi && in_interrupt());
recheck:
	/* Double check policy once rq lock held: */
	if (policy < 0) {
		/* SETPARAM_POLICY: keep the current policy and flags. */
		reset_on_fork = p->sched_reset_on_fork;
		policy = oldpolicy = p->policy;
	} else {
		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);

		if (!valid_policy(policy))
			return -EINVAL;
	}

	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
		return -EINVAL;

	/*
	 * Valid priorities for SCHED_FIFO and SCHED_RR are
	 * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
	 * SCHED_BATCH and SCHED_IDLE is 0.
	 */
	if (attr->sched_priority > MAX_RT_PRIO-1)
		return -EINVAL;
	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
	    (rt_policy(policy) != (attr->sched_priority != 0)))
		return -EINVAL;

	if (user) {
		retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
		if (retval)
			return retval;

		/* SUGOV is kernel-internal; never allowed from user space. */
		if (attr->sched_flags & SCHED_FLAG_SUGOV)
			return -EINVAL;

		retval = security_task_setscheduler(p);
		if (retval)
			return retval;
	}

	/* Update task specific "requested" clamps */
	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
		retval = uclamp_validate(p, attr);
		if (retval)
			return retval;
	}

	/*
	 * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
	 * information.
	 */
	if (dl_policy(policy) || dl_policy(p->policy)) {
		cpuset_locked = true;
		cpuset_lock();
	}

	/*
	 * Make sure no PI-waiters arrive (or leave) while we are
	 * changing the priority of the task:
	 *
	 * To be able to change p->policy safely, the appropriate
	 * runqueue lock must be held.
	 */
	rq = task_rq_lock(p, &rf);
	update_rq_clock(rq);

	/*
	 * Changing the policy of the stop threads is a very bad idea:
	 */
	if (p == rq->stop) {
		retval = -EINVAL;
		goto unlock;
	}

	retval = scx_check_setscheduler(p, policy);
	if (retval)
		goto unlock;

	/*
	 * If not changing anything there's no need to proceed further,
	 * but store a possible modification of reset_on_fork.
	 */
	if (unlikely(policy == p->policy)) {
		if (fair_policy(policy) &&
		    (attr->sched_nice != task_nice(p) ||
		     (attr->sched_runtime != p->se.slice)))
			goto change;
		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
			goto change;
		if (dl_policy(policy) && dl_param_changed(p, attr))
			goto change;
		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
			goto change;

		p->sched_reset_on_fork = reset_on_fork;
		retval = 0;
		goto unlock;
	}
change:

	if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
		/*
		 * Do not allow real-time tasks into groups that have no runtime
		 * assigned.
		 */
		if (rt_group_sched_enabled() &&
				rt_bandwidth_enabled() && rt_policy(policy) &&
				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
				!task_group_is_autogroup(task_group(p))) {
			retval = -EPERM;
			goto unlock;
		}
#endif /* CONFIG_RT_GROUP_SCHED */
		if (dl_bandwidth_enabled() && dl_policy(policy) &&
				!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
			cpumask_t *span = rq->rd->span;

			/*
			 * Don't allow tasks with an affinity mask smaller than
			 * the entire root_domain to become SCHED_DEADLINE. We
			 * will also fail if there's no bandwidth available.
			 */
			if (!cpumask_subset(span, p->cpus_ptr) ||
			    rq->rd->dl_bw.bw == 0) {
				retval = -EPERM;
				goto unlock;
			}
		}
	}

	/* Re-check policy now with rq lock held: */
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
		/* Policy changed under us: drop all locks and start over. */
		policy = oldpolicy = -1;
		task_rq_unlock(rq, p, &rf);
		if (cpuset_locked)
			cpuset_unlock();
		goto recheck;
	}

	/*
	 * If setscheduling to SCHED_DEADLINE (or changing the parameters
	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
	 * is available.
	 */
	if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
		retval = -EBUSY;
		goto unlock;
	}

	p->sched_reset_on_fork = reset_on_fork;
	oldprio = p->prio;

	newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
	if (pi) {
		/*
		 * Take priority boosted tasks into account. If the new
		 * effective priority is unchanged, we just store the new
		 * normal parameters and do not touch the scheduler class and
		 * the runqueue. This will be done when the task deboost
		 * itself.
		 */
		newprio = rt_effective_prio(p, newprio);
		if (newprio == oldprio && !dl_prio(newprio))
			queue_flags &= ~DEQUEUE_MOVE;
	}

	prev_class = p->sched_class;
	next_class = __setscheduler_class(policy, newprio);

	if (prev_class != next_class)
		queue_flags |= DEQUEUE_CLASS;

	/* Dequeue, apply the change, re-enqueue on scope exit. */
	scoped_guard (sched_change, p, queue_flags) {

		if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
			__setscheduler_params(p, attr);
			p->sched_class = next_class;
			p->prio = newprio;
			__setscheduler_dl_pi(newprio, policy, p, scope);
		}
		__setscheduler_uclamp(p, attr);

		if (scope->queued) {
			/*
			 * We enqueue to tail when the priority of a task is
			 * increased (user space view).
			 */
			if (oldprio < p->prio)
				scope->flags |= ENQUEUE_HEAD;
		}
	}

	/* Avoid rq from going away on us: */
	preempt_disable();
	head = splice_balance_callbacks(rq);
	task_rq_unlock(rq, p, &rf);

	if (pi) {
		if (cpuset_locked)
			cpuset_unlock();
		rt_mutex_adjust_pi(p);
	}

	/* Run balance callbacks after we've adjusted the PI chain: */
	balance_callbacks(rq, head);
	preempt_enable();

	return 0;

unlock:
	task_rq_unlock(rq, p, &rf);
	if (cpuset_locked)
		cpuset_unlock();
	return retval;
}
724 
/*
 * Legacy sched_param front end: build a sched_attr from @policy/@param
 * (handling the SCHED_RESET_ON_FORK bit OR-ed into @policy) and hand it
 * to __sched_setscheduler(). @check selects user-space permission checks.
 */
static int _sched_setscheduler(struct task_struct *p, int policy,
			       const struct sched_param *param, bool check)
{
	struct sched_attr attr = {
		.sched_policy   = policy,
		.sched_priority = param->sched_priority,
		.sched_nice	= PRIO_TO_NICE(p->static_prio),
	};

	/* Preserve a custom fair slice across the policy change. */
	if (p->se.custom_slice)
		attr.sched_runtime = p->se.slice;

	/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
	if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
		policy &= ~SCHED_RESET_ON_FORK;
		attr.sched_policy = policy;
	}

	return __sched_setscheduler(p, &attr, check, true);
}
746 /**
747  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
748  * @p: the task in question.
749  * @policy: new policy.
750  * @param: structure containing the new RT priority.
751  *
752  * Use sched_set_fifo(), read its comment.
753  *
754  * Return: 0 on success. An error code otherwise.
755  *
756  * NOTE that the task may be already dead.
757  */
int sched_setscheduler(struct task_struct *p, int policy,
		       const struct sched_param *param)
{
	/* check=true: full user-space permission checks apply. */
	return _sched_setscheduler(p, policy, param, true);
}
763 
int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
	/* user=true, pi=true: checked path honouring PI boosting. */
	return __sched_setscheduler(p, attr, true, true);
}
768 
/* Kernel-internal variant of sched_setattr(): skips permission checks. */
int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
	return __sched_setscheduler(p, attr, false, true);
}
EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
774 
775 /**
776  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernel-space.
777  * @p: the task in question.
778  * @policy: new policy.
779  * @param: structure containing the new RT priority.
780  *
781  * Just like sched_setscheduler, only don't bother checking if the
782  * current context has permission.  For example, this is needed in
783  * stop_machine(): we create temporary high priority worker threads,
784  * but our caller might not have that capability.
785  *
786  * Return: 0 on success. An error code otherwise.
787  */
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
			       const struct sched_param *param)
{
	/* check=false: trusted kernel caller, no permission checks. */
	return _sched_setscheduler(p, policy, param, false);
}
793 
794 /*
795  * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
796  * incapable of resource management, which is the one thing an OS really should
797  * be doing.
798  *
799  * This is of course the reason it is limited to privileged users only.
800  *
801  * Worse still; it is fundamentally impossible to compose static priority
802  * workloads. You cannot take two correctly working static prio workloads
803  * and smash them together and still expect them to work.
804  *
805  * For this reason 'all' FIFO tasks the kernel creates are basically at:
806  *
807  *   MAX_RT_PRIO / 2
808  *
809  * The administrator _MUST_ configure the system, the kernel simply doesn't
810  * know enough information to make a sensible choice.
811  */
void sched_set_fifo(struct task_struct *p)
{
	/* Mid-range FIFO priority; see the policy comment above. */
	struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_fifo);
818 
819 /*
820  * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
821  */
void sched_set_fifo_low(struct task_struct *p)
{
	/* Lowest RT priority: above SCHED_NORMAL, below every other FIFO task. */
	struct sched_param sp = { .sched_priority = 1 };
	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_fifo_low);
828 
829 /*
830  * Used when the primary interrupt handler is forced into a thread, in addition
831  * to the (always threaded) secondary handler.  The secondary handler gets a
832  * slightly lower priority so that the primary handler can preempt it, thereby
833  * emulating the behavior of a non-PREEMPT_RT system where the primary handler
834  * runs in hard interrupt context.
835  */
void sched_set_fifo_secondary(struct task_struct *p)
{
	/* One step below sched_set_fifo() so the primary handler can preempt. */
	struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 - 1 };
	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}
841 
/* Kernel-internal helper: switch @p to SCHED_NORMAL with the given nice. */
void sched_set_normal(struct task_struct *p, int nice)
{
	struct sched_attr attr = {
		.sched_policy = SCHED_NORMAL,
		.sched_nice = nice,
	};
	WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_normal);
851 
/*
 * Common backend for sys_sched_setscheduler()/sys_sched_setparam():
 * copy the sched_param from user space, resolve @pid (with the reference
 * dropped automatically via CLASS), and apply the change.
 */
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
	struct sched_param lparam;

	if (unlikely(!param || pid < 0))
		return -EINVAL;
	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
		return -EFAULT;

	CLASS(find_get_task, p)(pid);
	if (!p)
		return -ESRCH;

	return sched_setscheduler(p, policy, &lparam);
}
868 
869 /*
870  * Mimics kernel/events/core.c perf_copy_attr().
871  */
/*
 * Copy a possibly differently-sized sched_attr from user space,
 * tolerating older (smaller) and newer (larger, zero-padded) layouts.
 * On -E2BIG the kernel's expected size is written back to @uattr->size.
 */
static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
{
	u32 size;
	int ret;

	/* Zero the full structure, so that a short copy will be nice: */
	memset(attr, 0, sizeof(*attr));

	ret = get_user(size, &uattr->size);
	if (ret)
		return ret;

	/* ABI compatibility quirk: */
	if (!size)
		size = SCHED_ATTR_SIZE_VER0;
	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
		goto err_size;

	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
	if (ret) {
		if (ret == -E2BIG)
			goto err_size;
		return ret;
	}

	/* uclamp fields only exist from the VER1 layout onwards. */
	if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
	    size < SCHED_ATTR_SIZE_VER1)
		return -EINVAL;

	/*
	 * XXX: Do we want to be lenient like existing syscalls; or do we want
	 * to be strict and return an error on out-of-bounds values?
	 */
	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);

	return 0;

err_size:
	put_user(sizeof(*attr), &uattr->size);
	return -E2BIG;
}
913 
get_params(struct task_struct * p,struct sched_attr * attr)914 static void get_params(struct task_struct *p, struct sched_attr *attr)
915 {
916 	if (task_has_dl_policy(p)) {
917 		__getparam_dl(p, attr);
918 	} else if (task_has_rt_policy(p)) {
919 		attr->sched_priority = p->rt_priority;
920 	} else {
921 		attr->sched_nice = task_nice(p);
922 		attr->sched_runtime = p->se.slice;
923 	}
924 }
925 
926 /**
927  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
928  * @pid: the pid in question.
929  * @policy: new policy.
930  * @param: structure containing the new RT priority.
931  *
932  * Return: 0 on success. An error code otherwise.
933  */
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
{
	/* Negative policies are reserved for SETPARAM_POLICY internally. */
	if (policy < 0)
		return -EINVAL;

	return do_sched_setscheduler(pid, policy, param);
}
941 
942 /**
943  * sys_sched_setparam - set/change the RT priority of a thread
944  * @pid: the pid in question.
945  * @param: structure containing the new RT priority.
946  *
947  * Return: 0 on success. An error code otherwise.
948  */
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
	/* SETPARAM_POLICY: change parameters while keeping the current policy. */
	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
}
953 
954 /**
955  * sys_sched_setattr - same as above, but with extended sched_attr
956  * @pid: the pid in question.
957  * @uattr: structure containing the extended parameters.
958  * @flags: for future extension.
959  */
SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
			       unsigned int, flags)
{
	struct sched_attr attr;
	int retval;

	if (unlikely(!uattr || pid < 0 || flags))
		return -EINVAL;

	retval = sched_copy_attr(uattr, &attr);
	if (retval)
		return retval;

	if ((int)attr.sched_policy < 0)
		return -EINVAL;
	/* KEEP_POLICY maps to the internal "don't change policy" sentinel. */
	if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
		attr.sched_policy = SETPARAM_POLICY;

	/* Scope-based reference: dropped automatically on return. */
	CLASS(find_get_task, p)(pid);
	if (!p)
		return -ESRCH;

	/* KEEP_PARAMS: start from the task's current parameters. */
	if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
		get_params(p, &attr);

	return sched_setattr(p, &attr);
}
987 
988 /**
989  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
990  * @pid: the pid in question.
991  *
992  * Return: On success, the policy of the thread. Otherwise, a negative error
993  * code.
994  */
SYSCALL_DEFINE1(sched_getscheduler,pid_t,pid)995 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
996 {
997 	struct task_struct *p;
998 	int retval;
999 
1000 	if (pid < 0)
1001 		return -EINVAL;
1002 
1003 	guard(rcu)();
1004 	p = find_process_by_pid(pid);
1005 	if (!p)
1006 		return -ESRCH;
1007 
1008 	retval = security_task_getscheduler(p);
1009 	if (!retval) {
1010 		retval = p->policy;
1011 		if (p->sched_reset_on_fork)
1012 			retval |= SCHED_RESET_ON_FORK;
1013 	}
1014 	return retval;
1015 }
1016 
1017 /**
1018  * sys_sched_getparam - get the RT priority of a thread
1019  * @pid: the pid in question.
1020  * @param: structure containing the RT priority.
1021  *
1022  * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
1023  * code.
1024  */
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
	/* Non-RT tasks report priority 0, hence the zero initializer. */
	struct sched_param lp = { .sched_priority = 0 };
	struct task_struct *p;
	int retval;

	if (unlikely(!param || pid < 0))
		return -EINVAL;

	/* Do the lookup and read under RCU; @p is not used after the scope. */
	scoped_guard (rcu) {
		p = find_process_by_pid(pid);
		if (!p)
			return -ESRCH;

		retval = security_task_getscheduler(p);
		if (retval)
			return retval;

		if (task_has_rt_policy(p))
			lp.sched_priority = p->rt_priority;
	}

	/*
	 * This one might sleep, we cannot do it with a spinlock held ...
	 */
	return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
}
1052 
1053 /**
1054  * sys_sched_getattr - similar to sched_getparam, but with sched_attr
1055  * @pid: the pid in question.
1056  * @uattr: structure containing the extended parameters.
1057  * @usize: sizeof(attr) for fwd/bwd comp.
1058  * @flags: for future extension.
1059  */
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
		unsigned int, usize, unsigned int, flags)
{
	struct sched_attr kattr = { };
	struct task_struct *p;
	int retval;

	/* @usize must hold at least a VER0 sched_attr; @flags must be zero. */
	if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE ||
		      usize < SCHED_ATTR_SIZE_VER0 || flags))
		return -EINVAL;

	scoped_guard (rcu) {
		p = find_process_by_pid(pid);
		if (!p)
			return -ESRCH;

		retval = security_task_getscheduler(p);
		if (retval)
			return retval;

		kattr.sched_policy = p->policy;
		if (p->sched_reset_on_fork)
			kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
		get_params(p, &kattr);
		/* Mask off any internal-only flag bits before exposing them. */
		kattr.sched_flags &= SCHED_FLAG_ALL;

#ifdef CONFIG_UCLAMP_TASK
		/*
		 * This could race with another potential updater, but this is fine
		 * because it'll correctly read the old or the new value. We don't need
		 * to guarantee who wins the race as long as it doesn't return garbage.
		 */
		kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
		kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
	}

	/* Tell userspace the size we filled in; copy_struct_to_user() handles
	 * both a smaller and a larger user buffer (zero-checking the tail). */
	kattr.size = min(usize, sizeof(kattr));
	return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL);
}
1100 
/*
 * Check whether changing @p's affinity to @mask is permissible for a
 * SCHED_DEADLINE task under bandwidth/admission control.
 *
 * Returns 0 if the change is allowed, -EBUSY if @mask does not cover
 * the task's whole root_domain span.
 */
int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
	/*
	 * If the task isn't a deadline task or admission control is
	 * disabled then we don't care about affinity changes.
	 */
	if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
		return 0;

	/*
	 * The special/sugov task isn't part of regular bandwidth/admission
	 * control so let userspace change affinities.
	 */
	if (dl_entity_is_special(&p->dl))
		return 0;

	/*
	 * Since bandwidth control happens on root_domain basis,
	 * if admission test is enabled, we only admit -deadline
	 * tasks allowed to run on all the CPUs in the task's
	 * root_domain.
	 */
	guard(rcu)();
	if (!cpumask_subset(task_rq(p)->rd->span, mask))
		return -EBUSY;

	return 0;
}
1129 
/*
 * Apply ctx->new_mask to @p, restricted by the task's cpuset, then
 * re-validate against the cpuset to catch a concurrent cpuset update.
 */
int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
{
	int retval;
	cpumask_var_t cpus_allowed, new_mask;

	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
		return -ENOMEM;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
		retval = -ENOMEM;
		goto out_free_cpus_allowed;
	}

	/* The effective mask is the request intersected with the cpuset. */
	cpuset_cpus_allowed(p, cpus_allowed);
	cpumask_and(new_mask, ctx->new_mask, cpus_allowed);

	ctx->new_mask = new_mask;
	ctx->flags |= SCA_CHECK;

	retval = dl_task_check_affinity(p, new_mask);
	if (retval)
		goto out_free_new_mask;

	retval = __set_cpus_allowed_ptr(p, ctx);
	if (retval)
		goto out_free_new_mask;

	/* Re-read the cpuset: it may have changed while we were working. */
	cpuset_cpus_allowed(p, cpus_allowed);
	if (!cpumask_subset(new_mask, cpus_allowed)) {
		/*
		 * We must have raced with a concurrent cpuset update.
		 * Just reset the cpumask to the cpuset's cpus_allowed.
		 */
		cpumask_copy(new_mask, cpus_allowed);

		/*
		 * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
		 * will restore the previous user_cpus_ptr value.
		 *
		 * In the unlikely event a previous user_cpus_ptr exists,
		 * we need to further restrict the mask to what is allowed
		 * by that old user_cpus_ptr.
		 */
		if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
			bool empty = !cpumask_and(new_mask, new_mask,
						  ctx->user_mask);

			if (empty)
				cpumask_copy(new_mask, cpus_allowed);
		}
		__set_cpus_allowed_ptr(p, ctx);
		retval = -EINVAL;
	}

out_free_new_mask:
	free_cpumask_var(new_mask);
out_free_cpus_allowed:
	free_cpumask_var(cpus_allowed);
	return retval;
}
1190 
/*
 * sched_setaffinity - validate permissions and apply @in_mask to task @pid.
 *
 * Returns 0 on success, or -ESRCH/-EINVAL/-EPERM/-ENOMEM on failure.
 */
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
	struct affinity_context ac;
	struct cpumask *user_mask;
	int retval;

	/* Scoped task reference: dropped automatically on any return. */
	CLASS(find_get_task, p)(pid);
	if (!p)
		return -ESRCH;

	/* Tasks flagged PF_NO_SETAFFINITY may not be re-pinned from userspace. */
	if (p->flags & PF_NO_SETAFFINITY)
		return -EINVAL;

	/* Changing another user's task requires CAP_SYS_NICE. */
	if (!check_same_owner(p)) {
		guard(rcu)();
		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
			return -EPERM;
	}

	retval = security_task_setscheduler(p);
	if (retval)
		return retval;

	/*
	 * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
	 * alloc_user_cpus_ptr() returns NULL by design - that is not an
	 * allocation failure, so only report -ENOMEM on SMP kernels.
	 */
	user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
	if (user_mask)
		cpumask_copy(user_mask, in_mask);
	else if (IS_ENABLED(CONFIG_SMP))
		return -ENOMEM;

	ac = (struct affinity_context){
		.new_mask  = in_mask,
		.user_mask = user_mask,
		.flags     = SCA_USER,
	};

	retval = __sched_setaffinity(p, &ac);
	/* __sched_setaffinity() may have swapped the mask; free what remains. */
	kfree(ac.user_mask);

	return retval;
}
1236 
static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
			     struct cpumask *new_mask)
{
	unsigned int kmask_size = cpumask_size();

	/*
	 * An oversized user buffer is truncated to the kernel mask size;
	 * an undersized one leaves the tail of @new_mask untouched, so
	 * clear it up front.
	 */
	if (len > kmask_size)
		len = kmask_size;
	else if (len < kmask_size)
		cpumask_clear(new_mask);

	if (copy_from_user(new_mask, user_mask_ptr, len))
		return -EFAULT;

	return 0;
}
1247 
1248 /**
1249  * sys_sched_setaffinity - set the CPU affinity of a process
1250  * @pid: pid of the process
1251  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
1252  * @user_mask_ptr: user-space pointer to the new CPU mask
1253  *
1254  * Return: 0 on success. An error code otherwise.
1255  */
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	cpumask_var_t new_mask;
	int retval;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	/* Copy in the user mask, truncated/zero-padded to cpumask_size(). */
	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
	if (retval == 0)
		retval = sched_setaffinity(pid, new_mask);
	free_cpumask_var(new_mask);
	return retval;
}
1271 
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
	struct task_struct *p;
	int retval;

	guard(rcu)();
	p = find_process_by_pid(pid);
	if (!p)
		return -ESRCH;

	retval = security_task_getscheduler(p);
	if (retval)
		return retval;

	/*
	 * Hold pi_lock while reading cpus_mask so a concurrent affinity
	 * change cannot race with the copy; report only active CPUs.
	 */
	guard(raw_spinlock_irqsave)(&p->pi_lock);
	cpumask_and(mask, &p->cpus_mask, cpu_active_mask);

	return 0;
}
1291 
1292 /**
1293  * sys_sched_getaffinity - get the CPU affinity of a process
1294  * @pid: pid of the process
1295  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
1296  * @user_mask_ptr: user-space pointer to hold the current CPU mask
1297  *
1298  * Return: size of CPU mask copied to user_mask_ptr on success. An
1299  * error code otherwise.
1300  */
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	int ret;
	cpumask_var_t mask;

	/* The user buffer must be able to hold all possible CPU ids ... */
	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
		return -EINVAL;
	/* ... and be a whole multiple of sizeof(unsigned long). */
	if (len & (sizeof(unsigned long)-1))
		return -EINVAL;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	ret = sched_getaffinity(pid, mask);
	if (ret == 0) {
		unsigned int retlen = min(len, cpumask_size());

		if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
			ret = -EFAULT;
		else
			/* Success: return the number of bytes copied out. */
			ret = retlen;
	}
	free_cpumask_var(mask);

	return ret;
}
1328 
/* Yield the current CPU by asking the donor's class to requeue it. */
static void do_sched_yield(void)
{
	struct rq_flags rf;
	struct rq *rq;

	rq = this_rq_lock_irq(&rf);

	schedstat_inc(rq->yld_count);
	rq->donor->sched_class->yield_task(rq);

	/*
	 * Since we are about to call schedule() anyway, disable preemption
	 * across the unlock so no resched fires before we get there.
	 */
	preempt_disable();
	rq_unlock_irq(rq, &rf);
	sched_preempt_enable_no_resched();

	schedule();
}
1345 
1346 /**
1347  * sys_sched_yield - yield the current processor to other threads.
1348  *
1349  * This function yields the current CPU to other tasks. If there are no
1350  * other threads running on this CPU then this function will return.
1351  *
1352  * Return: 0.
1353  */
SYSCALL_DEFINE0(sched_yield)
{
	do_sched_yield();
	/* sched_yield(2) always succeeds. */
	return 0;
}
1359 
1360 /**
1361  * yield - yield the current processor to other threads.
1362  *
1363  * Do not ever use this function, there's a 99% chance you're doing it wrong.
1364  *
1365  * The scheduler is at all times free to pick the calling task as the most
1366  * eligible task to run, if removing the yield() call from your code breaks
1367  * it, it's already broken.
1368  *
1369  * Typical broken usage is:
1370  *
1371  * while (!event)
1372  *	yield();
1373  *
1374  * where one assumes that yield() will let 'the other' process run that will
1375  * make event true. If the current task is a SCHED_FIFO task that will never
1376  * happen. Never use yield() as a progress guarantee!!
1377  *
1378  * If you want to use yield() to wait for something, use wait_event().
1379  * If you want to use yield() to be 'nice' for others, use cond_resched().
1380  * If you still want to use yield(), do not!
1381  */
void __sched yield(void)
{
	/* Stay runnable: we want to be rescheduled, not to sleep. */
	set_current_state(TASK_RUNNING);
	do_sched_yield();
}
EXPORT_SYMBOL(yield);
1388 
1389 /**
1390  * yield_to - yield the current processor to another thread in
1391  * your thread group, or accelerate that thread toward the
1392  * processor it's on.
1393  * @p: target task
1394  * @preempt: whether task preemption is allowed or not
1395  *
1396  * It's the caller's job to ensure that the target task struct
1397  * can't go away on us before we can do any checks.
1398  *
1399  * Return:
1400  *	true (>0) if we indeed boosted the target task.
1401  *	false (0) if we failed to boost the target.
1402  *	-ESRCH if there's no task to yield to.
1403  */
int __sched yield_to(struct task_struct *p, bool preempt)
{
	struct task_struct *curr;
	struct rq *rq, *p_rq;
	int yielded = 0;

	/* pi_lock pins p's task_rq() association while we chase it. */
	scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
		rq = this_rq();
		curr = rq->donor;

again:
		p_rq = task_rq(p);
		/*
		 * If we're the only runnable task on the rq and target rq also
		 * has only one task, there's absolutely no point in yielding.
		 */
		if (rq->nr_running == 1 && p_rq->nr_running == 1)
			return -ESRCH;

		/* p may have migrated between the read and the lock: retry. */
		guard(double_rq_lock)(rq, p_rq);
		if (task_rq(p) != p_rq)
			goto again;

		if (!curr->sched_class->yield_to_task)
			return 0;

		if (curr->sched_class != p->sched_class)
			return 0;

		/* Can't boost a task that is currently running or not runnable. */
		if (task_on_cpu(p_rq, p) || !task_is_running(p))
			return 0;

		yielded = curr->sched_class->yield_to_task(rq, p);
		if (yielded) {
			schedstat_inc(rq->yld_count);
			/*
			 * Make p's CPU reschedule; pick_next_entity
			 * takes care of fairness.
			 */
			if (preempt && rq != p_rq)
				resched_curr(p_rq);
		}
	}

	/* Locks dropped; now actually give up the CPU if the boost worked. */
	if (yielded)
		schedule();

	return yielded;
}
EXPORT_SYMBOL_GPL(yield_to);
1454 
1455 /**
1456  * sys_sched_get_priority_max - return maximum RT priority.
1457  * @policy: scheduling class.
1458  *
1459  * Return: On success, this syscall returns the maximum
1460  * rt_priority that can be used by a given scheduling class.
1461  * On failure, a negative error code is returned.
1462  */
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
{
	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		/* RT priorities run 1..MAX_RT_PRIO-1. */
		return MAX_RT_PRIO-1;
	case SCHED_DEADLINE:
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
	case SCHED_EXT:
		/* Non-RT policies do not use rt_priority at all. */
		return 0;
	default:
		return -EINVAL;
	}
}
1482 
1483 /**
1484  * sys_sched_get_priority_min - return minimum RT priority.
1485  * @policy: scheduling class.
1486  *
1487  * Return: On success, this syscall returns the minimum
1488  * rt_priority that can be used by a given scheduling class.
1489  * On failure, a negative error code is returned.
1490  */
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
{
	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		/* The lowest valid RT priority is 1. */
		return 1;
	case SCHED_DEADLINE:
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
	case SCHED_EXT:
		/* Non-RT policies do not use rt_priority at all. */
		return 0;
	default:
		return -EINVAL;
	}
}
1509 
/*
 * Common helper for the sched_rr_get_interval syscalls: look up @pid's
 * round-robin timeslice (0 if the class has none) and convert it to a
 * timespec64 in @t.
 */
static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
	/* Classes without get_rr_interval() report an infinite (0) slice. */
	unsigned int time_slice = 0;
	int retval;

	if (pid < 0)
		return -EINVAL;

	scoped_guard (rcu) {
		struct task_struct *p = find_process_by_pid(pid);
		if (!p)
			return -ESRCH;

		retval = security_task_getscheduler(p);
		if (retval)
			return retval;

		/* Query the class under the task's rq lock for a stable read. */
		scoped_guard (task_rq_lock, p) {
			struct rq *rq = scope.rq;
			if (p->sched_class->get_rr_interval)
				time_slice = p->sched_class->get_rr_interval(rq, p);
		}
	}

	jiffies_to_timespec64(time_slice, t);
	return 0;
}
1537 
1538 /**
1539  * sys_sched_rr_get_interval - return the default time-slice of a process.
1540  * @pid: pid of the process.
1541  * @interval: userspace pointer to the time-slice value.
1542  *
1543  * this syscall writes the default time-slice value of a given process
1544  * into the user-space timespec buffer. A value of '0' means infinity.
1545  *
1546  * Return: On success, 0 and the time-slice is in @interval. Otherwise,
1547  * an error code.
1548  */
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
		struct __kernel_timespec __user *, interval)
{
	struct timespec64 t;
	int ret;

	/* Fetch the timeslice first; only copy out on success. */
	ret = sched_rr_get_interval(pid, &t);
	if (ret)
		return ret;

	return put_timespec64(&t, interval);
}
1560 
#ifdef CONFIG_COMPAT_32BIT_TIME
/* 32-bit time_t variant of sys_sched_rr_get_interval(). */
SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
		struct old_timespec32 __user *, interval)
{
	struct timespec64 t;
	int retval = sched_rr_get_interval(pid, &t);

	if (retval == 0)
		retval = put_old_timespec32(&t, interval);
	return retval;
}
#endif
1573