// SPDX-License-Identifier: GPL-2.0-only
/*
 * kernel/sched/syscalls.c
 *
 * Core kernel scheduler syscalls related code
 *
 * Copyright (C) 1991-2002 Linus Torvalds
 * Copyright (C) 1998-2024 Ingo Molnar, Red Hat
 */
#include <linux/sched.h>
#include <linux/cpuset.h>
#include <linux/sched/debug.h>

#include <uapi/linux/sched/types.h>

#include "sched.h"
#include "autogroup.h"

static inline int __normal_prio(int policy, int rt_prio, int nice)
{
	int prio;

	if (dl_policy(policy))
		prio = MAX_DL_PRIO - 1;
	else if (rt_policy(policy))
		prio = MAX_RT_PRIO - 1 - rt_prio;
	else
		prio = NICE_TO_PRIO(nice);

	return prio;
}
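
/*
 * For example, with MAX_RT_PRIO == 100: a SCHED_FIFO task with
 * rt_priority 50 maps to prio 49, a SCHED_NORMAL task at nice 0 maps to
 * prio NICE_TO_PRIO(0) == 120, and any SCHED_DEADLINE task maps to -1.
 */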

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_or_dl_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

void set_user_nice(struct task_struct *p, long nice)
{
	int old_prio;

	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
		return;
	/*
	 * We have to be careful, if called from sys_setpriority(),
	 * the task might be in the middle of scheduling on another CPU.
	 */
	guard(task_rq_lock)(p);

	/*
	 * The RT priorities are set via sched_setscheduler(), but we still
	 * allow the 'normal' nice value to be set - but as expected
	 * it won't have any effect on scheduling as long as the task has a
	 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR policy:
	 */
	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
		p->static_prio = NICE_TO_PRIO(nice);
		return;
	}

	scoped_guard (sched_change, p, DEQUEUE_SAVE) {
		p->static_prio = NICE_TO_PRIO(nice);
		set_load_weight(p, true);
		old_prio = p->prio;
		p->prio = effective_prio(p);
	}
}
EXPORT_SYMBOL(set_user_nice);

/*
 * is_nice_reduction - check if nice value is an actual reduction
 * @p: task
 * @nice: nice value
 *
 * Similar to can_nice() but does not perform a capability check.
 */
static bool is_nice_reduction(const struct task_struct *p, const int nice)
{
	/* Convert nice value [19,-20] to rlimit style value [1,40]: */
	int nice_rlim = nice_to_rlimit(nice);

	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
}

/*
 * can_nice - check if a task can reduce its nice value
 * @p: task
 * @nice: nice value
 */
int can_nice(const struct task_struct *p, const int nice)
{
	return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
}
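
/*
 * For example, nice_to_rlimit(-5) == 25, so an unprivileged task needs
 * RLIMIT_NICE >= 25 (or CAP_SYS_NICE) before it may request nice -5.
 */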

#ifdef __ARCH_WANT_SYS_NICE

/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 */
SYSCALL_DEFINE1(nice, int, increment)
{
	long nice, retval;

	/*
	 * Setpriority might change our priority at the same moment.
	 * We don't have to worry. Conceptually one call occurs first
	 * and we have a single winner.
	 */
	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
	nice = task_nice(current) + increment;

	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
	if (increment < 0 && !can_nice(current, nice))
		return -EPERM;

	retval = security_task_setnice(current, nice);
	if (retval)
		return retval;

	set_user_nice(current, nice);
	return 0;
}

#endif /* __ARCH_WANT_SYS_NICE */

/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * Return: The priority value as seen by users in /proc.
 *
 * sched policy         return value   kernel prio    user prio/nice
 *
 * normal, batch, idle     [0 ... 39]  [100 ... 139]  0/[-20 ... 19]
 * fifo, rr             [-2 ... -100]     [98 ... 0]  [1 ... 99]
 * deadline                     -101             -1   0
 */
int task_prio(const struct task_struct *p)
{
	return p->prio - MAX_RT_PRIO;
}
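
/*
 * For example, a CFS task at nice 0 has p->prio == 120 and is reported
 * as priority 20, while a SCHED_FIFO task with rt_priority 99 has
 * p->prio == 0 and is reported as -100.
 */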

/**
 * idle_cpu - is a given CPU idle currently?
 * @cpu: the processor in question.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */
int idle_cpu(int cpu)
{
	return idle_rq(cpu_rq(cpu));
}

/**
 * idle_task - return the idle task for a given CPU.
 * @cpu: the processor in question.
 *
 * Return: The idle task for the CPU @cpu.
 */
struct task_struct *idle_task(int cpu)
{
	return cpu_rq(cpu)->idle;
}

#ifdef CONFIG_SCHED_CORE
int sched_core_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (sched_core_enabled(rq) && rq->curr == rq->idle)
		return 1;

	return idle_cpu(cpu);
}
#endif /* CONFIG_SCHED_CORE */

/**
 * find_process_by_pid - find a process with a matching PID value.
 * @pid: the pid in question.
 *
 * The task of @pid, if found. %NULL otherwise.
 */
static struct task_struct *find_process_by_pid(pid_t pid)
{
	return pid ? find_task_by_vpid(pid) : current;
}

static struct task_struct *find_get_task(pid_t pid)
{
	struct task_struct *p;
	guard(rcu)();

	p = find_process_by_pid(pid);
	if (likely(p))
		get_task_struct(p);

	return p;
}

DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
	     find_get_task(pid), pid_t pid)

/*
 * sched_setparam() passes in -1 for its policy, to let the functions
 * it calls know not to change it.
 */
#define SETPARAM_POLICY	-1

static void __setscheduler_params(struct task_struct *p,
				  const struct sched_attr *attr)
{
	int policy = attr->sched_policy;

	if (policy == SETPARAM_POLICY)
		policy = p->policy;

	p->policy = policy;

	if (dl_policy(policy))
		__setparam_dl(p, attr);
	else if (fair_policy(policy))
		__setparam_fair(p, attr);

	/* rt-policy tasks do not have a timerslack */
	if (rt_or_dl_task_policy(p)) {
		p->timer_slack_ns = 0;
	} else if (p->timer_slack_ns == 0) {
		/* when switching back to non-rt policy, restore timerslack */
		p->timer_slack_ns = p->default_timer_slack_ns;
	}

	/*
	 * __sched_setscheduler() ensures attr->sched_priority == 0 when
	 * !rt_policy. Always setting this ensures that things like
	 * getparam()/getattr() don't report silly values for !rt tasks.
	 */
	p->rt_priority = attr->sched_priority;
	p->normal_prio = normal_prio(p);
	set_load_weight(p, true);
}

/*
 * Check the target process has a UID that matches the current process's:
 */
static bool check_same_owner(struct task_struct *p)
{
	const struct cred *cred = current_cred(), *pcred;
	guard(rcu)();

	pcred = __task_cred(p);
	return (uid_eq(cred->euid, pcred->euid) ||
		uid_eq(cred->euid, pcred->uid));
}

#ifdef CONFIG_UCLAMP_TASK

static int uclamp_validate(struct task_struct *p,
			   const struct sched_attr *attr)
{
	int util_min = p->uclamp_req[UCLAMP_MIN].value;
	int util_max = p->uclamp_req[UCLAMP_MAX].value;

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
		util_min = attr->sched_util_min;

		if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
			return -EINVAL;
	}

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
		util_max = attr->sched_util_max;

		if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
			return -EINVAL;
	}

	if (util_min != -1 && util_max != -1 && util_min > util_max)
		return -EINVAL;

	/*
	 * We have valid uclamp attributes; make sure uclamp is enabled.
	 *
	 * We need to do that here, because enabling static branches is a
	 * blocking operation which obviously cannot be done while holding
	 * scheduler locks.
	 */
	sched_uclamp_enable();

	return 0;
}

static bool uclamp_reset(const struct sched_attr *attr,
			 enum uclamp_id clamp_id,
			 struct uclamp_se *uc_se)
{
	/* Reset on sched class change for a non user-defined clamp value. */
	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
	    !uc_se->user_defined)
		return true;

	/* Reset on sched_util_{min,max} == -1. */
	if (clamp_id == UCLAMP_MIN &&
	    attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
	    attr->sched_util_min == -1) {
		return true;
	}

	if (clamp_id == UCLAMP_MAX &&
	    attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
	    attr->sched_util_max == -1) {
		return true;
	}

	return false;
}

static void __setscheduler_uclamp(struct task_struct *p,
				  const struct sched_attr *attr)
{
	enum uclamp_id clamp_id;

	for_each_clamp_id(clamp_id) {
		struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
		unsigned int value;

		if (!uclamp_reset(attr, clamp_id, uc_se))
			continue;

		/*
		 * RT tasks by default get a 100% boost value that can be
		 * modified at runtime.
		 */
		if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
			value = sysctl_sched_uclamp_util_min_rt_default;
		else
			value = uclamp_none(clamp_id);

		uclamp_se_set(uc_se, value, false);
	}

	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
		return;

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
	    attr->sched_util_min != -1) {
		uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
			      attr->sched_util_min, true);
	}

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
	    attr->sched_util_max != -1) {
		uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
			      attr->sched_util_max, true);
	}
}
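
/*
 * For example, userspace can request a utilization clamp without
 * touching the task's policy or other parameters by passing something
 * like the following to sched_setattr(2) (a sketch; 256 is 25% of
 * SCHED_CAPACITY_SCALE):
 *
 *	struct sched_attr attr = {
 *		.size		= sizeof(attr),
 *		.sched_flags	= SCHED_FLAG_KEEP_ALL |
 *				  SCHED_FLAG_UTIL_CLAMP_MAX,
 *		.sched_util_max	= 256,
 *	};
 *
 * Passing -1 in sched_util_{min,max} instead resets the corresponding
 * clamp to its non user-defined default (see uclamp_reset() above).
 */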

#else /* !CONFIG_UCLAMP_TASK: */

static inline int uclamp_validate(struct task_struct *p,
				  const struct sched_attr *attr)
{
	return -EOPNOTSUPP;
}
static void __setscheduler_uclamp(struct task_struct *p,
				  const struct sched_attr *attr) { }
#endif /* !CONFIG_UCLAMP_TASK */

/*
 * Allow unprivileged RT tasks to decrease priority.
 * Only issue a capable test if needed and only once to avoid an audit
 * event on permitted non-privileged operations:
 */
static int user_check_sched_setscheduler(struct task_struct *p,
					 const struct sched_attr *attr,
					 int policy, int reset_on_fork)
{
	if (fair_policy(policy)) {
		if (attr->sched_nice < task_nice(p) &&
		    !is_nice_reduction(p, attr->sched_nice))
			goto req_priv;
	}

	if (rt_policy(policy)) {
		unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);

		/* Can't set/change the rt policy: */
		if (policy != p->policy && !rlim_rtprio)
			goto req_priv;

		/* Can't increase priority: */
		if (attr->sched_priority > p->rt_priority &&
		    attr->sched_priority > rlim_rtprio)
			goto req_priv;
	}

	/*
	 * Can't set/change SCHED_DEADLINE policy at all for now
	 * (safest behavior); in the future we would like to allow
	 * unprivileged DL tasks to increase their relative deadline
	 * or reduce their runtime (both ways reducing utilization)
	 */
	if (dl_policy(policy))
		goto req_priv;

	/*
	 * Treat SCHED_IDLE as nice 20. Only allow a switch to
	 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
	 */
	if (task_has_idle_policy(p) && !idle_policy(policy)) {
		if (!is_nice_reduction(p, task_nice(p)))
			goto req_priv;
	}

	/* Can't change other user's priorities: */
	if (!check_same_owner(p))
		goto req_priv;

	/* Normal users shall not reset the sched_reset_on_fork flag: */
	if (p->sched_reset_on_fork && !reset_on_fork)
		goto req_priv;

	return 0;

req_priv:
	if (!capable(CAP_SYS_NICE))
		return -EPERM;

	return 0;
}

int __sched_setscheduler(struct task_struct *p,
			 const struct sched_attr *attr,
			 bool user, bool pi)
{
	int oldpolicy = -1, policy = attr->sched_policy;
	int retval, oldprio, newprio;
	const struct sched_class *prev_class, *next_class;
	struct balance_callback *head;
	struct rq_flags rf;
	int reset_on_fork;
	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
	struct rq *rq;
	bool cpuset_locked = false;

	/* The pi code expects interrupts enabled */
	BUG_ON(pi && in_interrupt());
recheck:
	/* Double check policy once rq lock held: */
	if (policy < 0) {
		reset_on_fork = p->sched_reset_on_fork;
		policy = oldpolicy = p->policy;
	} else {
		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);

		if (!valid_policy(policy))
			return -EINVAL;
	}

	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
		return -EINVAL;

	/*
	 * Valid priorities for SCHED_FIFO and SCHED_RR are
	 * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
	 * SCHED_BATCH and SCHED_IDLE is 0.
	 */
	if (attr->sched_priority > MAX_RT_PRIO-1)
		return -EINVAL;
	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
	    (rt_policy(policy) != (attr->sched_priority != 0)))
		return -EINVAL;

	if (user) {
		retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
		if (retval)
			return retval;

		if (attr->sched_flags & SCHED_FLAG_SUGOV)
			return -EINVAL;

		retval = security_task_setscheduler(p);
		if (retval)
			return retval;
	}

	/* Update task specific "requested" clamps */
	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
		retval = uclamp_validate(p, attr);
		if (retval)
			return retval;
	}

	/*
	 * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
	 * information.
	 */
	if (dl_policy(policy) || dl_policy(p->policy)) {
		cpuset_locked = true;
		cpuset_lock();
	}

	/*
	 * Make sure no PI-waiters arrive (or leave) while we are
	 * changing the priority of the task:
	 *
	 * To be able to change p->policy safely, the appropriate
	 * runqueue lock must be held.
	 */
	rq = task_rq_lock(p, &rf);
	update_rq_clock(rq);

	/*
	 * Changing the policy of the stop threads is a very bad idea:
	 */
	if (p == rq->stop) {
		retval = -EINVAL;
		goto unlock;
	}

	retval = scx_check_setscheduler(p, policy);
	if (retval)
		goto unlock;

	/*
	 * If not changing anything there's no need to proceed further,
	 * but store a possible modification of reset_on_fork.
	 */
	if (unlikely(policy == p->policy)) {
		if (fair_policy(policy) &&
		    (attr->sched_nice != task_nice(p) ||
		     (attr->sched_runtime != p->se.slice)))
			goto change;
		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
			goto change;
		if (dl_policy(policy) && dl_param_changed(p, attr))
			goto change;
		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
			goto change;

		p->sched_reset_on_fork = reset_on_fork;
		retval = 0;
		goto unlock;
	}
change:

	if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
		/*
		 * Do not allow real-time tasks into groups that have no runtime
		 * assigned.
		 */
		if (rt_group_sched_enabled() &&
		    rt_bandwidth_enabled() && rt_policy(policy) &&
		    task_group(p)->rt_bandwidth.rt_runtime == 0 &&
		    !task_group_is_autogroup(task_group(p))) {
			retval = -EPERM;
			goto unlock;
		}
#endif /* CONFIG_RT_GROUP_SCHED */
		if (dl_bandwidth_enabled() && dl_policy(policy) &&
		    !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
			cpumask_t *span = rq->rd->span;

			/*
			 * Don't allow tasks with an affinity mask smaller than
			 * the entire root_domain to become SCHED_DEADLINE. We
			 * will also fail if there's no bandwidth available.
			 */
			if (!cpumask_subset(span, p->cpus_ptr) ||
			    rq->rd->dl_bw.bw == 0) {
				retval = -EPERM;
				goto unlock;
			}
		}
	}

	/* Re-check policy now with rq lock held: */
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
		policy = oldpolicy = -1;
		task_rq_unlock(rq, p, &rf);
		if (cpuset_locked)
			cpuset_unlock();
		goto recheck;
	}

	/*
	 * If setscheduling to SCHED_DEADLINE (or changing the parameters
	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
	 * is available.
	 */
	if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
		retval = -EBUSY;
		goto unlock;
	}

	p->sched_reset_on_fork = reset_on_fork;
	oldprio = p->prio;

	newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
	if (pi) {
		/*
		 * Take priority boosted tasks into account. If the new
		 * effective priority is unchanged, we just store the new
		 * normal parameters and do not touch the scheduler class and
		 * the runqueue. This will be done when the task deboosts
		 * itself.
		 */
		newprio = rt_effective_prio(p, newprio);
		if (newprio == oldprio && !dl_prio(newprio))
			queue_flags &= ~DEQUEUE_MOVE;
	}

	prev_class = p->sched_class;
	next_class = __setscheduler_class(policy, newprio);

	if (prev_class != next_class)
		queue_flags |= DEQUEUE_CLASS;

	scoped_guard (sched_change, p, queue_flags) {

		if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
			__setscheduler_params(p, attr);
			p->sched_class = next_class;
			p->prio = newprio;
		}
		__setscheduler_uclamp(p, attr);

		if (scope->queued) {
			/*
			 * We enqueue to tail when the priority of a task is
			 * increased (user space view).
			 */
			if (oldprio < p->prio)
				scope->flags |= ENQUEUE_HEAD;
		}
	}

	/* Avoid rq from going away on us: */
	preempt_disable();
	head = splice_balance_callbacks(rq);
	task_rq_unlock(rq, p, &rf);

	if (pi) {
		if (cpuset_locked)
			cpuset_unlock();
		rt_mutex_adjust_pi(p);
	}

	/* Run balance callbacks after we've adjusted the PI chain: */
	balance_callbacks(rq, head);
	preempt_enable();

	return 0;

unlock:
	task_rq_unlock(rq, p, &rf);
	if (cpuset_locked)
		cpuset_unlock();
	return retval;
}

static int _sched_setscheduler(struct task_struct *p, int policy,
			       const struct sched_param *param, bool check)
{
	struct sched_attr attr = {
		.sched_policy   = policy,
		.sched_priority = param->sched_priority,
		.sched_nice     = PRIO_TO_NICE(p->static_prio),
	};

	if (p->se.custom_slice)
		attr.sched_runtime = p->se.slice;

	/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
	if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
		policy &= ~SCHED_RESET_ON_FORK;
		attr.sched_policy = policy;
	}

	return __sched_setscheduler(p, &attr, check, true);
}
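
/*
 * For example, a legacy sched_setscheduler(2) call with
 * policy == SCHED_FIFO | SCHED_RESET_ON_FORK ends up here as
 * attr.sched_policy == SCHED_FIFO plus SCHED_FLAG_RESET_ON_FORK.
 */
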
/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Use sched_set_fifo(), read its comment.
 *
 * Return: 0 on success. An error code otherwise.
 *
 * NOTE that the task may be already dead.
 */
int sched_setscheduler(struct task_struct *p, int policy,
		       const struct sched_param *param)
{
	return _sched_setscheduler(p, policy, param, true);
}

int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
	return __sched_setscheduler(p, attr, true, true);
}

int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
	return __sched_setscheduler(p, attr, false, true);
}
EXPORT_SYMBOL_GPL(sched_setattr_nocheck);

/**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernel-space.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Just like sched_setscheduler, only don't bother checking if the
 * current context has permission. For example, this is needed in
 * stop_machine(): we create temporary high priority worker threads,
 * but our caller might not have that capability.
 *
 * Return: 0 on success. An error code otherwise.
 */
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
			       const struct sched_param *param)
{
	return _sched_setscheduler(p, policy, param, false);
}

/*
 * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
 * incapable of resource management, which is the one thing an OS really should
 * be doing.
 *
 * This is of course the reason it is limited to privileged users only.
 *
 * Worse still; it is fundamentally impossible to compose static priority
 * workloads. You cannot take two correctly working static prio workloads
 * and smash them together and still expect them to work.
 *
 * For this reason 'all' FIFO tasks the kernel creates are basically at:
 *
 *	MAX_RT_PRIO / 2
 *
 * The administrator _MUST_ configure the system, the kernel simply doesn't
 * know enough information to make a sensible choice.
 */
void sched_set_fifo(struct task_struct *p)
{
	struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_fifo);

/*
 * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
 */
void sched_set_fifo_low(struct task_struct *p)
{
	struct sched_param sp = { .sched_priority = 1 };
	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_fifo_low);

/*
 * Used when the primary interrupt handler is forced into a thread, in addition
 * to the (always threaded) secondary handler. The secondary handler gets a
 * slightly lower priority so that the primary handler can preempt it, thereby
 * emulating the behavior of a non-PREEMPT_RT system where the primary handler
 * runs in hard interrupt context.
 */
void sched_set_fifo_secondary(struct task_struct *p)
{
	struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 - 1 };
	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}

void sched_set_normal(struct task_struct *p, int nice)
{
	struct sched_attr attr = {
		.sched_policy = SCHED_NORMAL,
		.sched_nice = nice,
	};
	WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_normal);
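
/*
 * Typical in-kernel usage (a sketch, not taken from a specific caller):
 * a kthread that must not be starved by fair tasks, but has no precise
 * latency requirement, would do
 *
 *	sched_set_fifo_low(tsk);
 *
 * while demoting a thread back to a fair policy at nice -10 would be
 *
 *	sched_set_normal(tsk, -10);
 */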

static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
	struct sched_param lparam;

	if (unlikely(!param || pid < 0))
		return -EINVAL;
	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
		return -EFAULT;

	CLASS(find_get_task, p)(pid);
	if (!p)
		return -ESRCH;

	return sched_setscheduler(p, policy, &lparam);
}

/*
 * Mimics kernel/events/core.c perf_copy_attr().
 */
static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
{
	u32 size;
	int ret;

	/* Zero the full structure, so that a short copy will be nice: */
	memset(attr, 0, sizeof(*attr));

	ret = get_user(size, &uattr->size);
	if (ret)
		return ret;

	/* ABI compatibility quirk: */
	if (!size)
		size = SCHED_ATTR_SIZE_VER0;
	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
		goto err_size;

	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
	if (ret) {
		if (ret == -E2BIG)
			goto err_size;
		return ret;
	}

	if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
	    size < SCHED_ATTR_SIZE_VER1)
		return -EINVAL;

	/*
	 * XXX: Do we want to be lenient like existing syscalls; or do we want
	 * to be strict and return an error on out-of-bounds values?
	 */
	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);

	return 0;

err_size:
	put_user(sizeof(*attr), &uattr->size);
	return -E2BIG;
}
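
/*
 * Note the forward/backward compatibility handshake above: an older
 * userspace passing a smaller sched_attr simply gets the missing fields
 * zero-filled, while a newer userspace passing a larger structure is
 * only accepted if the extra trailing bytes are zero; otherwise
 * copy_struct_from_user() fails with -E2BIG and we report the size we
 * do understand back through uattr->size.
 */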

static void get_params(struct task_struct *p, struct sched_attr *attr)
{
	if (task_has_dl_policy(p)) {
		__getparam_dl(p, attr);
	} else if (task_has_rt_policy(p)) {
		attr->sched_priority = p->rt_priority;
	} else {
		attr->sched_nice = task_nice(p);
		attr->sched_runtime = p->se.slice;
	}
}

/**
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
{
	if (policy < 0)
		return -EINVAL;

	return do_sched_setscheduler(pid, policy, param);
}

/**
 * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
}

/**
 * sys_sched_setattr - same as above, but with extended sched_attr
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
 * @flags: for future extension.
 */
SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
			       unsigned int, flags)
{
	struct sched_attr attr;
	int retval;

	if (unlikely(!uattr || pid < 0 || flags))
		return -EINVAL;

	retval = sched_copy_attr(uattr, &attr);
	if (retval)
		return retval;

	if ((int)attr.sched_policy < 0)
		return -EINVAL;
	if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
		attr.sched_policy = SETPARAM_POLICY;

	CLASS(find_get_task, p)(pid);
	if (!p)
		return -ESRCH;

	if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
		get_params(p, &attr);

	return sched_setattr(p, &attr);
}
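
/*
 * For example, userspace turning the calling thread into a
 * SCHED_DEADLINE task with a 10ms runtime every 100ms period might
 * issue the raw syscall roughly like this (values are in nanoseconds,
 * pid 0 means "current"):
 *
 *	struct sched_attr attr = {
 *		.size		= sizeof(attr),
 *		.sched_policy	= SCHED_DEADLINE,
 *		.sched_runtime	=  10 * 1000 * 1000,
 *		.sched_deadline	= 100 * 1000 * 1000,
 *		.sched_period	= 100 * 1000 * 1000,
 *	};
 *	syscall(__NR_sched_setattr, 0, &attr, 0);
 */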

/**
 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
 * @pid: the pid in question.
 *
 * Return: On success, the policy of the thread. Otherwise, a negative error
 * code.
 */
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
{
	struct task_struct *p;
	int retval;

	if (pid < 0)
		return -EINVAL;

	guard(rcu)();
	p = find_process_by_pid(pid);
	if (!p)
		return -ESRCH;

	retval = security_task_getscheduler(p);
	if (!retval) {
		retval = p->policy;
		if (p->sched_reset_on_fork)
			retval |= SCHED_RESET_ON_FORK;
	}
	return retval;
}

/**
 * sys_sched_getparam - get the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the RT priority.
 *
 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
 * code.
 */
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
	struct sched_param lp = { .sched_priority = 0 };
	struct task_struct *p;
	int retval;

	if (unlikely(!param || pid < 0))
		return -EINVAL;

	scoped_guard (rcu) {
		p = find_process_by_pid(pid);
		if (!p)
			return -ESRCH;

		retval = security_task_getscheduler(p);
		if (retval)
			return retval;

		if (task_has_rt_policy(p))
			lp.sched_priority = p->rt_priority;
	}

	/*
	 * This one might sleep, we cannot do it with a spinlock held ...
	 */
	return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
}

/**
 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
 * @usize: sizeof(attr) for fwd/bwd comp.
 * @flags: for future extension.
 */
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
		unsigned int, usize, unsigned int, flags)
{
	struct sched_attr kattr = { };
	struct task_struct *p;
	int retval;

	if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE ||
		     usize < SCHED_ATTR_SIZE_VER0 || flags))
		return -EINVAL;

	scoped_guard (rcu) {
		p = find_process_by_pid(pid);
		if (!p)
			return -ESRCH;

		retval = security_task_getscheduler(p);
		if (retval)
			return retval;

		kattr.sched_policy = p->policy;
		if (p->sched_reset_on_fork)
			kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
		get_params(p, &kattr);
		kattr.sched_flags &= SCHED_FLAG_ALL;

#ifdef CONFIG_UCLAMP_TASK
		/*
		 * This could race with another potential updater, but this is fine
		 * because it'll correctly read the old or the new value. We don't need
		 * to guarantee who wins the race as long as it doesn't return garbage.
		 */
		kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
		kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
	}

	kattr.size = min(usize, sizeof(kattr));
	return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL);
}

int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
	/*
	 * If the task isn't a deadline task or admission control is
	 * disabled then we don't care about affinity changes.
	 */
	if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
		return 0;

	/*
	 * The special/sugov task isn't part of regular bandwidth/admission
	 * control so let userspace change affinities.
	 */
	if (dl_entity_is_special(&p->dl))
		return 0;

	/*
	 * Since bandwidth control happens on root_domain basis,
	 * if admission test is enabled, we only admit -deadline
	 * tasks allowed to run on all the CPUs in the task's
	 * root_domain.
	 */
	guard(rcu)();
	if (!cpumask_subset(task_rq(p)->rd->span, mask))
		return -EBUSY;

	return 0;
}

int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
{
	int retval;
	cpumask_var_t cpus_allowed, new_mask;

	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
		return -ENOMEM;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
		retval = -ENOMEM;
		goto out_free_cpus_allowed;
	}

	cpuset_cpus_allowed(p, cpus_allowed);
	cpumask_and(new_mask, ctx->new_mask, cpus_allowed);

	ctx->new_mask = new_mask;
	ctx->flags |= SCA_CHECK;

	retval = dl_task_check_affinity(p, new_mask);
	if (retval)
		goto out_free_new_mask;

	retval = __set_cpus_allowed_ptr(p, ctx);
	if (retval)
		goto out_free_new_mask;

	cpuset_cpus_allowed(p, cpus_allowed);
	if (!cpumask_subset(new_mask, cpus_allowed)) {
		/*
		 * We must have raced with a concurrent cpuset update.
		 * Just reset the cpumask to the cpuset's cpus_allowed.
		 */
		cpumask_copy(new_mask, cpus_allowed);

		/*
		 * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
		 * will restore the previous user_cpus_ptr value.
		 *
		 * In the unlikely event a previous user_cpus_ptr exists,
		 * we need to further restrict the mask to what is allowed
		 * by that old user_cpus_ptr.
		 */
		if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
			bool empty = !cpumask_and(new_mask, new_mask,
						  ctx->user_mask);

			if (empty)
				cpumask_copy(new_mask, cpus_allowed);
		}
		__set_cpus_allowed_ptr(p, ctx);
		retval = -EINVAL;
	}

out_free_new_mask:
	free_cpumask_var(new_mask);
out_free_cpus_allowed:
	free_cpumask_var(cpus_allowed);
	return retval;
}

long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
	struct affinity_context ac;
	struct cpumask *user_mask;
	int retval;

	CLASS(find_get_task, p)(pid);
	if (!p)
		return -ESRCH;

	if (p->flags & PF_NO_SETAFFINITY)
		return -EINVAL;

	if (!check_same_owner(p)) {
		guard(rcu)();
		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
			return -EPERM;
	}

	retval = security_task_setscheduler(p);
	if (retval)
		return retval;

	/*
	 * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
	 * alloc_user_cpus_ptr() returns NULL.
	 */
	user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
	if (user_mask) {
		cpumask_copy(user_mask, in_mask);
	} else {
		return -ENOMEM;
	}

	ac = (struct affinity_context){
		.new_mask  = in_mask,
		.user_mask = user_mask,
		.flags     = SCA_USER,
	};

	retval = __sched_setaffinity(p, &ac);
	kfree(ac.user_mask);

	return retval;
}

static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
			     struct cpumask *new_mask)
{
	if (len < cpumask_size())
		cpumask_clear(new_mask);
	else if (len > cpumask_size())
		len = cpumask_size();

	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
}

/**
 * sys_sched_setaffinity - set the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to the new CPU mask
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	cpumask_var_t new_mask;
	int retval;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
	if (retval == 0)
		retval = sched_setaffinity(pid, new_mask);
	free_cpumask_var(new_mask);
	return retval;
}

long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
	struct task_struct *p;
	int retval;

	guard(rcu)();
	p = find_process_by_pid(pid);
	if (!p)
		return -ESRCH;

	retval = security_task_getscheduler(p);
	if (retval)
		return retval;

	guard(raw_spinlock_irqsave)(&p->pi_lock);
	cpumask_and(mask, &p->cpus_mask, cpu_active_mask);

	return 0;
}

/**
 * sys_sched_getaffinity - get the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to hold the current CPU mask
 *
 * Return: size of CPU mask copied to user_mask_ptr on success. An
 * error code otherwise.
 */
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	int ret;
	cpumask_var_t mask;

	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
		return -EINVAL;
	if (len & (sizeof(unsigned long)-1))
		return -EINVAL;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	ret = sched_getaffinity(pid, mask);
	if (ret == 0) {
		unsigned int retlen = min(len, cpumask_size());

		if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
			ret = -EFAULT;
		else
			ret = retlen;
	}
	free_cpumask_var(mask);

	return ret;
}
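
/*
 * Note that on success the raw syscall returns the number of bytes
 * copied into @user_mask_ptr (min(len, cpumask_size())), not 0; the C
 * library wrapper for sched_getaffinity() normally hides this and
 * returns 0 to its callers.
 */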

static void do_sched_yield(void)
{
	struct rq_flags rf;
	struct rq *rq;

	rq = this_rq_lock_irq(&rf);

	schedstat_inc(rq->yld_count);
	rq->donor->sched_class->yield_task(rq);

	preempt_disable();
	rq_unlock_irq(rq, &rf);
	sched_preempt_enable_no_resched();

	schedule();
}

/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 *
 * Return: 0.
 */
SYSCALL_DEFINE0(sched_yield)
{
	do_sched_yield();
	return 0;
}

/**
 * yield - yield the current processor to other threads.
 *
 * Do not ever use this function, there's a 99% chance you're doing it wrong.
 *
 * The scheduler is at all times free to pick the calling task as the most
 * eligible task to run, if removing the yield() call from your code breaks
 * it, it's already broken.
 *
 * Typical broken usage is:
 *
 *	while (!event)
 *		yield();
 *
 * where one assumes that yield() will let 'the other' process run that will
 * make event true. If the current task is a SCHED_FIFO task that will never
 * happen. Never use yield() as a progress guarantee!!
 *
 * If you want to use yield() to wait for something, use wait_event().
 * If you want to use yield() to be 'nice' for others, use cond_resched().
 * If you still want to use yield(), do not!
 */
void __sched yield(void)
{
	set_current_state(TASK_RUNNING);
	do_sched_yield();
}
EXPORT_SYMBOL(yield);

/**
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * It's the caller's job to ensure that the target task struct
 * can't go away on us before we can do any checks.
 *
 * Return:
 *	true (>0) if we indeed boosted the target task.
 *	false (0) if we failed to boost the target.
 *	-ESRCH if there's no task to yield to.
 */
int __sched yield_to(struct task_struct *p, bool preempt)
{
	struct task_struct *curr;
	struct rq *rq, *p_rq;
	int yielded = 0;

	scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
		rq = this_rq();
		curr = rq->donor;

again:
		p_rq = task_rq(p);
		/*
		 * If we're the only runnable task on the rq and target rq also
		 * has only one task, there's absolutely no point in yielding.
		 */
		if (rq->nr_running == 1 && p_rq->nr_running == 1)
			return -ESRCH;

		guard(double_rq_lock)(rq, p_rq);
		if (task_rq(p) != p_rq)
			goto again;

		if (!curr->sched_class->yield_to_task)
			return 0;

		if (curr->sched_class != p->sched_class)
			return 0;

		if (task_on_cpu(p_rq, p) || !task_is_running(p))
			return 0;

		yielded = curr->sched_class->yield_to_task(rq, p);
		if (yielded) {
			schedstat_inc(rq->yld_count);
			/*
			 * Make p's CPU reschedule; pick_next_entity
			 * takes care of fairness.
			 */
			if (preempt && rq != p_rq)
				resched_curr(p_rq);
		}
	}

	if (yielded)
		schedule();

	return yielded;
}
EXPORT_SYMBOL_GPL(yield_to);

/**
 * sys_sched_get_priority_max - return maximum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the maximum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
{
	int ret = -EINVAL;

	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		ret = MAX_RT_PRIO-1;
		break;
	case SCHED_DEADLINE:
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
	case SCHED_EXT:
		ret = 0;
		break;
	}
	return ret;
}
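
/*
 * With MAX_RT_PRIO == 100 this reports, for example, a usable priority
 * range of 1..99 for SCHED_FIFO and SCHED_RR (see
 * sys_sched_get_priority_min() below) and a fixed priority of 0 for all
 * other policies.
 */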

/**
 * sys_sched_get_priority_min - return minimum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the minimum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
{
	int ret = -EINVAL;

	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		ret = 1;
		break;
	case SCHED_DEADLINE:
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
	case SCHED_EXT:
		ret = 0;
	}
	return ret;
}

static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
	unsigned int time_slice = 0;
	int retval;

	if (pid < 0)
		return -EINVAL;

	scoped_guard (rcu) {
		struct task_struct *p = find_process_by_pid(pid);
		if (!p)
			return -ESRCH;

		retval = security_task_getscheduler(p);
		if (retval)
			return retval;

		scoped_guard (task_rq_lock, p) {
			struct rq *rq = scope.rq;
			if (p->sched_class->get_rr_interval)
				time_slice = p->sched_class->get_rr_interval(rq, p);
		}
	}

	jiffies_to_timespec64(time_slice, t);
	return 0;
}

/**
 * sys_sched_rr_get_interval - return the default time-slice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the time-slice value.
 *
 * this syscall writes the default time-slice value of a given process
 * into the user-space timespec buffer. A value of '0' means infinity.
 *
 * Return: On success, 0 and the time-slice is in @interval. Otherwise,
 * an error code.
 */
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
		struct __kernel_timespec __user *, interval)
{
	struct timespec64 t;
	int retval = sched_rr_get_interval(pid, &t);

	if (retval == 0)
		retval = put_timespec64(&t, interval);

	return retval;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
		struct old_timespec32 __user *, interval)
{
	struct timespec64 t;
	int retval = sched_rr_get_interval(pid, &t);

	if (retval == 0)
		retval = put_old_timespec32(&t, interval);
	return retval;
}
#endif