1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * kernel/cpuset.c
4 *
5 * Processor and Memory placement constraints for sets of tasks.
6 *
7 * Copyright (C) 2003 BULL SA.
8 * Copyright (C) 2004-2007 Silicon Graphics, Inc.
9 * Copyright (C) 2006 Google, Inc
10 *
11 * Portions derived from Patrick Mochel's sysfs code.
12 * sysfs is Copyright (c) 2001-3 Patrick Mochel
13 *
14 * 2003-10-10 Written by Simon Derr.
15 * 2003-10-22 Updates by Stephen Hemminger.
16 * 2004 May-July Rework by Paul Jackson.
17 * 2006 Rework by Paul Menage to use generic cgroups
18 * 2008 Rework of the scheduler domains and CPU hotplug handling
19 * by Max Krasnyansky
20 */
21 #include "cpuset-internal.h"
22
23 #include <linux/init.h>
24 #include <linux/interrupt.h>
25 #include <linux/kernel.h>
26 #include <linux/mempolicy.h>
27 #include <linux/mm.h>
28 #include <linux/memory.h>
29 #include <linux/rcupdate.h>
30 #include <linux/sched.h>
31 #include <linux/sched/deadline.h>
32 #include <linux/sched/mm.h>
33 #include <linux/sched/task.h>
34 #include <linux/security.h>
35 #include <linux/oom.h>
36 #include <linux/sched/isolation.h>
37 #include <linux/wait.h>
38 #include <linux/workqueue.h>
39 #include <linux/task_work.h>
40
/* Static keys defined via DEFINE_STATIC_KEY_FALSE for the cpuset subsystem. */
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/*
 * There could be abnormal cpuset configurations for cpu or memory
 * node binding; this key provides a quick, low-cost way to check for
 * that situation. It is enabled by check_insane_mems_config().
 */
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
50
51 static const char * const perr_strings[] = {
52 [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive",
53 [PERR_INVPARENT] = "Parent is an invalid partition root",
54 [PERR_NOTPART] = "Parent is not a partition root",
55 [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
56 [PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
57 [PERR_HOTPLUG] = "No cpu available due to hotplug",
58 [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
59 [PERR_HKEEPING] = "partition config conflicts with housekeeping setup",
60 [PERR_ACCESS] = "Enable partition not permitted",
61 [PERR_REMOTE] = "Have remote partition underneath",
62 };
63
/*
 * For local partitions, update to subpartitions_cpus & isolated_cpus is done
 * in update_parent_effective_cpumask(). For remote partitions, it is done in
 * the remote_partition_*() and remote_cpus_update() helpers.
 */
/*
 * Exclusive CPUs distributed out to local or remote sub-partitions of
 * top_cpuset
 */
static cpumask_var_t subpartitions_cpus;

/*
 * Exclusive CPUs in isolated partitions
 */
static cpumask_var_t isolated_cpus;

/*
 * isolated_cpus updating flag (protected by cpuset_mutex)
 * Set if isolated_cpus is going to be updated in the current
 * cpuset_mutex critical section.
 */
static bool isolated_cpus_updating;

/*
 * A flag to force sched domain rebuild at the end of an operation.
 * It can be set in
 * - update_partition_sd_lb()
 * - update_cpumasks_hier()
 * - cpuset_update_flag()
 * - cpuset_hotplug_update_tasks()
 * - cpuset_handle_hotplug()
 *
 * It is cleared by rebuild_sched_domains_locked().
 *
 * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
 *
 * Note that update_relax_domain_level() in cpuset-v1.c can still call
 * rebuild_sched_domains_locked() directly without using this flag.
 */
static bool force_sd_rebuild;
102
103 /*
104 * Partition root states:
105 *
106 * 0 - member (not a partition root)
107 * 1 - partition root
108 * 2 - partition root without load balancing (isolated)
109 * -1 - invalid partition root
110 * -2 - invalid isolated partition root
111 *
 * There are 2 types of partitions - local or remote. Local partitions are
 * those whose parents are partition roots themselves. Setting
 * cpuset.cpus.exclusive is optional when setting up local partitions.
 * Remote partitions are those whose parents are not partition roots. Passing
 * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor
 * nodes is mandatory when creating a remote partition.
118 *
119 * For simplicity, a local partition can be created under a local or remote
120 * partition but a remote partition cannot have any partition root in its
121 * ancestor chain except the cgroup root.
122 *
123 * A valid partition can be formed by setting exclusive_cpus or cpus_allowed
124 * if exclusive_cpus is not set. In the case of partition with empty
125 * exclusive_cpus, all the conflicting exclusive CPUs specified in the
126 * following cpumasks of sibling cpusets will be removed from its
127 * cpus_allowed in determining its effective_xcpus.
128 * - effective_xcpus
129 * - exclusive_cpus
130 *
131 * The "cpuset.cpus.exclusive" control file should be used for setting up
132 * partition if the users want to get as many CPUs as possible.
133 */
#define PRS_MEMBER		0
#define PRS_ROOT		1
#define PRS_ISOLATED		2
/*
 * Each invalid state is the negation of its valid counterpart (see
 * make_partition_invalid()). The negative values are parenthesized so the
 * macros expand as a single operand in any expression.
 */
#define PRS_INVALID_ROOT	(-1)
#define PRS_INVALID_ISOLATED	(-2)
139
140 /*
141 * Temporary cpumasks for working with partitions that are passed among
142 * functions to avoid memory allocation in inner functions.
143 */
144 struct tmpmasks {
145 cpumask_var_t addmask, delmask; /* For partition root */
146 cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
147 };
148
inc_dl_tasks_cs(struct task_struct * p)149 void inc_dl_tasks_cs(struct task_struct *p)
150 {
151 struct cpuset *cs = task_cs(p);
152
153 cs->nr_deadline_tasks++;
154 }
155
dec_dl_tasks_cs(struct task_struct * p)156 void dec_dl_tasks_cs(struct task_struct *p)
157 {
158 struct cpuset *cs = task_cs(p);
159
160 cs->nr_deadline_tasks--;
161 }
162
is_partition_valid(const struct cpuset * cs)163 static inline bool is_partition_valid(const struct cpuset *cs)
164 {
165 return cs->partition_root_state > 0;
166 }
167
is_partition_invalid(const struct cpuset * cs)168 static inline bool is_partition_invalid(const struct cpuset *cs)
169 {
170 return cs->partition_root_state < 0;
171 }
172
cs_is_member(const struct cpuset * cs)173 static inline bool cs_is_member(const struct cpuset *cs)
174 {
175 return cs->partition_root_state == PRS_MEMBER;
176 }
177
178 /*
179 * Callers should hold callback_lock to modify partition_root_state.
180 */
make_partition_invalid(struct cpuset * cs)181 static inline void make_partition_invalid(struct cpuset *cs)
182 {
183 if (cs->partition_root_state > 0)
184 cs->partition_root_state = -cs->partition_root_state;
185 }
186
187 /*
188 * Send notification event of whenever partition_root_state changes.
189 */
notify_partition_change(struct cpuset * cs,int old_prs)190 static inline void notify_partition_change(struct cpuset *cs, int old_prs)
191 {
192 if (old_prs == cs->partition_root_state)
193 return;
194 cgroup_file_notify(&cs->partition_file);
195
196 /* Reset prs_err if not invalid */
197 if (is_partition_valid(cs))
198 WRITE_ONCE(cs->prs_err, PERR_NONE);
199 }
200
201 /*
202 * The top_cpuset is always synchronized to cpu_active_mask and we should avoid
203 * using cpu_online_mask as much as possible. An active CPU is always an online
204 * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ
205 * during hotplug operations. A CPU is marked active at the last stage of CPU
206 * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code
207 * will be called to update the sched domains so that the scheduler can move
208 * a normal task to a newly active CPU or remove tasks away from a newly
209 * inactivated CPU. The online bit is set much earlier in the CPU bringup
210 * process and cleared much later in CPU teardown.
211 *
212 * If cpu_online_mask is used while a hotunplug operation is happening in
213 * parallel, we may leave an offline CPU in cpu_allowed or some other masks.
214 */
215 struct cpuset top_cpuset = {
216 .flags = BIT(CS_CPU_EXCLUSIVE) |
217 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
218 .partition_root_state = PRS_ROOT,
219 };
220
221 /*
222 * There are two global locks guarding cpuset structures - cpuset_mutex and
223 * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
224 * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
225 * structures. Note that cpuset_mutex needs to be a mutex as it is used in
226 * paths that rely on priority inheritance (e.g. scheduler - on RT) for
227 * correctness.
228 *
229 * A task must hold both locks to modify cpusets. If a task holds
230 * cpuset_mutex, it blocks others, ensuring that it is the only task able to
231 * also acquire callback_lock and be able to modify cpusets. It can perform
232 * various checks on the cpuset structure first, knowing nothing will change.
233 * It can also allocate memory while just holding cpuset_mutex. While it is
234 * performing these checks, various callback routines can briefly acquire
235 * callback_lock to query cpusets. Once it is ready to make the changes, it
236 * takes callback_lock, blocking everyone else.
237 *
238 * Calls to the kernel memory allocator can not be made while holding
239 * callback_lock, as that would risk double tripping on callback_lock
240 * from one of the callbacks into the cpuset code from within
241 * __alloc_pages().
242 *
243 * If a task is only holding callback_lock, then it has read-only
244 * access to cpusets.
245 *
246 * Now, the task_struct fields mems_allowed and mempolicy may be changed
247 * by other task, we use alloc_lock in the task_struct fields to protect
248 * them.
249 *
250 * The cpuset_common_seq_show() handlers only hold callback_lock across
251 * small pieces of code, such as when reading out possibly multi-word
252 * cpumasks and nodemasks.
253 */
254
/* The global mutex serializing changes to cpuset structures. */
static DEFINE_MUTEX(cpuset_mutex);

/**
 * cpuset_lock - Acquire the global cpuset mutex
 *
 * This locks the global cpuset mutex to prevent modifications to the cpuset
 * hierarchy and configurations. Holding it alone is not enough to make a
 * modification. Pair with cpuset_unlock().
 */
void cpuset_lock(void)
{
	mutex_lock(&cpuset_mutex);
}
267
/* Release the global cpuset mutex taken by cpuset_lock(). */
void cpuset_unlock(void)
{
	mutex_unlock(&cpuset_mutex);
}
272
/* Lockdep assertion that the caller currently holds cpuset_mutex. */
void lockdep_assert_cpuset_lock_held(void)
{
	lockdep_assert_held(&cpuset_mutex);
}
277
/**
 * cpuset_full_lock - Acquire full protection for cpuset modification
 *
 * Takes both the CPU hotplug read lock (cpus_read_lock()) and the cpuset
 * mutex, in that order, to safely modify cpuset data. Release with
 * cpuset_full_unlock().
 */
void cpuset_full_lock(void)
{
	cpus_read_lock();
	mutex_lock(&cpuset_mutex);
}
289
/*
 * Undo cpuset_full_lock(): drop cpuset_mutex first, then the CPU hotplug
 * read lock (reverse of the acquisition order).
 */
void cpuset_full_unlock(void)
{
	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();
}
295
#ifdef CONFIG_LOCKDEP
/* Report whether lockdep considers cpuset_mutex held (CONFIG_LOCKDEP only). */
bool lockdep_is_cpuset_held(void)
{
	return lockdep_is_held(&cpuset_mutex);
}
#endif
302
/* Spinlock taken briefly to query cpusets and, with cpuset_mutex, to modify them. */
static DEFINE_SPINLOCK(callback_lock);

/* Acquire callback_lock using spin_lock_irq(). */
void cpuset_callback_lock_irq(void)
{
	spin_lock_irq(&callback_lock);
}
309
/* Release callback_lock using spin_unlock_irq(). */
void cpuset_callback_unlock_irq(void)
{
	spin_unlock_irq(&callback_lock);
}
314
/* NOTE(review): presumably the workqueue used for mm migration work; not used in this part of the file. */
static struct workqueue_struct *cpuset_migrate_mm_wq;

/* Woken by dec_attach_in_progress_locked() once a cpuset's attach_in_progress drops to zero. */
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
318
/*
 * Enable cpusets_insane_config_key and log a message the first time a
 * nodemask made up of movable-only nodes is seen.
 */
static inline void check_insane_mems_config(nodemask_t *nodes)
{
	/* Already flagged, or this nodemask is not movable-only */
	if (cpusets_insane_config() || !movable_only_nodes(nodes))
		return;

	static_branch_enable_cpuslocked(&cpusets_insane_config_key);
	pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
		"Cpuset allocations might fail even with a lot of memory available.\n",
		nodemask_pr_args(nodes));
}
329
330 /*
331 * decrease cs->attach_in_progress.
332 * wake_up cpuset_attach_wq if cs->attach_in_progress==0.
333 */
dec_attach_in_progress_locked(struct cpuset * cs)334 static inline void dec_attach_in_progress_locked(struct cpuset *cs)
335 {
336 lockdep_assert_cpuset_lock_held();
337
338 cs->attach_in_progress--;
339 if (!cs->attach_in_progress)
340 wake_up(&cpuset_attach_wq);
341 }
342
/* Same as dec_attach_in_progress_locked(), taking cpuset_mutex around the update. */
static inline void dec_attach_in_progress(struct cpuset *cs)
{
	cpuset_lock();
	dec_attach_in_progress_locked(cs);
	cpuset_unlock();
}
349
cpuset_v2(void)350 static inline bool cpuset_v2(void)
351 {
352 return !IS_ENABLED(CONFIG_CPUSETS_V1) ||
353 cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
354 }
355
356 /*
357 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
358 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
359 * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
360 * With v2 behavior, "cpus" and "mems" are always what the users have
361 * requested and won't be changed by hotplug events. Only the effective
362 * cpus or mems will be affected.
363 */
is_in_v2_mode(void)364 static inline bool is_in_v2_mode(void)
365 {
366 return cpuset_v2() ||
367 (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
368 }
369
370 /**
371 * partition_is_populated - check if partition has tasks
372 * @cs: partition root to be checked
373 * @excluded_child: a child cpuset to be excluded in task checking
374 * Return: true if there are tasks, false otherwise
375 *
376 * @cs should be a valid partition root or going to become a partition root.
377 * @excluded_child should be non-NULL when this cpuset is going to become a
378 * partition itself.
379 *
380 * Note that a remote partition is not allowed underneath a valid local
381 * or remote partition. So if a non-partition root child is populated,
382 * the whole partition is considered populated.
383 */
partition_is_populated(struct cpuset * cs,struct cpuset * excluded_child)384 static inline bool partition_is_populated(struct cpuset *cs,
385 struct cpuset *excluded_child)
386 {
387 struct cpuset *cp;
388 struct cgroup_subsys_state *pos_css;
389
390 /*
391 * We cannot call cs_is_populated(cs) directly, as
392 * nr_populated_domain_children may include populated
393 * csets from descendants that are partitions.
394 */
395 if (cs->css.cgroup->nr_populated_csets ||
396 cs->attach_in_progress)
397 return true;
398
399 rcu_read_lock();
400 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
401 if (cp == cs || cp == excluded_child)
402 continue;
403
404 if (is_partition_valid(cp)) {
405 pos_css = css_rightmost_descendant(pos_css);
406 continue;
407 }
408
409 if (cpuset_is_populated(cp)) {
410 rcu_read_unlock();
411 return true;
412 }
413 }
414 rcu_read_unlock();
415 return false;
416 }
417
418 /*
 * Return in pmask the portion of a task's cpuset's cpus_allowed that
420 * are online and are capable of running the task. If none are found,
421 * walk up the cpuset hierarchy until we find one that does have some
422 * appropriate cpus.
423 *
424 * One way or another, we guarantee to return some non-empty subset
425 * of cpu_active_mask.
426 *
427 * Call with callback_lock or cpuset_mutex held.
428 */
guarantee_active_cpus(struct task_struct * tsk,struct cpumask * pmask)429 static void guarantee_active_cpus(struct task_struct *tsk,
430 struct cpumask *pmask)
431 {
432 const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
433 struct cpuset *cs;
434
435 if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
436 cpumask_copy(pmask, cpu_active_mask);
437
438 rcu_read_lock();
439 cs = task_cs(tsk);
440
441 while (!cpumask_intersects(cs->effective_cpus, pmask))
442 cs = parent_cs(cs);
443
444 cpumask_and(pmask, pmask, cs->effective_cpus);
445 rcu_read_unlock();
446 }
447
448 /*
 * Return in *pmask the portion of a cpuset's mems_allowed that
450 * are online, with memory. If none are online with memory, walk
451 * up the cpuset hierarchy until we find one that does have some
452 * online mems. The top cpuset always has some mems online.
453 *
454 * One way or another, we guarantee to return some non-empty subset
455 * of node_states[N_MEMORY].
456 *
457 * Call with callback_lock or cpuset_mutex held.
458 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	for (;;) {
		/* *pmask = effective_mems & N_MEMORY; stop once it is non-empty */
		if (nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]))
			break;
		cs = parent_cs(cs);
	}
}
464
465 /**
466 * alloc_cpumasks - Allocate an array of cpumask variables
467 * @pmasks: Pointer to array of cpumask_var_t pointers
468 * @size: Number of cpumasks to allocate
469 * Return: 0 if successful, -ENOMEM otherwise.
470 *
471 * Allocates @size cpumasks and initializes them to empty. Returns 0 on
472 * success, -ENOMEM on allocation failure. On failure, any previously
473 * allocated cpumasks are freed.
474 */
static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size)
{
	/* Same type as @size, so the loop bound is not a signed/unsigned compare */
	u32 nr;

	for (nr = 0; nr < size; nr++) {
		if (!zalloc_cpumask_var(pmasks[nr], GFP_KERNEL))
			goto unwind;
	}
	return 0;

unwind:
	/* @nr is the slot that failed; release the ones before it, newest first */
	while (nr--)
		free_cpumask_var(*pmasks[nr]);
	return -ENOMEM;
}
488
489 /**
490 * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations.
491 * @tmp: Pointer to tmpmasks structure to populate
492 * Return: 0 on success, -ENOMEM on allocation failure
493 */
alloc_tmpmasks(struct tmpmasks * tmp)494 static inline int alloc_tmpmasks(struct tmpmasks *tmp)
495 {
496 /*
497 * Array of pointers to the three cpumask_var_t fields in tmpmasks.
498 * Note: Array size must match actual number of masks (3)
499 */
500 cpumask_var_t *pmask[3] = {
501 &tmp->new_cpus,
502 &tmp->addmask,
503 &tmp->delmask
504 };
505
506 return alloc_cpumasks(pmask, ARRAY_SIZE(pmask));
507 }
508
509 /**
510 * free_tmpmasks - free cpumasks in a tmpmasks structure
511 * @tmp: the tmpmasks structure pointer
512 */
free_tmpmasks(struct tmpmasks * tmp)513 static inline void free_tmpmasks(struct tmpmasks *tmp)
514 {
515 if (!tmp)
516 return;
517
518 free_cpumask_var(tmp->new_cpus);
519 free_cpumask_var(tmp->addmask);
520 free_cpumask_var(tmp->delmask);
521 }
522
523 /**
524 * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset
525 * @cs: Source cpuset to duplicate (NULL for a fresh allocation)
526 *
527 * Creates a new cpuset by either:
528 * 1. Duplicating an existing cpuset (if @cs is non-NULL), or
529 * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL)
530 *
531 * Return: Pointer to newly allocated cpuset on success, NULL on failure
532 */
dup_or_alloc_cpuset(struct cpuset * cs)533 static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
534 {
535 struct cpuset *trial;
536
537 /* Allocate base structure */
538 trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) :
539 kzalloc_obj(*cs);
540 if (!trial)
541 return NULL;
542
543 /* Setup cpumask pointer array */
544 cpumask_var_t *pmask[4] = {
545 &trial->cpus_allowed,
546 &trial->effective_cpus,
547 &trial->effective_xcpus,
548 &trial->exclusive_cpus
549 };
550
551 if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) {
552 kfree(trial);
553 return NULL;
554 }
555
556 /* Copy masks if duplicating */
557 if (cs) {
558 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
559 cpumask_copy(trial->effective_cpus, cs->effective_cpus);
560 cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
561 cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
562 }
563
564 return trial;
565 }
566
567 /**
568 * free_cpuset - free the cpuset
569 * @cs: the cpuset to be freed
570 */
free_cpuset(struct cpuset * cs)571 static inline void free_cpuset(struct cpuset *cs)
572 {
573 free_cpumask_var(cs->cpus_allowed);
574 free_cpumask_var(cs->effective_cpus);
575 free_cpumask_var(cs->effective_xcpus);
576 free_cpumask_var(cs->exclusive_cpus);
577 kfree(cs);
578 }
579
580 /* Return user specified exclusive CPUs */
user_xcpus(struct cpuset * cs)581 static inline struct cpumask *user_xcpus(struct cpuset *cs)
582 {
583 return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
584 : cs->exclusive_cpus;
585 }
586
xcpus_empty(struct cpuset * cs)587 static inline bool xcpus_empty(struct cpuset *cs)
588 {
589 return cpumask_empty(cs->cpus_allowed) &&
590 cpumask_empty(cs->exclusive_cpus);
591 }
592
593 /*
594 * cpusets_are_exclusive() - check if two cpusets are exclusive
595 *
596 * Return true if exclusive, false if not
597 */
cpusets_are_exclusive(struct cpuset * cs1,struct cpuset * cs2)598 static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
599 {
600 struct cpumask *xcpus1 = user_xcpus(cs1);
601 struct cpumask *xcpus2 = user_xcpus(cs2);
602
603 if (cpumask_intersects(xcpus1, xcpus2))
604 return false;
605 return true;
606 }
607
608 /**
609 * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts
610 * @trial: the trial cpuset to be checked
611 * @sibling: a sibling cpuset to be checked against
612 * @xcpus_changed: set if exclusive_cpus has been set
613 *
614 * Returns: true if CPU exclusivity conflict exists, false otherwise
615 *
616 * Conflict detection rules:
617 * o cgroup v1
618 * See cpuset1_cpus_excl_conflict()
619 * o cgroup v2
620 * - The exclusive_cpus values cannot overlap.
621 * - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed.
622 */
static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling,
				      bool xcpus_changed)
{
	const struct cpumask *new_xcpus = trial->exclusive_cpus;

	/* cgroup v1 has its own conflict rules */
	if (!cpuset_v2())
		return cpuset1_cpus_excl_conflict(trial, sibling);

	/*
	 * A changed exclusive_cpus must not cover all of a sibling's
	 * non-empty cpus_allowed.
	 */
	if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) &&
	    cpumask_subset(sibling->cpus_allowed, new_xcpus))
		return true;

	/* exclusive_cpus of the two cpusets must not share any CPU */
	return cpumask_intersects(new_xcpus, sibling->exclusive_cpus);
}
637
mems_excl_conflict(struct cpuset * cs1,struct cpuset * cs2)638 static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
639 {
640 if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2)))
641 return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
642 return false;
643 }
644
645 /*
646 * validate_change() - Used to validate that any proposed cpuset change
647 * follows the structural rules for cpusets.
648 *
649 * If we replaced the flag and mask values of the current cpuset
650 * (cur) with those values in the trial cpuset (trial), would
651 * our various subset and exclusive rules still be valid? Presumes
652 * cpuset_mutex held.
653 *
654 * 'cur' is the address of an actual, in-use cpuset. Operations
655 * such as list traversal that depend on the actual address of the
656 * cpuset in the list must use cur below, not trial.
657 *
658 * 'trial' is the address of bulk structure copy of cur, with
659 * perhaps one or more of the fields cpus_allowed, mems_allowed,
660 * or flags changed to new, trial values.
661 *
662 * Return 0 if valid, -errno if not.
663 */
664
validate_change(struct cpuset * cur,struct cpuset * trial)665 static int validate_change(struct cpuset *cur, struct cpuset *trial)
666 {
667 struct cgroup_subsys_state *css;
668 struct cpuset *c, *par;
669 bool xcpus_changed;
670 int ret = 0;
671
672 rcu_read_lock();
673
674 if (!is_in_v2_mode())
675 ret = cpuset1_validate_change(cur, trial);
676 if (ret)
677 goto out;
678
679 /* Remaining checks don't apply to root cpuset */
680 if (cur == &top_cpuset)
681 goto out;
682
683 par = parent_cs(cur);
684
685 /*
686 * We can't shrink if we won't have enough room for SCHED_DEADLINE
687 * tasks. This check is not done when scheduling is disabled as the
688 * users should know what they are doing.
689 *
690 * For v1, effective_cpus == cpus_allowed & user_xcpus() returns
691 * cpus_allowed.
692 *
693 * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only
694 * for non-isolated partition root. At this point, the target
695 * effective_cpus isn't computed yet. user_xcpus() is the best
696 * approximation.
697 *
698 * TBD: May need to precompute the real effective_cpus here in case
699 * incorrect scheduling of SCHED_DEADLINE tasks in a partition
700 * becomes an issue.
701 */
702 ret = -EBUSY;
703 if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&
704 !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))
705 goto out;
706
707 /*
708 * If either I or some sibling (!= me) is exclusive, we can't
709 * overlap. exclusive_cpus cannot overlap with each other if set.
710 */
711 ret = -EINVAL;
712 xcpus_changed = !cpumask_equal(cur->exclusive_cpus, trial->exclusive_cpus);
713 cpuset_for_each_child(c, css, par) {
714 if (c == cur)
715 continue;
716 if (cpus_excl_conflict(trial, c, xcpus_changed))
717 goto out;
718 if (mems_excl_conflict(trial, c))
719 goto out;
720 }
721
722 ret = 0;
723 out:
724 rcu_read_unlock();
725 return ret;
726 }
727
728 #ifdef CONFIG_SMP
729
730 /*
731 * generate_sched_domains()
732 *
733 * This function builds a partial partition of the systems CPUs
734 * A 'partial partition' is a set of non-overlapping subsets whose
735 * union is a subset of that set.
736 * The output of this function needs to be passed to kernel/sched/core.c
737 * partition_sched_domains() routine, which will rebuild the scheduler's
738 * load balancing domains (sched domains) as specified by that partial
739 * partition.
740 *
741 * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
742 * for a background explanation of this.
743 *
744 * Does not return errors, on the theory that the callers of this
745 * routine would rather not worry about failures to rebuild sched
746 * domains when operating in the severe memory shortage situations
747 * that could cause allocation failures below.
748 *
749 * Must be called with cpuset_mutex held.
750 *
751 * The three key local variables below are:
752 * cp - cpuset pointer, used (together with pos_css) to perform a
753 * top-down scan of all cpusets. For our purposes, rebuilding
754 * the schedulers sched domains, we can ignore !is_sched_load_
755 * balance cpusets.
756 * csa - (for CpuSet Array) Array of pointers to all the cpusets
757 * that need to be load balanced, for convenient iterative
758 * access by the subsequent code that finds the best partition,
759 * i.e the set of domains (subsets) of CPUs such that the
760 * cpus_allowed of every cpuset marked is_sched_load_balance
761 * is a subset of one of these domains, while there are as
762 * many such domains as possible, each as small as possible.
763 * doms - Conversion of 'csa' to an array of cpumasks, for passing to
764 * the kernel/sched/core.c routine partition_sched_domains() in a
765 * convenient format, that can be easily compared to the prior
766 * value to determine what partition elements (sched domains)
767 * were changed (added or removed.)
768 */
static int generate_sched_domains(cpumask_var_t **domains,
				  struct sched_domain_attr **attributes)
{
	struct cgroup_subsys_state *pos_css;
	struct sched_domain_attr *dattr = NULL;	/* attributes for custom domains */
	struct cpuset **csa = NULL;		/* array of partition root ptrs */
	cpumask_var_t *doms = NULL;		/* resulting sched domains */
	struct cpuset *cp;			/* top-down scan of cpusets */
	int ndoms = 0;				/* number of sched domains in result */
	int i, j;

	if (!cpuset_v2())
		return cpuset1_generate_sched_domains(domains, attributes);

	/* Special case for the 99% of systems with one, full, sched domain */
	if (cpumask_empty(subpartitions_cpus)) {
		ndoms = 1;
		/* csa stays NULL; the fill loop below handles that case */
		goto generate_doms;
	}

	csa = kmalloc_objs(cp, nr_cpusets());
	if (!csa)
		goto done;

	/* Find how many partitions there are and cache them in csa[] */
	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		/*
		 * Only valid partition roots that are not isolated and have
		 * non-empty effective_cpus are saved into csa[].
		 */
		if (cp->partition_root_state == PRS_ROOT &&
		    !cpumask_empty(cp->effective_cpus))
			csa[ndoms++] = cp;

		/*
		 * Skip @cp's subtree if it is not a partition root and has
		 * no exclusive CPUs to be granted to child cpusets.
		 */
		if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
			pos_css = css_rightmost_descendant(pos_css);
	}
	rcu_read_unlock();

	/*
	 * Cgroup v2 shouldn't pass down overlapping partition root cpusets;
	 * warn once if any pair overlaps.
	 */
	for (i = 0; i < ndoms; i++) {
		for (j = i + 1; j < ndoms; j++)
			WARN_ON_ONCE(cpusets_overlap(csa[i], csa[j]));
	}

generate_doms:
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc_objs(struct sched_domain_attr, ndoms);

	/*
	 * Cgroup v2 doesn't support domain attributes, just set all of them
	 * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
	 * subset of HK_TYPE_DOMAIN housekeeping CPUs.
	 */
	for (i = 0; i < ndoms; i++) {
		/*
		 * The top cpuset may contain some boot time isolated
		 * CPUs that need to be excluded from the sched domain.
		 */
		bool from_top = !csa || csa[i] == &top_cpuset;

		if (from_top)
			cpumask_and(doms[i], top_cpuset.effective_cpus,
				    housekeeping_cpumask(HK_TYPE_DOMAIN));
		else
			cpumask_copy(doms[i], csa[i]->effective_cpus);

		if (dattr)
			dattr[i] = SD_ATTR_INIT;
	}

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (!doms)
		ndoms = 1;

	*domains = doms;
	*attributes = dattr;
	return ndoms;
}
873
dl_update_tasks_root_domain(struct cpuset * cs)874 static void dl_update_tasks_root_domain(struct cpuset *cs)
875 {
876 struct css_task_iter it;
877 struct task_struct *task;
878
879 if (cs->nr_deadline_tasks == 0)
880 return;
881
882 css_task_iter_start(&cs->css, 0, &it);
883
884 while ((task = css_task_iter_next(&it)))
885 dl_add_task_root_domain(task);
886
887 css_task_iter_end(&it);
888 }
889
dl_rebuild_rd_accounting(void)890 void dl_rebuild_rd_accounting(void)
891 {
892 struct cpuset *cs = NULL;
893 struct cgroup_subsys_state *pos_css;
894 int cpu;
895 u64 cookie = ++dl_cookie;
896
897 lockdep_assert_cpuset_lock_held();
898 lockdep_assert_cpus_held();
899 lockdep_assert_held(&sched_domains_mutex);
900
901 rcu_read_lock();
902
903 for_each_possible_cpu(cpu) {
904 if (dl_bw_visited(cpu, cookie))
905 continue;
906
907 dl_clear_root_domain_cpu(cpu);
908 }
909
910 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
911
912 if (cpumask_empty(cs->effective_cpus)) {
913 pos_css = css_rightmost_descendant(pos_css);
914 continue;
915 }
916
917 css_get(&cs->css);
918
919 rcu_read_unlock();
920
921 dl_update_tasks_root_domain(cs);
922
923 rcu_read_lock();
924 css_put(&cs->css);
925 }
926 rcu_read_unlock();
927 }
928
929 /*
930 * Rebuild scheduler domains.
931 *
932 * If the flag 'sched_load_balance' of any cpuset with non-empty
933 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
934 * which has that flag enabled, or if any cpuset with a non-empty
935 * 'cpus' is removed, then call this routine to rebuild the
936 * scheduler's dynamic sched domains.
937 *
938 * Call with cpuset_mutex held. Takes cpus_read_lock().
939 */
rebuild_sched_domains_locked(void)940 void rebuild_sched_domains_locked(void)
941 {
942 struct sched_domain_attr *attr;
943 cpumask_var_t *doms;
944 int ndoms;
945 int i;
946
947 lockdep_assert_cpus_held();
948 lockdep_assert_cpuset_lock_held();
949 force_sd_rebuild = false;
950
951 /* Generate domain masks and attrs */
952 ndoms = generate_sched_domains(&doms, &attr);
953
954 /*
955 * cpuset_hotplug_workfn is invoked synchronously now, thus this
956 * function should not race with CPU hotplug. And the effective CPUs
957 * must not include any offline CPUs. Passing an offline CPU in the
958 * doms to partition_sched_domains() will trigger a kernel panic.
959 *
960 * We perform a final check here: if the doms contains any
961 * offline CPUs, a warning is emitted and we return directly to
962 * prevent the panic.
963 */
964 for (i = 0; i < ndoms; ++i) {
965 if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask)))
966 return;
967 }
968
969 /* Have scheduler rebuild the domains */
970 partition_sched_domains(ndoms, doms, attr);
971 }
972 #else /* !CONFIG_SMP */
/* Variant built when CONFIG_SMP is not set: intentionally empty. */
void rebuild_sched_domains_locked(void)
{
	/* nothing to do */
}
976 #endif /* CONFIG_SMP */
977
rebuild_sched_domains_cpuslocked(void)978 static void rebuild_sched_domains_cpuslocked(void)
979 {
980 mutex_lock(&cpuset_mutex);
981 rebuild_sched_domains_locked();
982 mutex_unlock(&cpuset_mutex);
983 }
984
/*
 * Rebuild the scheduler domains from an unlocked context: takes
 * cpus_read_lock() and lets rebuild_sched_domains_cpuslocked() take
 * cpuset_mutex and do the rebuild.
 */
void rebuild_sched_domains(void)
{
	cpus_read_lock();

	rebuild_sched_domains_cpuslocked();

	cpus_read_unlock();
}
991
cpuset_reset_sched_domains(void)992 void cpuset_reset_sched_domains(void)
993 {
994 mutex_lock(&cpuset_mutex);
995 partition_sched_domains(1, NULL, NULL);
996 mutex_unlock(&cpuset_mutex);
997 }
998
999 /**
1000 * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
1001 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
1002 * @new_cpus: the temp variable for the new effective_cpus mask
1003 *
1004 * Iterate through each task of @cs updating its cpus_allowed to the
1005 * effective cpuset's. As this function is called with cpuset_mutex held,
1006 * cpuset membership stays stable.
1007 *
1008 * For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus
1009 * to make sure all offline CPUs are also included as hotplug code won't
1010 * update cpumasks for tasks in top_cpuset.
1011 *
1012 * As task_cpu_possible_mask() can be task dependent in arm64, we have to
1013 * do cpu masking per task instead of doing it once for all.
1014 */
cpuset_update_tasks_cpumask(struct cpuset * cs,struct cpumask * new_cpus)1015 void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
1016 {
1017 struct css_task_iter it;
1018 struct task_struct *task;
1019 bool top_cs = cs == &top_cpuset;
1020
1021 css_task_iter_start(&cs->css, 0, &it);
1022 while ((task = css_task_iter_next(&it))) {
1023 const struct cpumask *possible_mask = task_cpu_possible_mask(task);
1024
1025 if (top_cs) {
1026 /*
1027 * PF_KTHREAD tasks are handled by housekeeping.
1028 * PF_NO_SETAFFINITY tasks are ignored.
1029 */
1030 if (task->flags & (PF_KTHREAD | PF_NO_SETAFFINITY))
1031 continue;
1032 cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
1033 } else {
1034 cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
1035 }
1036 set_cpus_allowed_ptr(task, new_cpus);
1037 }
1038 css_task_iter_end(&it);
1039 }
1040
1041 /**
1042 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
1043 * @new_cpus: the temp variable for the new effective_cpus mask
1044 * @cs: the cpuset the need to recompute the new effective_cpus mask
1045 * @parent: the parent cpuset
1046 *
1047 * The result is valid only if the given cpuset isn't a partition root.
1048 */
compute_effective_cpumask(struct cpumask * new_cpus,struct cpuset * cs,struct cpuset * parent)1049 static void compute_effective_cpumask(struct cpumask *new_cpus,
1050 struct cpuset *cs, struct cpuset *parent)
1051 {
1052 cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
1053 }
1054
/*
 * Commands accepted by update_parent_effective_cpumask()
 */
enum partition_cmd {
	/* Enable partition root */
	partcmd_enable,
	/* Enable isolated partition root */
	partcmd_enablei,
	/* Disable partition root */
	partcmd_disable,
	/* Update parent's effective_cpus */
	partcmd_update,
	/* Make partition invalid */
	partcmd_invalidate,
};
1065
1066 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
1067 struct tmpmasks *tmp);
1068
1069 /*
1070 * Update partition exclusive flag
1071 *
1072 * Return: 0 if successful, an error code otherwise
1073 */
update_partition_exclusive_flag(struct cpuset * cs,int new_prs)1074 static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs)
1075 {
1076 bool exclusive = (new_prs > PRS_MEMBER);
1077
1078 if (exclusive && !is_cpu_exclusive(cs)) {
1079 if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1))
1080 return PERR_NOTEXCL;
1081 } else if (!exclusive && is_cpu_exclusive(cs)) {
1082 /* Turning off CS_CPU_EXCLUSIVE will not return error */
1083 cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0);
1084 }
1085 return 0;
1086 }
1087
1088 /*
1089 * Update partition load balance flag and/or rebuild sched domain
1090 *
1091 * Changing load balance flag will automatically call
1092 * rebuild_sched_domains_locked().
1093 * This function is for cgroup v2 only.
1094 */
update_partition_sd_lb(struct cpuset * cs,int old_prs)1095 static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
1096 {
1097 int new_prs = cs->partition_root_state;
1098 bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
1099 bool new_lb;
1100
1101 /*
1102 * If cs is not a valid partition root, the load balance state
1103 * will follow its parent.
1104 */
1105 if (new_prs > 0) {
1106 new_lb = (new_prs != PRS_ISOLATED);
1107 } else {
1108 new_lb = is_sched_load_balance(parent_cs(cs));
1109 }
1110 if (new_lb != !!is_sched_load_balance(cs)) {
1111 rebuild_domains = true;
1112 if (new_lb)
1113 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1114 else
1115 clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1116 }
1117
1118 if (rebuild_domains)
1119 cpuset_force_rebuild();
1120 }
1121
1122 /*
1123 * tasks_nocpu_error - Return true if tasks will have no effective_cpus
1124 */
tasks_nocpu_error(struct cpuset * parent,struct cpuset * cs,struct cpumask * xcpus)1125 static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
1126 struct cpumask *xcpus)
1127 {
1128 /*
1129 * A populated partition (cs or parent) can't have empty effective_cpus
1130 */
1131 return (cpumask_subset(parent->effective_cpus, xcpus) &&
1132 partition_is_populated(parent, cs)) ||
1133 (!cpumask_intersects(xcpus, cpu_active_mask) &&
1134 partition_is_populated(cs, NULL));
1135 }
1136
reset_partition_data(struct cpuset * cs)1137 static void reset_partition_data(struct cpuset *cs)
1138 {
1139 struct cpuset *parent = parent_cs(cs);
1140
1141 if (!cpuset_v2())
1142 return;
1143
1144 lockdep_assert_held(&callback_lock);
1145
1146 if (cpumask_empty(cs->exclusive_cpus)) {
1147 cpumask_clear(cs->effective_xcpus);
1148 if (is_cpu_exclusive(cs))
1149 clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
1150 }
1151 if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed))
1152 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1153 }
1154
1155 /*
1156 * isolated_cpus_update - Update the isolated_cpus mask
1157 * @old_prs: old partition_root_state
1158 * @new_prs: new partition_root_state
1159 * @xcpus: exclusive CPUs with state change
1160 */
isolated_cpus_update(int old_prs,int new_prs,struct cpumask * xcpus)1161 static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus)
1162 {
1163 WARN_ON_ONCE(old_prs == new_prs);
1164 if (new_prs == PRS_ISOLATED)
1165 cpumask_or(isolated_cpus, isolated_cpus, xcpus);
1166 else
1167 cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
1168
1169 isolated_cpus_updating = true;
1170 }
1171
1172 /*
1173 * partition_xcpus_add - Add new exclusive CPUs to partition
1174 * @new_prs: new partition_root_state
1175 * @parent: parent cpuset
1176 * @xcpus: exclusive CPUs to be added
1177 *
1178 * Remote partition if parent == NULL
1179 */
partition_xcpus_add(int new_prs,struct cpuset * parent,struct cpumask * xcpus)1180 static void partition_xcpus_add(int new_prs, struct cpuset *parent,
1181 struct cpumask *xcpus)
1182 {
1183 WARN_ON_ONCE(new_prs < 0);
1184 lockdep_assert_held(&callback_lock);
1185 if (!parent)
1186 parent = &top_cpuset;
1187
1188
1189 if (parent == &top_cpuset)
1190 cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
1191
1192 if (new_prs != parent->partition_root_state)
1193 isolated_cpus_update(parent->partition_root_state, new_prs,
1194 xcpus);
1195
1196 cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
1197 }
1198
1199 /*
1200 * partition_xcpus_del - Remove exclusive CPUs from partition
1201 * @old_prs: old partition_root_state
1202 * @parent: parent cpuset
1203 * @xcpus: exclusive CPUs to be removed
1204 *
1205 * Remote partition if parent == NULL
1206 */
partition_xcpus_del(int old_prs,struct cpuset * parent,struct cpumask * xcpus)1207 static void partition_xcpus_del(int old_prs, struct cpuset *parent,
1208 struct cpumask *xcpus)
1209 {
1210 WARN_ON_ONCE(old_prs < 0);
1211 lockdep_assert_held(&callback_lock);
1212 if (!parent)
1213 parent = &top_cpuset;
1214
1215 if (parent == &top_cpuset)
1216 cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
1217
1218 if (old_prs != parent->partition_root_state)
1219 isolated_cpus_update(old_prs, parent->partition_root_state,
1220 xcpus);
1221
1222 cpumask_and(xcpus, xcpus, cpu_active_mask);
1223 cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
1224 }
1225
1226 /*
1227 * isolated_cpus_can_update - check for isolated & nohz_full conflicts
1228 * @add_cpus: cpu mask for cpus that are going to be isolated
1229 * @del_cpus: cpu mask for cpus that are no longer isolated, can be NULL
1230 * Return: false if there is conflict, true otherwise
1231 *
1232 * If nohz_full is enabled and we have isolated CPUs, their combination must
1233 * still leave housekeeping CPUs.
1234 *
1235 * TBD: Should consider merging this function into
1236 * prstate_housekeeping_conflict().
1237 */
isolated_cpus_can_update(struct cpumask * add_cpus,struct cpumask * del_cpus)1238 static bool isolated_cpus_can_update(struct cpumask *add_cpus,
1239 struct cpumask *del_cpus)
1240 {
1241 cpumask_var_t full_hk_cpus;
1242 int res = true;
1243
1244 if (!housekeeping_enabled(HK_TYPE_KERNEL_NOISE))
1245 return true;
1246
1247 if (del_cpus && cpumask_weight_and(del_cpus,
1248 housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)))
1249 return true;
1250
1251 if (!alloc_cpumask_var(&full_hk_cpus, GFP_KERNEL))
1252 return false;
1253
1254 cpumask_and(full_hk_cpus, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE),
1255 housekeeping_cpumask(HK_TYPE_DOMAIN));
1256 cpumask_andnot(full_hk_cpus, full_hk_cpus, isolated_cpus);
1257 cpumask_and(full_hk_cpus, full_hk_cpus, cpu_active_mask);
1258 if (!cpumask_weight_andnot(full_hk_cpus, add_cpus))
1259 res = false;
1260
1261 free_cpumask_var(full_hk_cpus);
1262 return res;
1263 }
1264
1265 /*
1266 * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
1267 * @prstate: partition root state to be checked
1268 * @new_cpus: cpu mask
1269 * Return: true if there is conflict, false otherwise
1270 *
1271 * CPUs outside of HK_TYPE_DOMAIN_BOOT, if defined, can only be used in an
1272 * isolated partition.
1273 */
prstate_housekeeping_conflict(int prstate,struct cpumask * new_cpus)1274 static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
1275 {
1276 if (!housekeeping_enabled(HK_TYPE_DOMAIN_BOOT))
1277 return false;
1278
1279 if ((prstate != PRS_ISOLATED) &&
1280 !cpumask_subset(new_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT)))
1281 return true;
1282
1283 return false;
1284 }
1285
1286 /*
1287 * update_isolation_cpumasks - Update external isolation related CPU masks
1288 *
1289 * The following external CPU masks will be updated if necessary:
1290 * - workqueue unbound cpumask
1291 */
update_isolation_cpumasks(void)1292 static void update_isolation_cpumasks(void)
1293 {
1294 int ret;
1295
1296 if (!isolated_cpus_updating)
1297 return;
1298
1299 ret = housekeeping_update(isolated_cpus);
1300 WARN_ON_ONCE(ret < 0);
1301
1302 isolated_cpus_updating = false;
1303 }
1304
1305 /**
1306 * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets
1307 * @parent: Parent cpuset containing all siblings
1308 * @cs: Current cpuset (will be skipped)
1309 * @excpus: exclusive effective CPU mask to modify
1310 *
1311 * This function ensures the given @excpus mask doesn't include any CPUs that
1312 * are exclusively allocated to sibling cpusets. It walks through all siblings
1313 * of @cs under @parent and removes their exclusive CPUs from @excpus.
1314 */
rm_siblings_excl_cpus(struct cpuset * parent,struct cpuset * cs,struct cpumask * excpus)1315 static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,
1316 struct cpumask *excpus)
1317 {
1318 struct cgroup_subsys_state *css;
1319 struct cpuset *sibling;
1320 int retval = 0;
1321
1322 if (cpumask_empty(excpus))
1323 return 0;
1324
1325 /*
1326 * Remove exclusive CPUs from siblings
1327 */
1328 rcu_read_lock();
1329 cpuset_for_each_child(sibling, css, parent) {
1330 struct cpumask *sibling_xcpus;
1331
1332 if (sibling == cs)
1333 continue;
1334
1335 /*
1336 * If exclusive_cpus is defined, effective_xcpus will always
1337 * be a subset. Otherwise, effective_xcpus will only be set
1338 * in a valid partition root.
1339 */
1340 sibling_xcpus = cpumask_empty(sibling->exclusive_cpus)
1341 ? sibling->effective_xcpus
1342 : sibling->exclusive_cpus;
1343
1344 if (cpumask_intersects(excpus, sibling_xcpus)) {
1345 cpumask_andnot(excpus, excpus, sibling_xcpus);
1346 retval++;
1347 }
1348 }
1349 rcu_read_unlock();
1350
1351 return retval;
1352 }
1353
1354 /*
1355 * compute_excpus - compute effective exclusive CPUs
1356 * @cs: cpuset
1357 * @xcpus: effective exclusive CPUs value to be set
1358 * Return: 0 if there is no sibling conflict, > 0 otherwise
1359 *
1360 * If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets
1361 * and exclude their exclusive_cpus or effective_xcpus as well.
1362 */
compute_excpus(struct cpuset * cs,struct cpumask * excpus)1363 static int compute_excpus(struct cpuset *cs, struct cpumask *excpus)
1364 {
1365 struct cpuset *parent = parent_cs(cs);
1366
1367 cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus);
1368
1369 if (!cpumask_empty(cs->exclusive_cpus))
1370 return 0;
1371
1372 return rm_siblings_excl_cpus(parent, cs, excpus);
1373 }
1374
1375 /*
1376 * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset
1377 * @trialcs: The trial cpuset containing the proposed new configuration
1378 * @cs: The original cpuset that the trial configuration is based on
1379 * Return: 0 if successful with no sibling conflict, >0 if a conflict is found
1380 *
1381 * Computes the effective_xcpus for a trial configuration. @cs is provided to represent
1382 * the real cs.
1383 */
compute_trialcs_excpus(struct cpuset * trialcs,struct cpuset * cs)1384 static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs)
1385 {
1386 struct cpuset *parent = parent_cs(trialcs);
1387 struct cpumask *excpus = trialcs->effective_xcpus;
1388
1389 /* trialcs is member, cpuset.cpus has no impact to excpus */
1390 if (cs_is_member(cs))
1391 cpumask_and(excpus, trialcs->exclusive_cpus,
1392 parent->effective_xcpus);
1393 else
1394 cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus);
1395
1396 return rm_siblings_excl_cpus(parent, cs, excpus);
1397 }
1398
is_remote_partition(struct cpuset * cs)1399 static inline bool is_remote_partition(struct cpuset *cs)
1400 {
1401 return cs->remote_partition;
1402 }
1403
is_local_partition(struct cpuset * cs)1404 static inline bool is_local_partition(struct cpuset *cs)
1405 {
1406 return is_partition_valid(cs) && !is_remote_partition(cs);
1407 }
1408
1409 /*
1410 * remote_partition_enable - Enable current cpuset as a remote partition root
1411 * @cs: the cpuset to update
1412 * @new_prs: new partition_root_state
1413 * @tmp: temporary masks
1414 * Return: 0 if successful, errcode if error
1415 *
1416 * Enable the current cpuset to become a remote partition root taking CPUs
1417 * directly from the top cpuset. cpuset_mutex must be held by the caller.
1418 */
remote_partition_enable(struct cpuset * cs,int new_prs,struct tmpmasks * tmp)1419 static int remote_partition_enable(struct cpuset *cs, int new_prs,
1420 struct tmpmasks *tmp)
1421 {
1422 /*
1423 * The user must have sysadmin privilege.
1424 */
1425 if (!capable(CAP_SYS_ADMIN))
1426 return PERR_ACCESS;
1427
1428 /*
1429 * The requested exclusive_cpus must not be allocated to other
1430 * partitions and it can't use up all the root's effective_cpus.
1431 *
1432 * The effective_xcpus mask can contain offline CPUs, but there must
1433 * be at least one or more online CPUs present before it can be enabled.
1434 *
1435 * Note that creating a remote partition with any local partition root
1436 * above it or remote partition root underneath it is not allowed.
1437 */
1438 compute_excpus(cs, tmp->new_cpus);
1439 WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus));
1440 if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) ||
1441 cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
1442 return PERR_INVCPUS;
1443 if (((new_prs == PRS_ISOLATED) &&
1444 !isolated_cpus_can_update(tmp->new_cpus, NULL)) ||
1445 prstate_housekeeping_conflict(new_prs, tmp->new_cpus))
1446 return PERR_HKEEPING;
1447
1448 spin_lock_irq(&callback_lock);
1449 partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
1450 cs->remote_partition = true;
1451 cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
1452 spin_unlock_irq(&callback_lock);
1453 update_isolation_cpumasks();
1454 cpuset_force_rebuild();
1455 cs->prs_err = 0;
1456
1457 /*
1458 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
1459 */
1460 cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
1461 update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1462 return 0;
1463 }
1464
1465 /*
1466 * remote_partition_disable - Remove current cpuset from remote partition list
1467 * @cs: the cpuset to update
1468 * @tmp: temporary masks
1469 *
1470 * The effective_cpus is also updated.
1471 *
1472 * cpuset_mutex must be held by the caller.
1473 */
remote_partition_disable(struct cpuset * cs,struct tmpmasks * tmp)1474 static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
1475 {
1476 WARN_ON_ONCE(!is_remote_partition(cs));
1477 /*
1478 * When a CPU is offlined, top_cpuset may end up with no available CPUs,
1479 * which should clear subpartitions_cpus. We should not emit a warning for this
1480 * scenario: the hierarchy is updated from top to bottom, so subpartitions_cpus
1481 * may already be cleared when disabling the partition.
1482 */
1483 WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus) &&
1484 !cpumask_empty(subpartitions_cpus));
1485
1486 spin_lock_irq(&callback_lock);
1487 cs->remote_partition = false;
1488 partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus);
1489 if (cs->prs_err)
1490 cs->partition_root_state = -cs->partition_root_state;
1491 else
1492 cs->partition_root_state = PRS_MEMBER;
1493
1494 /* effective_xcpus may need to be changed */
1495 compute_excpus(cs, cs->effective_xcpus);
1496 reset_partition_data(cs);
1497 spin_unlock_irq(&callback_lock);
1498 update_isolation_cpumasks();
1499 cpuset_force_rebuild();
1500
1501 /*
1502 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
1503 */
1504 cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
1505 update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1506 }
1507
1508 /*
1509 * remote_cpus_update - cpus_exclusive change of remote partition
1510 * @cs: the cpuset to be updated
1511 * @xcpus: the new exclusive_cpus mask, if non-NULL
1512 * @excpus: the new effective_xcpus mask
1513 * @tmp: temporary masks
1514 *
1515 * top_cpuset and subpartitions_cpus will be updated or partition can be
1516 * invalidated.
1517 */
remote_cpus_update(struct cpuset * cs,struct cpumask * xcpus,struct cpumask * excpus,struct tmpmasks * tmp)1518 static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
1519 struct cpumask *excpus, struct tmpmasks *tmp)
1520 {
1521 bool adding, deleting;
1522 int prs = cs->partition_root_state;
1523
1524 if (WARN_ON_ONCE(!is_remote_partition(cs)))
1525 return;
1526
1527 WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
1528
1529 if (cpumask_empty(excpus)) {
1530 cs->prs_err = PERR_CPUSEMPTY;
1531 goto invalidate;
1532 }
1533
1534 adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus);
1535 deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus);
1536
1537 /*
1538 * Additions of remote CPUs is only allowed if those CPUs are
1539 * not allocated to other partitions and there are effective_cpus
1540 * left in the top cpuset.
1541 */
1542 if (adding) {
1543 WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus));
1544 if (!capable(CAP_SYS_ADMIN))
1545 cs->prs_err = PERR_ACCESS;
1546 else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
1547 cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))
1548 cs->prs_err = PERR_NOCPUS;
1549 else if ((prs == PRS_ISOLATED) &&
1550 !isolated_cpus_can_update(tmp->addmask, tmp->delmask))
1551 cs->prs_err = PERR_HKEEPING;
1552 if (cs->prs_err)
1553 goto invalidate;
1554 }
1555
1556 spin_lock_irq(&callback_lock);
1557 if (adding)
1558 partition_xcpus_add(prs, NULL, tmp->addmask);
1559 if (deleting)
1560 partition_xcpus_del(prs, NULL, tmp->delmask);
1561 /*
1562 * Need to update effective_xcpus and exclusive_cpus now as
1563 * update_sibling_cpumasks() below may iterate back to the same cs.
1564 */
1565 cpumask_copy(cs->effective_xcpus, excpus);
1566 if (xcpus)
1567 cpumask_copy(cs->exclusive_cpus, xcpus);
1568 spin_unlock_irq(&callback_lock);
1569 update_isolation_cpumasks();
1570 if (adding || deleting)
1571 cpuset_force_rebuild();
1572
1573 /*
1574 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
1575 */
1576 cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
1577 update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1578 return;
1579
1580 invalidate:
1581 remote_partition_disable(cs, tmp);
1582 }
1583
1584 /**
1585 * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
1586 * @cs: The cpuset that requests change in partition root state
1587 * @cmd: Partition root state change command
1588 * @newmask: Optional new cpumask for partcmd_update
1589 * @tmp: Temporary addmask and delmask
1590 * Return: 0 or a partition root state error code
1591 *
1592 * For partcmd_enable*, the cpuset is being transformed from a non-partition
1593 * root to a partition root. The effective_xcpus (cpus_allowed if
1594 * effective_xcpus not set) mask of the given cpuset will be taken away from
1595 * parent's effective_cpus. The function will return 0 if all the CPUs listed
1596 * in effective_xcpus can be granted or an error code will be returned.
1597 *
1598 * For partcmd_disable, the cpuset is being transformed from a partition
1599 * root back to a non-partition root. Any CPUs in effective_xcpus will be
1600 * given back to parent's effective_cpus. 0 will always be returned.
1601 *
1602 * For partcmd_update, if the optional newmask is specified, the cpu list is
1603 * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
1604 * assumed to remain the same. The cpuset should either be a valid or invalid
1605 * partition root. The partition root state may change from valid to invalid
1606 * or vice versa. An error code will be returned if transitioning from
1607 * invalid to valid violates the exclusivity rule.
1608 *
1609 * For partcmd_invalidate, the current partition will be made invalid.
1610 *
1611 * The partcmd_enable* and partcmd_disable commands are used by
1612 * update_prstate(). An error code may be returned and the caller will check
1613 * for error.
1614 *
1615 * The partcmd_update command is used by update_cpumasks_hier() with newmask
1616 * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
1617 * by update_cpumask() with NULL newmask. In both cases, the callers won't
1618 * check for error and so partition_root_state and prs_err will be updated
1619 * directly.
1620 */
update_parent_effective_cpumask(struct cpuset * cs,int cmd,struct cpumask * newmask,struct tmpmasks * tmp)1621 static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
1622 struct cpumask *newmask,
1623 struct tmpmasks *tmp)
1624 {
1625 struct cpuset *parent = parent_cs(cs);
1626 int adding; /* Adding cpus to parent's effective_cpus */
1627 int deleting; /* Deleting cpus from parent's effective_cpus */
1628 int old_prs, new_prs;
1629 int part_error = PERR_NONE; /* Partition error? */
1630 struct cpumask *xcpus = user_xcpus(cs);
1631 int parent_prs = parent->partition_root_state;
1632 bool nocpu;
1633
1634 lockdep_assert_cpuset_lock_held();
1635 WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */
1636
1637 /*
1638 * new_prs will only be changed for the partcmd_update and
1639 * partcmd_invalidate commands.
1640 */
1641 adding = deleting = false;
1642 old_prs = new_prs = cs->partition_root_state;
1643
1644 if (cmd == partcmd_invalidate) {
1645 if (is_partition_invalid(cs))
1646 return 0;
1647
1648 /*
1649 * Make the current partition invalid.
1650 */
1651 if (is_partition_valid(parent))
1652 adding = cpumask_and(tmp->addmask,
1653 xcpus, parent->effective_xcpus);
1654 if (old_prs > 0)
1655 new_prs = -old_prs;
1656
1657 goto write_error;
1658 }
1659
1660 /*
1661 * The parent must be a partition root.
1662 * The new cpumask, if present, or the current cpus_allowed must
1663 * not be empty.
1664 */
1665 if (!is_partition_valid(parent)) {
1666 return is_partition_invalid(parent)
1667 ? PERR_INVPARENT : PERR_NOTPART;
1668 }
1669 if (!newmask && xcpus_empty(cs))
1670 return PERR_CPUSEMPTY;
1671
1672 nocpu = tasks_nocpu_error(parent, cs, xcpus);
1673
1674 if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
1675 /*
1676 * Need to call compute_excpus() in case
1677 * exclusive_cpus not set. Sibling conflict should only happen
1678 * if exclusive_cpus isn't set.
1679 */
1680 xcpus = tmp->delmask;
1681 if (compute_excpus(cs, xcpus))
1682 WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));
1683 new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
1684
1685 /*
1686 * Enabling partition root is not allowed if its
1687 * effective_xcpus is empty.
1688 */
1689 if (cpumask_empty(xcpus))
1690 return PERR_INVCPUS;
1691
1692 if (prstate_housekeeping_conflict(new_prs, xcpus))
1693 return PERR_HKEEPING;
1694
1695 if ((new_prs == PRS_ISOLATED) && (new_prs != parent_prs) &&
1696 !isolated_cpus_can_update(xcpus, NULL))
1697 return PERR_HKEEPING;
1698
1699 if (tasks_nocpu_error(parent, cs, xcpus))
1700 return PERR_NOCPUS;
1701
1702 /*
1703 * This function will only be called when all the preliminary
1704 * checks have passed. At this point, the following condition
1705 * should hold.
1706 *
1707 * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus
1708 *
1709 * Warn if it is not the case.
1710 */
1711 cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask);
1712 WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
1713
1714 deleting = true;
1715 } else if (cmd == partcmd_disable) {
1716 /*
1717 * May need to add cpus back to parent's effective_cpus
1718 * (and maybe removed from subpartitions_cpus/isolated_cpus)
1719 * for valid partition root. xcpus may contain CPUs that
1720 * shouldn't be removed from the two global cpumasks.
1721 */
1722 if (is_partition_valid(cs)) {
1723 cpumask_copy(tmp->addmask, cs->effective_xcpus);
1724 adding = true;
1725 }
1726 new_prs = PRS_MEMBER;
1727 } else if (newmask) {
1728 /*
1729 * Empty cpumask is not allowed
1730 */
1731 if (cpumask_empty(newmask)) {
1732 part_error = PERR_CPUSEMPTY;
1733 goto write_error;
1734 }
1735
1736 /* Check newmask again, whether cpus are available for parent/cs */
1737 nocpu |= tasks_nocpu_error(parent, cs, newmask);
1738
1739 /*
1740 * partcmd_update with newmask:
1741 *
1742 * Compute add/delete mask to/from effective_cpus
1743 *
1744 * For valid partition:
1745 * addmask = exclusive_cpus & ~newmask
1746 * & parent->effective_xcpus
1747 * delmask = newmask & ~exclusive_cpus
1748 * & parent->effective_xcpus
1749 *
1750 * For invalid partition:
1751 * delmask = newmask & parent->effective_xcpus
1752 * The partition may become valid soon.
1753 */
1754 if (is_partition_invalid(cs)) {
1755 adding = false;
1756 deleting = cpumask_and(tmp->delmask,
1757 newmask, parent->effective_xcpus);
1758 } else {
1759 cpumask_andnot(tmp->addmask, xcpus, newmask);
1760 adding = cpumask_and(tmp->addmask, tmp->addmask,
1761 parent->effective_xcpus);
1762
1763 cpumask_andnot(tmp->delmask, newmask, xcpus);
1764 deleting = cpumask_and(tmp->delmask, tmp->delmask,
1765 parent->effective_xcpus);
1766 }
1767
1768 /*
1769 * TBD: Invalidate a currently valid child root partition may
1770 * still break isolated_cpus_can_update() rule if parent is an
1771 * isolated partition.
1772 */
1773 if (is_partition_valid(cs) && (old_prs != parent_prs)) {
1774 if ((parent_prs == PRS_ROOT) &&
1775 /* Adding to parent means removing isolated CPUs */
1776 !isolated_cpus_can_update(tmp->delmask, tmp->addmask))
1777 part_error = PERR_HKEEPING;
1778 if ((parent_prs == PRS_ISOLATED) &&
1779 /* Adding to parent means adding isolated CPUs */
1780 !isolated_cpus_can_update(tmp->addmask, tmp->delmask))
1781 part_error = PERR_HKEEPING;
1782 }
1783
1784 /*
1785 * The new CPUs to be removed from parent's effective CPUs
1786 * must be present.
1787 */
1788 if (deleting) {
1789 cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask);
1790 WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
1791 }
1792
1793 /*
1794 * Make partition invalid if parent's effective_cpus could
1795 * become empty and there are tasks in the parent.
1796 */
1797 if (nocpu && (!adding ||
1798 !cpumask_intersects(tmp->addmask, cpu_active_mask))) {
1799 part_error = PERR_NOCPUS;
1800 deleting = false;
1801 adding = cpumask_and(tmp->addmask,
1802 xcpus, parent->effective_xcpus);
1803 }
1804 } else {
1805 /*
1806 * partcmd_update w/o newmask
1807 *
1808 * delmask = effective_xcpus & parent->effective_cpus
1809 *
1810 * This can be called from:
1811 * 1) update_cpumasks_hier()
1812 * 2) cpuset_hotplug_update_tasks()
1813 *
1814 * Check to see if it can be transitioned from valid to
1815 * invalid partition or vice versa.
1816 *
1817 * A partition error happens when parent has tasks and all
1818 * its effective CPUs will have to be distributed out.
1819 */
1820 if (nocpu) {
1821 part_error = PERR_NOCPUS;
1822 if (is_partition_valid(cs))
1823 adding = cpumask_and(tmp->addmask,
1824 xcpus, parent->effective_xcpus);
1825 } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) &&
1826 cpumask_subset(xcpus, parent->effective_xcpus)) {
1827 struct cgroup_subsys_state *css;
1828 struct cpuset *child;
1829 bool exclusive = true;
1830
1831 /*
1832 * Convert invalid partition to valid has to
1833 * pass the cpu exclusivity test.
1834 */
1835 rcu_read_lock();
1836 cpuset_for_each_child(child, css, parent) {
1837 if (child == cs)
1838 continue;
1839 if (!cpusets_are_exclusive(cs, child)) {
1840 exclusive = false;
1841 break;
1842 }
1843 }
1844 rcu_read_unlock();
1845 if (exclusive)
1846 deleting = cpumask_and(tmp->delmask,
1847 xcpus, parent->effective_cpus);
1848 else
1849 part_error = PERR_NOTEXCL;
1850 }
1851 }
1852
1853 write_error:
1854 if (part_error)
1855 WRITE_ONCE(cs->prs_err, part_error);
1856
1857 if (cmd == partcmd_update) {
1858 /*
1859 * Check for possible transition between valid and invalid
1860 * partition root.
1861 */
1862 switch (cs->partition_root_state) {
1863 case PRS_ROOT:
1864 case PRS_ISOLATED:
1865 if (part_error)
1866 new_prs = -old_prs;
1867 break;
1868 case PRS_INVALID_ROOT:
1869 case PRS_INVALID_ISOLATED:
1870 if (!part_error)
1871 new_prs = -old_prs;
1872 break;
1873 }
1874 }
1875
1876 if (!adding && !deleting && (new_prs == old_prs))
1877 return 0;
1878
1879 /*
1880 * Transitioning between invalid to valid or vice versa may require
1881 * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,
1882 * validate_change() has already been successfully called and
1883 * CPU lists in cs haven't been updated yet. So defer it to later.
1884 */
1885 if ((old_prs != new_prs) && (cmd != partcmd_update)) {
1886 int err = update_partition_exclusive_flag(cs, new_prs);
1887
1888 if (err)
1889 return err;
1890 }
1891
1892 /*
1893 * Change the parent's effective_cpus & effective_xcpus (top cpuset
1894 * only).
1895 *
1896 * Newly added CPUs will be removed from effective_cpus and
1897 * newly deleted ones will be added back to effective_cpus.
1898 */
1899 spin_lock_irq(&callback_lock);
1900 if (old_prs != new_prs)
1901 cs->partition_root_state = new_prs;
1902
1903 /*
1904 * Adding to parent's effective_cpus means deletion CPUs from cs
1905 * and vice versa.
1906 */
1907 if (adding)
1908 partition_xcpus_del(old_prs, parent, tmp->addmask);
1909 if (deleting)
1910 partition_xcpus_add(new_prs, parent, tmp->delmask);
1911
1912 spin_unlock_irq(&callback_lock);
1913 update_isolation_cpumasks();
1914
1915 if ((old_prs != new_prs) && (cmd == partcmd_update))
1916 update_partition_exclusive_flag(cs, new_prs);
1917
1918 if (adding || deleting) {
1919 cpuset_update_tasks_cpumask(parent, tmp->addmask);
1920 update_sibling_cpumasks(parent, cs, tmp);
1921 }
1922
1923 /*
1924 * For partcmd_update without newmask, it is being called from
1925 * cpuset_handle_hotplug(). Update the load balance flag and
1926 * scheduling domain accordingly.
1927 */
1928 if ((cmd == partcmd_update) && !newmask)
1929 update_partition_sd_lb(cs, old_prs);
1930
1931 notify_partition_change(cs, old_prs);
1932 return 0;
1933 }
1934
1935 /**
1936 * compute_partition_effective_cpumask - compute effective_cpus for partition
1937 * @cs: partition root cpuset
1938 * @new_ecpus: previously computed effective_cpus to be updated
1939 *
1940 * Compute the effective_cpus of a partition root by scanning effective_xcpus
1941 * of child partition roots and excluding their effective_xcpus.
1942 *
1943 * This has the side effect of invalidating valid child partition roots,
1944 * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
1945 * or update_cpumasks_hier() where parent and children are modified
1946 * successively, we don't need to call update_parent_effective_cpumask()
1947 * and the child's effective_cpus will be updated in later iterations.
1948 *
1949 * Note that rcu_read_lock() is assumed to be held.
1950 */
compute_partition_effective_cpumask(struct cpuset * cs,struct cpumask * new_ecpus)1951 static void compute_partition_effective_cpumask(struct cpuset *cs,
1952 struct cpumask *new_ecpus)
1953 {
1954 struct cgroup_subsys_state *css;
1955 struct cpuset *child;
1956 bool populated = partition_is_populated(cs, NULL);
1957
1958 /*
1959 * Check child partition roots to see if they should be
1960 * invalidated when
1961 * 1) child effective_xcpus not a subset of new
1962 * excluisve_cpus
1963 * 2) All the effective_cpus will be used up and cp
1964 * has tasks
1965 */
1966 compute_excpus(cs, new_ecpus);
1967 cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
1968
1969 rcu_read_lock();
1970 cpuset_for_each_child(child, css, cs) {
1971 if (!is_partition_valid(child))
1972 continue;
1973
1974 /*
1975 * There shouldn't be a remote partition underneath another
1976 * partition root.
1977 */
1978 WARN_ON_ONCE(is_remote_partition(child));
1979 child->prs_err = 0;
1980 if (!cpumask_subset(child->effective_xcpus,
1981 cs->effective_xcpus))
1982 child->prs_err = PERR_INVCPUS;
1983 else if (populated &&
1984 cpumask_subset(new_ecpus, child->effective_xcpus))
1985 child->prs_err = PERR_NOCPUS;
1986
1987 if (child->prs_err) {
1988 int old_prs = child->partition_root_state;
1989
1990 /*
1991 * Invalidate child partition
1992 */
1993 spin_lock_irq(&callback_lock);
1994 make_partition_invalid(child);
1995 spin_unlock_irq(&callback_lock);
1996 notify_partition_change(child, old_prs);
1997 continue;
1998 }
1999 cpumask_andnot(new_ecpus, new_ecpus,
2000 child->effective_xcpus);
2001 }
2002 rcu_read_unlock();
2003 }
2004
/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs:  the cpuset to consider
 * @tmp: temp variables for calculating effective_cpus & partition setup
 * @force: don't skip any descendant cpusets if set
 *
 * When configured cpumask is changed, the effective cpumasks of this cpuset
 * and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * The subtree is walked in pre-order under rcu_read_lock().  For every
 * cpuset that actually needs updating, a css reference is taken and the
 * RCU read lock is dropped while the cpuset is modified; the lock is
 * re-taken before the reference is put and the walk continues.
 *
 * Called with cpuset_mutex held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
				 bool force)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;
	int old_prs, new_prs;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);
		bool remote = is_remote_partition(cp);
		bool update_parent = false;

		old_prs = new_prs = cp->partition_root_state;

		/*
		 * For child remote partition root (!= cs), we need to call
		 * remote_cpus_update() if effective_xcpus will be changed.
		 * Otherwise, we can skip the whole subtree.
		 *
		 * remote_cpus_update() will reuse tmp->new_cpus only after
		 * its value is being processed.
		 */
		if (remote && (cp != cs)) {
			compute_excpus(cp, tmp->new_cpus);
			if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) {
				pos_css = css_rightmost_descendant(pos_css);
				continue;
			}
			/* RCU read lock is dropped around remote_cpus_update() */
			rcu_read_unlock();
			remote_cpus_update(cp, NULL, tmp->new_cpus, tmp);
			rcu_read_lock();

			/*
			 * Remote partition may be invalidated; it is only
			 * still handled as remote below if its state did
			 * not change.
			 */
			new_prs = cp->partition_root_state;
			remote = (new_prs == old_prs);
		}

		/*
		 * Remote partitions and valid partitions under a valid
		 * parent partition get the partition-specific computation;
		 * everything else gets the regular one based on the parent.
		 */
		if (remote || (is_partition_valid(parent) && is_partition_valid(cp)))
			compute_partition_effective_cpumask(cp, tmp->new_cpus);
		else
			compute_effective_cpumask(tmp->new_cpus, cp, parent);

		if (remote)
			goto get_css;	/* Ready to update cpuset data */

		/*
		 * A partition with no effective_cpus is allowed as long as
		 * there is no task associated with it. Call
		 * update_parent_effective_cpumask() to check it.
		 */
		if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {
			update_parent = true;
			goto update_parent_effective;
		}

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some CPUs unless
		 * it is a partition root that has explicitly distributed
		 * out all its CPUs.
		 */
		if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus))
			cpumask_copy(tmp->new_cpus, parent->effective_cpus);

		/*
		 * Skip the whole subtree if
		 * 1) the cpumask remains the same,
		 * 2) has no partition root state,
		 * 3) force flag not set, and
		 * 4) for v2 load balance state same as its parent.
		 */
		if (!cp->partition_root_state && !force &&
		    cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
		    (!cpuset_v2() ||
		    (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

update_parent_effective:
		/*
		 * update_parent_effective_cpumask() should have been called
		 * for cs already in update_cpumask(). We should also call
		 * cpuset_update_tasks_cpumask() again for tasks in the parent
		 * cpuset if the parent's effective_cpus changes.
		 */
		if ((cp != cs) && old_prs) {
			switch (parent->partition_root_state) {
			case PRS_ROOT:
			case PRS_ISOLATED:
				update_parent = true;
				break;

			default:
				/*
				 * When parent is not a partition root or is
				 * invalid, child partition roots become
				 * invalid too.
				 */
				if (is_partition_valid(cp))
					new_prs = -cp->partition_root_state;
				WRITE_ONCE(cp->prs_err,
					   is_partition_invalid(parent)
					   ? PERR_INVPARENT : PERR_NOTPART);
				break;
			}
		}
get_css:
		/*
		 * Take a reference on cp before leaving the RCU read section
		 * for the update below; skip cp if no reference can be taken.
		 */
		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		if (update_parent) {
			update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);
			/*
			 * The cpuset partition_root_state may become
			 * invalid. Capture it.
			 */
			new_prs = cp->partition_root_state;
		}

		/* Publish the new masks and state under callback_lock */
		spin_lock_irq(&callback_lock);
		cpumask_copy(cp->effective_cpus, tmp->new_cpus);
		cp->partition_root_state = new_prs;
		/*
		 * Need to compute effective_xcpus if either exclusive_cpus
		 * is non-empty or it is a valid partition root.
		 */
		if ((new_prs > 0) || !cpumask_empty(cp->exclusive_cpus))
			compute_excpus(cp, cp->effective_xcpus);
		if (new_prs <= 0)
			reset_partition_data(cp);
		spin_unlock_irq(&callback_lock);

		notify_partition_change(cp, old_prs);

		WARN_ON(!is_in_v2_mode() &&
			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

		cpuset_update_tasks_cpumask(cp, cp->effective_cpus);

		/*
		 * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
		 * from parent if current cpuset isn't a valid partition root
		 * and their load balance states differ.
		 */
		if (cpuset_v2() && !is_partition_valid(cp) &&
		    (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
			if (is_sched_load_balance(parent))
				set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
			else
				clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
		}

		/*
		 * On legacy hierarchy, if the effective cpumask of any non-
		 * empty cpuset is changed, we need to rebuild sched domains.
		 * On default hierarchy, the cpuset needs to be a partition
		 * root as well.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    is_sched_load_balance(cp) &&
		   (!cpuset_v2() || is_partition_valid(cp)))
			cpuset_force_rebuild();

		/* Back under RCU before dropping the reference and iterating on */
		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();
}
2189
2190 /**
2191 * update_sibling_cpumasks - Update siblings cpumasks
2192 * @parent: Parent cpuset
2193 * @cs: Current cpuset
2194 * @tmp: Temp variables
2195 */
update_sibling_cpumasks(struct cpuset * parent,struct cpuset * cs,struct tmpmasks * tmp)2196 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
2197 struct tmpmasks *tmp)
2198 {
2199 struct cpuset *sibling;
2200 struct cgroup_subsys_state *pos_css;
2201
2202 lockdep_assert_cpuset_lock_held();
2203
2204 /*
2205 * Check all its siblings and call update_cpumasks_hier()
2206 * if their effective_cpus will need to be changed.
2207 *
2208 * It is possible a change in parent's effective_cpus
2209 * due to a change in a child partition's effective_xcpus will impact
2210 * its siblings even if they do not inherit parent's effective_cpus
2211 * directly. It should not impact valid partition.
2212 *
2213 * The update_cpumasks_hier() function may sleep. So we have to
2214 * release the RCU read lock before calling it.
2215 */
2216 rcu_read_lock();
2217 cpuset_for_each_child(sibling, pos_css, parent) {
2218 if (sibling == cs || is_partition_valid(sibling))
2219 continue;
2220
2221 compute_effective_cpumask(tmp->new_cpus, sibling,
2222 parent);
2223 if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
2224 continue;
2225
2226 if (!css_tryget_online(&sibling->css))
2227 continue;
2228
2229 rcu_read_unlock();
2230 update_cpumasks_hier(sibling, tmp, false);
2231 rcu_read_lock();
2232 css_put(&sibling->css);
2233 }
2234 rcu_read_unlock();
2235 }
2236
parse_cpuset_cpulist(const char * buf,struct cpumask * out_mask)2237 static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask)
2238 {
2239 int retval;
2240
2241 retval = cpulist_parse(buf, out_mask);
2242 if (retval < 0)
2243 return retval;
2244 if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed))
2245 return -EINVAL;
2246
2247 return 0;
2248 }
2249
2250 /**
2251 * validate_partition - Validate a cpuset partition configuration
2252 * @cs: The cpuset to validate
2253 * @trialcs: The trial cpuset containing proposed configuration changes
2254 *
2255 * If any validation check fails, the appropriate error code is set in the
2256 * cpuset's prs_err field.
2257 *
2258 * Return: PRS error code (0 if valid, non-zero error code if invalid)
2259 */
validate_partition(struct cpuset * cs,struct cpuset * trialcs)2260 static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs)
2261 {
2262 struct cpuset *parent = parent_cs(cs);
2263
2264 if (cs_is_member(trialcs))
2265 return PERR_NONE;
2266
2267 if (cpumask_empty(trialcs->effective_xcpus))
2268 return PERR_INVCPUS;
2269
2270 if (prstate_housekeeping_conflict(trialcs->partition_root_state,
2271 trialcs->effective_xcpus))
2272 return PERR_HKEEPING;
2273
2274 if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus))
2275 return PERR_NOCPUS;
2276
2277 return PERR_NONE;
2278 }
2279
2280 /**
2281 * partition_cpus_change - Handle partition state changes due to CPU mask updates
2282 * @cs: The target cpuset being modified
2283 * @trialcs: The trial cpuset containing proposed configuration changes
2284 * @tmp: Temporary masks for intermediate calculations
2285 *
2286 * This function handles partition state transitions triggered by CPU mask changes.
2287 * CPU modifications may cause a partition to be disabled or require state updates.
2288 */
partition_cpus_change(struct cpuset * cs,struct cpuset * trialcs,struct tmpmasks * tmp)2289 static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs,
2290 struct tmpmasks *tmp)
2291 {
2292 enum prs_errcode prs_err;
2293
2294 if (cs_is_member(cs))
2295 return;
2296
2297 prs_err = validate_partition(cs, trialcs);
2298 if (prs_err)
2299 trialcs->prs_err = cs->prs_err = prs_err;
2300
2301 if (is_remote_partition(cs)) {
2302 if (trialcs->prs_err)
2303 remote_partition_disable(cs, tmp);
2304 else
2305 remote_cpus_update(cs, trialcs->exclusive_cpus,
2306 trialcs->effective_xcpus, tmp);
2307 } else {
2308 if (trialcs->prs_err)
2309 update_parent_effective_cpumask(cs, partcmd_invalidate,
2310 NULL, tmp);
2311 else
2312 update_parent_effective_cpumask(cs, partcmd_update,
2313 trialcs->effective_xcpus, tmp);
2314 }
2315 }
2316
2317 /**
2318 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
2319 * @cs: the cpuset to consider
2320 * @trialcs: trial cpuset
2321 * @buf: buffer of cpu numbers written to this cpuset
2322 */
update_cpumask(struct cpuset * cs,struct cpuset * trialcs,const char * buf)2323 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
2324 const char *buf)
2325 {
2326 int retval;
2327 struct tmpmasks tmp;
2328 bool force = false;
2329 int old_prs = cs->partition_root_state;
2330
2331 retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed);
2332 if (retval < 0)
2333 return retval;
2334
2335 /* Nothing to do if the cpus didn't change */
2336 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
2337 return 0;
2338
2339 compute_trialcs_excpus(trialcs, cs);
2340 trialcs->prs_err = PERR_NONE;
2341
2342 retval = validate_change(cs, trialcs);
2343 if (retval < 0)
2344 return retval;
2345
2346 if (alloc_tmpmasks(&tmp))
2347 return -ENOMEM;
2348
2349 /*
2350 * Check all the descendants in update_cpumasks_hier() if
2351 * effective_xcpus is to be changed.
2352 */
2353 force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
2354
2355 partition_cpus_change(cs, trialcs, &tmp);
2356
2357 spin_lock_irq(&callback_lock);
2358 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
2359 cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
2360 if ((old_prs > 0) && !is_partition_valid(cs))
2361 reset_partition_data(cs);
2362 spin_unlock_irq(&callback_lock);
2363
2364 /* effective_cpus/effective_xcpus will be updated here */
2365 update_cpumasks_hier(cs, &tmp, force);
2366
2367 /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
2368 if (cs->partition_root_state)
2369 update_partition_sd_lb(cs, old_prs);
2370
2371 free_tmpmasks(&tmp);
2372 return retval;
2373 }
2374
2375 /**
2376 * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset
2377 * @cs: the cpuset to consider
2378 * @trialcs: trial cpuset
2379 * @buf: buffer of cpu numbers written to this cpuset
2380 *
2381 * The tasks' cpumask will be updated if cs is a valid partition root.
2382 */
update_exclusive_cpumask(struct cpuset * cs,struct cpuset * trialcs,const char * buf)2383 static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
2384 const char *buf)
2385 {
2386 int retval;
2387 struct tmpmasks tmp;
2388 bool force = false;
2389 int old_prs = cs->partition_root_state;
2390
2391 retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus);
2392 if (retval < 0)
2393 return retval;
2394
2395 /* Nothing to do if the CPUs didn't change */
2396 if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
2397 return 0;
2398
2399 /*
2400 * Reject the change if there is exclusive CPUs conflict with
2401 * the siblings.
2402 */
2403 if (compute_trialcs_excpus(trialcs, cs))
2404 return -EINVAL;
2405
2406 /*
2407 * Check all the descendants in update_cpumasks_hier() if
2408 * effective_xcpus is to be changed.
2409 */
2410 force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
2411
2412 retval = validate_change(cs, trialcs);
2413 if (retval)
2414 return retval;
2415
2416 if (alloc_tmpmasks(&tmp))
2417 return -ENOMEM;
2418
2419 trialcs->prs_err = PERR_NONE;
2420 partition_cpus_change(cs, trialcs, &tmp);
2421
2422 spin_lock_irq(&callback_lock);
2423 cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
2424 cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
2425 if ((old_prs > 0) && !is_partition_valid(cs))
2426 reset_partition_data(cs);
2427 spin_unlock_irq(&callback_lock);
2428
2429 /*
2430 * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus
2431 * of the subtree when it is a valid partition root or effective_xcpus
2432 * is updated.
2433 */
2434 if (is_partition_valid(cs) || force)
2435 update_cpumasks_hier(cs, &tmp, force);
2436
2437 /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
2438 if (cs->partition_root_state)
2439 update_partition_sd_lb(cs, old_prs);
2440
2441 free_tmpmasks(&tmp);
2442 return 0;
2443 }
2444
/*
 * Migrate memory region from one set of nodes to another.  This is
 * performed asynchronously as it can be called from process migration path
 * holding locks involved in process management.  All mm migrations are
 * performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
 */

/* One queued migration request; freed by cpuset_migrate_mm_workfn(). */
struct cpuset_migrate_mm_work {
	struct work_struct	work;	/* item queued on cpuset_migrate_mm_wq */
	struct mm_struct	*mm;	/* mm to migrate; mmput() by the work fn */
	nodemask_t		from;	/* source nodes for do_migrate_pages() */
	nodemask_t		to;	/* destination nodes for do_migrate_pages() */
};
2459
cpuset_migrate_mm_workfn(struct work_struct * work)2460 static void cpuset_migrate_mm_workfn(struct work_struct *work)
2461 {
2462 struct cpuset_migrate_mm_work *mwork =
2463 container_of(work, struct cpuset_migrate_mm_work, work);
2464
2465 /* on a wq worker, no need to worry about %current's mems_allowed */
2466 do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
2467 mmput(mwork->mm);
2468 kfree(mwork);
2469 }
2470
cpuset_migrate_mm(struct mm_struct * mm,const nodemask_t * from,const nodemask_t * to)2471 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
2472 const nodemask_t *to)
2473 {
2474 struct cpuset_migrate_mm_work *mwork;
2475
2476 if (nodes_equal(*from, *to)) {
2477 mmput(mm);
2478 return;
2479 }
2480
2481 mwork = kzalloc_obj(*mwork);
2482 if (mwork) {
2483 mwork->mm = mm;
2484 mwork->from = *from;
2485 mwork->to = *to;
2486 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
2487 queue_work(cpuset_migrate_mm_wq, &mwork->work);
2488 } else {
2489 mmput(mm);
2490 }
2491 }
2492
flush_migrate_mm_task_workfn(struct callback_head * head)2493 static void flush_migrate_mm_task_workfn(struct callback_head *head)
2494 {
2495 flush_workqueue(cpuset_migrate_mm_wq);
2496 kfree(head);
2497 }
2498
schedule_flush_migrate_mm(void)2499 static void schedule_flush_migrate_mm(void)
2500 {
2501 struct callback_head *flush_cb;
2502
2503 flush_cb = kzalloc_obj(struct callback_head);
2504 if (!flush_cb)
2505 return;
2506
2507 init_task_work(flush_cb, flush_migrate_mm_task_workfn);
2508
2509 if (task_work_add(current, flush_cb, TWA_RESUME))
2510 kfree(flush_cb);
2511 }
2512
/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
 * and rebind an eventual tasks' mempolicy. If the task is allocating in
 * parallel, it might temporarily see an empty intersection, which results in
 * a seqlock check and retry before OOM or allocation failure.
 *
 * The whole update runs with the task lock held and, for the seqcount write
 * section, with local interrupts disabled.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
	task_lock(tsk);

	local_irq_disable();
	write_seqcount_begin(&tsk->mems_allowed_seq);

	/*
	 * Three steps inside the write section: widen mems_allowed to the
	 * union of the old and new nodes, rebind the mempolicy to the new
	 * nodes, then narrow mems_allowed down to exactly the new nodes.
	 */
	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems);
	tsk->mems_allowed = *newmems;

	write_seqcount_end(&tsk->mems_allowed_seq);
	local_irq_enable();

	task_unlock(tsk);
}
2540
/*
 * The cpuset whose tasks are currently being processed by
 * cpuset_update_tasks_nodemask(), NULL otherwise.  It is only ever compared
 * against task_cs(current) in current_cpuset_is_being_rebound().
 */
static void *cpuset_being_rebound;
2542
2543 /**
2544 * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
2545 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
2546 *
2547 * Iterate through each task of @cs updating its mems_allowed to the
2548 * effective cpuset's. As this function is called with cpuset_mutex held,
2549 * cpuset membership stays stable.
2550 */
cpuset_update_tasks_nodemask(struct cpuset * cs)2551 void cpuset_update_tasks_nodemask(struct cpuset *cs)
2552 {
2553 static nodemask_t newmems; /* protected by cpuset_mutex */
2554 struct css_task_iter it;
2555 struct task_struct *task;
2556
2557 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
2558
2559 guarantee_online_mems(cs, &newmems);
2560
2561 /*
2562 * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
2563 * take while holding tasklist_lock. Forks can happen - the
2564 * mpol_dup() cpuset_being_rebound check will catch such forks,
2565 * and rebind their vma mempolicies too. Because we still hold
2566 * the global cpuset_mutex, we know that no other rebind effort
2567 * will be contending for the global variable cpuset_being_rebound.
2568 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
2569 * is idempotent. Also migrate pages in each mm to new nodes.
2570 */
2571 css_task_iter_start(&cs->css, 0, &it);
2572 while ((task = css_task_iter_next(&it))) {
2573 struct mm_struct *mm;
2574 bool migrate;
2575
2576 cpuset_change_task_nodemask(task, &newmems);
2577
2578 mm = get_task_mm(task);
2579 if (!mm)
2580 continue;
2581
2582 migrate = is_memory_migrate(cs);
2583
2584 mpol_rebind_mm(mm, &cs->mems_allowed);
2585 if (migrate)
2586 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
2587 else
2588 mmput(mm);
2589 }
2590 css_task_iter_end(&it);
2591
2592 /*
2593 * All the tasks' nodemasks have been updated, update
2594 * cs->old_mems_allowed.
2595 */
2596 cs->old_mems_allowed = newmems;
2597
2598 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
2599 cpuset_being_rebound = NULL;
2600 }
2601
2602 /*
2603 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
2604 * @cs: the cpuset to consider
2605 * @new_mems: a temp variable for calculating new effective_mems
2606 *
2607 * When configured nodemask is changed, the effective nodemasks of this cpuset
2608 * and all its descendants need to be updated.
2609 *
2610 * On legacy hierarchy, effective_mems will be the same with mems_allowed.
2611 *
2612 * Called with cpuset_mutex held
2613 */
update_nodemasks_hier(struct cpuset * cs,nodemask_t * new_mems)2614 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
2615 {
2616 struct cpuset *cp;
2617 struct cgroup_subsys_state *pos_css;
2618
2619 rcu_read_lock();
2620 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
2621 struct cpuset *parent = parent_cs(cp);
2622
2623 bool has_mems = nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
2624
2625 /*
2626 * If it becomes empty, inherit the effective mask of the
2627 * parent, which is guaranteed to have some MEMs.
2628 */
2629 if (is_in_v2_mode() && !has_mems)
2630 *new_mems = parent->effective_mems;
2631
2632 /* Skip the whole subtree if the nodemask remains the same. */
2633 if (nodes_equal(*new_mems, cp->effective_mems)) {
2634 pos_css = css_rightmost_descendant(pos_css);
2635 continue;
2636 }
2637
2638 if (!css_tryget_online(&cp->css))
2639 continue;
2640 rcu_read_unlock();
2641
2642 spin_lock_irq(&callback_lock);
2643 cp->effective_mems = *new_mems;
2644 spin_unlock_irq(&callback_lock);
2645
2646 WARN_ON(!is_in_v2_mode() &&
2647 !nodes_equal(cp->mems_allowed, cp->effective_mems));
2648
2649 cpuset_update_tasks_nodemask(cp);
2650
2651 rcu_read_lock();
2652 css_put(&cp->css);
2653 }
2654 rcu_read_unlock();
2655 }
2656
2657 /*
2658 * Handle user request to change the 'mems' memory placement
2659 * of a cpuset. Needs to validate the request, update the
2660 * cpusets mems_allowed, and for each task in the cpuset,
2661 * update mems_allowed and rebind task's mempolicy and any vma
2662 * mempolicies and if the cpuset is marked 'memory_migrate',
2663 * migrate the tasks pages to the new memory.
2664 *
2665 * Call with cpuset_mutex held. May take callback_lock during call.
2666 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
2667 * lock each such tasks mm->mmap_lock, scan its vma's and rebind
2668 * their mempolicies to the cpusets new mems_allowed.
2669 */
update_nodemask(struct cpuset * cs,struct cpuset * trialcs,const char * buf)2670 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
2671 const char *buf)
2672 {
2673 int retval;
2674
2675 /*
2676 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
2677 * The validate_change() call ensures that cpusets with tasks have memory.
2678 */
2679 retval = nodelist_parse(buf, trialcs->mems_allowed);
2680 if (retval < 0)
2681 return retval;
2682
2683 if (!nodes_subset(trialcs->mems_allowed,
2684 top_cpuset.mems_allowed))
2685 return -EINVAL;
2686
2687 /* No change? nothing to do */
2688 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed))
2689 return 0;
2690
2691 retval = validate_change(cs, trialcs);
2692 if (retval < 0)
2693 return retval;
2694
2695 check_insane_mems_config(&trialcs->mems_allowed);
2696
2697 spin_lock_irq(&callback_lock);
2698 cs->mems_allowed = trialcs->mems_allowed;
2699 spin_unlock_irq(&callback_lock);
2700
2701 /* use trialcs->mems_allowed as a temp variable */
2702 update_nodemasks_hier(cs, &trialcs->mems_allowed);
2703 return 0;
2704 }
2705
current_cpuset_is_being_rebound(void)2706 bool current_cpuset_is_being_rebound(void)
2707 {
2708 bool ret;
2709
2710 rcu_read_lock();
2711 ret = task_cs(current) == cpuset_being_rebound;
2712 rcu_read_unlock();
2713
2714 return ret;
2715 }
2716
2717 /*
2718 * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag
2719 * bit: the bit to update (see cpuset_flagbits_t)
2720 * cs: the cpuset to update
2721 * turning_on: whether the flag is being set or cleared
2722 *
2723 * Call with cpuset_mutex held.
2724 */
2725
cpuset_update_flag(cpuset_flagbits_t bit,struct cpuset * cs,int turning_on)2726 int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
2727 int turning_on)
2728 {
2729 struct cpuset *trialcs;
2730 int balance_flag_changed;
2731 int spread_flag_changed;
2732 int err;
2733
2734 trialcs = dup_or_alloc_cpuset(cs);
2735 if (!trialcs)
2736 return -ENOMEM;
2737
2738 if (turning_on)
2739 set_bit(bit, &trialcs->flags);
2740 else
2741 clear_bit(bit, &trialcs->flags);
2742
2743 err = validate_change(cs, trialcs);
2744 if (err < 0)
2745 goto out;
2746
2747 balance_flag_changed = (is_sched_load_balance(cs) !=
2748 is_sched_load_balance(trialcs));
2749
2750 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
2751 || (is_spread_page(cs) != is_spread_page(trialcs)));
2752
2753 spin_lock_irq(&callback_lock);
2754 cs->flags = trialcs->flags;
2755 spin_unlock_irq(&callback_lock);
2756
2757 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {
2758 if (cpuset_v2())
2759 cpuset_force_rebuild();
2760 else
2761 rebuild_sched_domains_locked();
2762 }
2763
2764 if (spread_flag_changed)
2765 cpuset1_update_tasks_flags(cs);
2766 out:
2767 free_cpuset(trialcs);
2768 return err;
2769 }
2770
/**
 * update_prstate - update partition_root_state
 * @cs: the cpuset to update
 * @new_prs: new partition root state
 *
 * A failure to set up the requested partition is not reported through the
 * return value.  Instead the cpuset is put into the corresponding invalid
 * state (-@new_prs) and the reason is stored in cs->prs_err.
 *
 * Return: 0 if the state is unchanged or once the state (valid or invalid)
 * has been recorded, -ENOMEM if the temporary masks cannot be allocated.
 *
 * Call with cpuset_mutex held.
 */
static int update_prstate(struct cpuset *cs, int new_prs)
{
	int err = PERR_NONE, old_prs = cs->partition_root_state;
	struct cpuset *parent = parent_cs(cs);
	struct tmpmasks tmpmask;
	bool isolcpus_updated = false;

	if (old_prs == new_prs)
		return 0;

	/*
	 * Treat a previously invalid partition root as if it is a "member".
	 */
	if (new_prs && is_partition_invalid(cs))
		old_prs = PRS_MEMBER;

	if (alloc_tmpmasks(&tmpmask))
		return -ENOMEM;

	err = update_partition_exclusive_flag(cs, new_prs);
	if (err)
		goto out;

	if (!old_prs) {
		/* Member (or formerly invalid) -> partition root */

		/*
		 * cpus_allowed and exclusive_cpus cannot be both empty.
		 */
		if (xcpus_empty(cs)) {
			err = PERR_CPUSEMPTY;
			goto out;
		}

		/*
		 * We don't support the creation of a new local partition with
		 * a remote partition underneath it. This unsupported
		 * setting can happen only if parent is the top_cpuset because
		 * a remote partition cannot be created underneath an existing
		 * local or remote partition.
		 */
		if ((parent == &top_cpuset) &&
		    cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) {
			err = PERR_REMOTE;
			goto out;
		}

		/*
		 * If parent is valid partition, enable local partition.
		 * Otherwise, enable a remote partition.
		 */
		if (is_partition_valid(parent)) {
			enum partition_cmd cmd = (new_prs == PRS_ROOT)
					       ? partcmd_enable : partcmd_enablei;

			err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
		} else {
			err = remote_partition_enable(cs, new_prs, &tmpmask);
		}
	} else if (old_prs && new_prs) {
		/*
		 * A change in load balance state only, no change in cpumasks.
		 * Need to update isolated_cpus.
		 */
		if (((new_prs == PRS_ISOLATED) &&
		     !isolated_cpus_can_update(cs->effective_xcpus, NULL)) ||
		    prstate_housekeeping_conflict(new_prs, cs->effective_xcpus))
			err = PERR_HKEEPING;
		else
			isolcpus_updated = true;
	} else {
		/*
		 * Switching back to member is always allowed even if it
		 * disables child partitions.
		 */
		if (is_remote_partition(cs))
			remote_partition_disable(cs, &tmpmask);
		else
			update_parent_effective_cpumask(cs, partcmd_disable,
							NULL, &tmpmask);

		/*
		 * Invalidation of child partitions will be done in
		 * update_cpumasks_hier().
		 */
	}
out:
	/*
	 * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
	 * happens.
	 */
	if (err) {
		new_prs = -new_prs;
		update_partition_exclusive_flag(cs, new_prs);
	}

	/* Record the final state and error code under callback_lock */
	spin_lock_irq(&callback_lock);
	cs->partition_root_state = new_prs;
	WRITE_ONCE(cs->prs_err, err);
	if (!is_partition_valid(cs))
		reset_partition_data(cs);
	else if (isolcpus_updated)
		isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);
	spin_unlock_irq(&callback_lock);
	update_isolation_cpumasks();

	/* Force update if switching back to member & update effective_xcpus */
	update_cpumasks_hier(cs, &tmpmask, !new_prs);

	/* A newly created partition must have effective_xcpus set */
	WARN_ON_ONCE(!old_prs && (new_prs > 0)
			      && cpumask_empty(cs->effective_xcpus));

	/* Update sched domains and load balance flag */
	update_partition_sd_lb(cs, old_prs);

	notify_partition_change(cs, old_prs);
	if (force_sd_rebuild)
		rebuild_sched_domains_locked();
	free_tmpmasks(&tmpmask);
	return 0;
}
2899
/*
 * Source cpuset of the migration currently being processed: recorded by
 * cpuset_can_attach() and read back by cpuset_attach().
 */
static struct cpuset *cpuset_attach_old_cs;
2901
2902 /*
2903 * Check to see if a cpuset can accept a new task
2904 * For v1, cpus_allowed and mems_allowed can't be empty.
2905 * For v2, effective_cpus can't be empty.
2906 * Note that in v1, effective_cpus = cpus_allowed.
2907 */
cpuset_can_attach_check(struct cpuset * cs)2908 static int cpuset_can_attach_check(struct cpuset *cs)
2909 {
2910 if (cpumask_empty(cs->effective_cpus) ||
2911 (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
2912 return -ENOSPC;
2913 return 0;
2914 }
2915
reset_migrate_dl_data(struct cpuset * cs)2916 static void reset_migrate_dl_data(struct cpuset *cs)
2917 {
2918 cs->nr_migrate_dl_tasks = 0;
2919 cs->sum_migrate_dl_bw = 0;
2920 }
2921
2922 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
cpuset_can_attach(struct cgroup_taskset * tset)2923 static int cpuset_can_attach(struct cgroup_taskset *tset)
2924 {
2925 struct cgroup_subsys_state *css;
2926 struct cpuset *cs, *oldcs;
2927 struct task_struct *task;
2928 bool cpus_updated, mems_updated;
2929 int ret;
2930
2931 /* used later by cpuset_attach() */
2932 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
2933 oldcs = cpuset_attach_old_cs;
2934 cs = css_cs(css);
2935
2936 mutex_lock(&cpuset_mutex);
2937
2938 /* Check to see if task is allowed in the cpuset */
2939 ret = cpuset_can_attach_check(cs);
2940 if (ret)
2941 goto out_unlock;
2942
2943 cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
2944 mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
2945
2946 cgroup_taskset_for_each(task, css, tset) {
2947 ret = task_can_attach(task);
2948 if (ret)
2949 goto out_unlock;
2950
2951 /*
2952 * Skip rights over task check in v2 when nothing changes,
2953 * migration permission derives from hierarchy ownership in
2954 * cgroup_procs_write_permission()).
2955 */
2956 if (!cpuset_v2() || (cpus_updated || mems_updated)) {
2957 ret = security_task_setscheduler(task);
2958 if (ret)
2959 goto out_unlock;
2960 }
2961
2962 if (dl_task(task)) {
2963 cs->nr_migrate_dl_tasks++;
2964 cs->sum_migrate_dl_bw += task->dl.dl_bw;
2965 }
2966 }
2967
2968 if (!cs->nr_migrate_dl_tasks)
2969 goto out_success;
2970
2971 if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
2972 int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
2973
2974 if (unlikely(cpu >= nr_cpu_ids)) {
2975 reset_migrate_dl_data(cs);
2976 ret = -EINVAL;
2977 goto out_unlock;
2978 }
2979
2980 ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
2981 if (ret) {
2982 reset_migrate_dl_data(cs);
2983 goto out_unlock;
2984 }
2985 }
2986
2987 out_success:
2988 /*
2989 * Mark attach is in progress. This makes validate_change() fail
2990 * changes which zero cpus/mems_allowed.
2991 */
2992 cs->attach_in_progress++;
2993 out_unlock:
2994 mutex_unlock(&cpuset_mutex);
2995 return ret;
2996 }
2997
cpuset_cancel_attach(struct cgroup_taskset * tset)2998 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
2999 {
3000 struct cgroup_subsys_state *css;
3001 struct cpuset *cs;
3002
3003 cgroup_taskset_first(tset, &css);
3004 cs = css_cs(css);
3005
3006 mutex_lock(&cpuset_mutex);
3007 dec_attach_in_progress_locked(cs);
3008
3009 if (cs->nr_migrate_dl_tasks) {
3010 int cpu = cpumask_any(cs->effective_cpus);
3011
3012 dl_bw_free(cpu, cs->sum_migrate_dl_bw);
3013 reset_migrate_dl_data(cs);
3014 }
3015
3016 mutex_unlock(&cpuset_mutex);
3017 }
3018
/*
 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
 * but we can't allocate it dynamically there.  Define it global and
 * allocate from cpuset_init().
 *
 * cpuset_attach_nodemask_to is the node mask handed to
 * cpuset_change_task_nodemask() in cpuset_attach_task(); it is filled in by
 * the callers (cpuset_attach() and cpuset_fork()) before tasks are processed.
 */
static cpumask_var_t cpus_attach;
static nodemask_t cpuset_attach_nodemask_to;
3026
cpuset_attach_task(struct cpuset * cs,struct task_struct * task)3027 static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
3028 {
3029 lockdep_assert_cpuset_lock_held();
3030
3031 if (cs != &top_cpuset)
3032 guarantee_active_cpus(task, cpus_attach);
3033 else
3034 cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
3035 subpartitions_cpus);
3036 /*
3037 * can_attach beforehand should guarantee that this doesn't
3038 * fail. TODO: have a better way to handle failure here
3039 */
3040 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
3041
3042 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
3043 cpuset1_update_task_spread_flags(cs, task);
3044 }
3045
/*
 * Commit a migration that cpuset_can_attach() approved: apply the
 * destination cpuset's CPU/memory placement to every task in @tset, rebind
 * (and optionally migrate) the memory of each thread-group leader, move the
 * staged deadline-task accounting from the old cpuset to the new one and
 * finally drop the attach_in_progress count taken in cpuset_can_attach().
 *
 * The source cpuset is taken from cpuset_attach_old_cs, which
 * cpuset_can_attach() recorded.
 */
static void cpuset_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct task_struct *leader;
	struct cgroup_subsys_state *css;
	struct cpuset *cs;
	struct cpuset *oldcs = cpuset_attach_old_cs;
	bool cpus_updated, mems_updated;
	bool queue_task_work = false;

	cgroup_taskset_first(tset, &css);
	cs = css_cs(css);

	lockdep_assert_cpus_held();	/* see cgroup_attach_lock() */
	mutex_lock(&cpuset_mutex);
	cpus_updated = !cpumask_equal(cs->effective_cpus,
				      oldcs->effective_cpus);
	mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);

	/*
	 * In the default hierarchy, enabling cpuset in the child cgroups
	 * will trigger a number of cpuset_attach() calls with no change
	 * in effective cpus and mems. In that case, we can optimize out
	 * by skipping the task iteration and update.
	 */
	if (cpuset_v2() && !cpus_updated && !mems_updated) {
		/* still needed: old_mems_allowed is refreshed from it at "out" */
		cpuset_attach_nodemask_to = cs->effective_mems;
		goto out;
	}

	/* node mask consumed by cpuset_attach_task() for each task below */
	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

	cgroup_taskset_for_each(task, css, tset)
		cpuset_attach_task(cs, task);

	/*
	 * Change mm for all threadgroup leaders. This is expensive and may
	 * sleep and should be moved outside migration path proper. Skip it
	 * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
	 * not set.
	 */
	cpuset_attach_nodemask_to = cs->effective_mems;
	if (!is_memory_migrate(cs) && !mems_updated)
		goto out;

	cgroup_taskset_for_each_leader(leader, css, tset) {
		struct mm_struct *mm = get_task_mm(leader);

		if (mm) {
			mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

			/*
			 * old_mems_allowed is the same with mems_allowed
			 * here, except if this task is being moved
			 * automatically due to hotplug.  In that case
			 * @mems_allowed has been updated and is empty, so
			 * @old_mems_allowed is the right nodesets that we
			 * migrate mm from.
			 */
			if (is_memory_migrate(cs)) {
				/*
				 * NOTE(review): no mmput() on this branch, so
				 * cpuset_migrate_mm() presumably takes over
				 * the mm reference - confirm in its definition.
				 */
				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
						  &cpuset_attach_nodemask_to);
				queue_task_work = true;
			} else
				mmput(mm);
		}
	}

out:
	/* only set when at least one mm migration was started above */
	if (queue_task_work)
		schedule_flush_migrate_mm();
	cs->old_mems_allowed = cpuset_attach_nodemask_to;

	/* move the deadline tasks staged by cpuset_can_attach() over to @cs */
	if (cs->nr_migrate_dl_tasks) {
		cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
		oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
		reset_migrate_dl_data(cs);
	}

	dec_attach_in_progress_locked(cs);

	mutex_unlock(&cpuset_mutex);
}
3129
3130 /*
3131 * Common handling for a write to a "cpus" or "mems" file.
3132 */
cpuset_write_resmask(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)3133 ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
3134 char *buf, size_t nbytes, loff_t off)
3135 {
3136 struct cpuset *cs = css_cs(of_css(of));
3137 struct cpuset *trialcs;
3138 int retval = -ENODEV;
3139
3140 /* root is read-only */
3141 if (cs == &top_cpuset)
3142 return -EACCES;
3143
3144 buf = strstrip(buf);
3145 cpuset_full_lock();
3146 if (!is_cpuset_online(cs))
3147 goto out_unlock;
3148
3149 trialcs = dup_or_alloc_cpuset(cs);
3150 if (!trialcs) {
3151 retval = -ENOMEM;
3152 goto out_unlock;
3153 }
3154
3155 switch (of_cft(of)->private) {
3156 case FILE_CPULIST:
3157 retval = update_cpumask(cs, trialcs, buf);
3158 break;
3159 case FILE_EXCLUSIVE_CPULIST:
3160 retval = update_exclusive_cpumask(cs, trialcs, buf);
3161 break;
3162 case FILE_MEMLIST:
3163 retval = update_nodemask(cs, trialcs, buf);
3164 break;
3165 default:
3166 retval = -EINVAL;
3167 break;
3168 }
3169
3170 free_cpuset(trialcs);
3171 if (force_sd_rebuild)
3172 rebuild_sched_domains_locked();
3173 out_unlock:
3174 cpuset_full_unlock();
3175 if (of_cft(of)->private == FILE_MEMLIST)
3176 schedule_flush_migrate_mm();
3177 return retval ?: nbytes;
3178 }
3179
3180 /*
3181 * These ascii lists should be read in a single call, by using a user
3182 * buffer large enough to hold the entire map. If read in smaller
3183 * chunks, there is no guarantee of atomicity. Since the display format
3184 * used, list of ranges of sequential numbers, is variable length,
3185 * and since these maps can change value dynamically, one could read
3186 * gibberish by doing partial reads while a list was changing.
3187 */
cpuset_common_seq_show(struct seq_file * sf,void * v)3188 int cpuset_common_seq_show(struct seq_file *sf, void *v)
3189 {
3190 struct cpuset *cs = css_cs(seq_css(sf));
3191 cpuset_filetype_t type = seq_cft(sf)->private;
3192 int ret = 0;
3193
3194 spin_lock_irq(&callback_lock);
3195
3196 switch (type) {
3197 case FILE_CPULIST:
3198 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
3199 break;
3200 case FILE_MEMLIST:
3201 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
3202 break;
3203 case FILE_EFFECTIVE_CPULIST:
3204 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
3205 break;
3206 case FILE_EFFECTIVE_MEMLIST:
3207 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
3208 break;
3209 case FILE_EXCLUSIVE_CPULIST:
3210 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
3211 break;
3212 case FILE_EFFECTIVE_XCPULIST:
3213 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
3214 break;
3215 case FILE_SUBPARTS_CPULIST:
3216 seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
3217 break;
3218 case FILE_ISOLATED_CPULIST:
3219 seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
3220 break;
3221 default:
3222 ret = -EINVAL;
3223 }
3224
3225 spin_unlock_irq(&callback_lock);
3226 return ret;
3227 }
3228
cpuset_partition_show(struct seq_file * seq,void * v)3229 static int cpuset_partition_show(struct seq_file *seq, void *v)
3230 {
3231 struct cpuset *cs = css_cs(seq_css(seq));
3232 const char *err, *type = NULL;
3233
3234 switch (cs->partition_root_state) {
3235 case PRS_ROOT:
3236 seq_puts(seq, "root\n");
3237 break;
3238 case PRS_ISOLATED:
3239 seq_puts(seq, "isolated\n");
3240 break;
3241 case PRS_MEMBER:
3242 seq_puts(seq, "member\n");
3243 break;
3244 case PRS_INVALID_ROOT:
3245 type = "root";
3246 fallthrough;
3247 case PRS_INVALID_ISOLATED:
3248 if (!type)
3249 type = "isolated";
3250 err = perr_strings[READ_ONCE(cs->prs_err)];
3251 if (err)
3252 seq_printf(seq, "%s invalid (%s)\n", type, err);
3253 else
3254 seq_printf(seq, "%s invalid\n", type);
3255 break;
3256 }
3257 return 0;
3258 }
3259
cpuset_partition_write(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)3260 static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
3261 size_t nbytes, loff_t off)
3262 {
3263 struct cpuset *cs = css_cs(of_css(of));
3264 int val;
3265 int retval = -ENODEV;
3266
3267 buf = strstrip(buf);
3268
3269 if (!strcmp(buf, "root"))
3270 val = PRS_ROOT;
3271 else if (!strcmp(buf, "member"))
3272 val = PRS_MEMBER;
3273 else if (!strcmp(buf, "isolated"))
3274 val = PRS_ISOLATED;
3275 else
3276 return -EINVAL;
3277
3278 cpuset_full_lock();
3279 if (is_cpuset_online(cs))
3280 retval = update_prstate(cs, val);
3281 cpuset_full_unlock();
3282 return retval ?: nbytes;
3283 }
3284
/*
 * This is currently a minimal set for the default hierarchy. It can be
 * expanded later on by migrating more features and control files from v1.
 *
 * For the entries served by cpuset_common_seq_show() and
 * cpuset_write_resmask(), .private is the file-type value those handlers
 * switch on to pick the mask to print or update.
 */
static struct cftype dfl_files[] = {
	/* shows cs->cpus_allowed; written via update_cpumask() */
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	/* shows cs->mems_allowed; written via update_nodemask() */
	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	/* read-only: shows cs->effective_cpus */
	{
		.name = "cpus.effective",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	/* read-only: shows cs->effective_mems */
	{
		.name = "mems.effective",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	/*
	 * Partition state, with dedicated show/write handlers.
	 * NOTE(review): .file_offset presumably lets the cgroup core store
	 * this file's handle in cs->partition_file - confirm against
	 * struct cftype.
	 */
	{
		.name = "cpus.partition",
		.seq_show = cpuset_partition_show,
		.write = cpuset_partition_write,
		.private = FILE_PARTITION_ROOT,
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct cpuset, partition_file),
	},

	/* shows cs->exclusive_cpus; written via update_exclusive_cpumask() */
	{
		.name = "cpus.exclusive",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_EXCLUSIVE_CPULIST,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	/* read-only: shows cs->effective_xcpus */
	{
		.name = "cpus.exclusive.effective",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_XCPULIST,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	/* read-only: shows the global subpartitions_cpus mask */
	{
		.name = "cpus.subpartitions",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_SUBPARTS_CPULIST,
		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
	},

	/* read-only: shows the global isolated_cpus mask */
	{
		.name = "cpus.isolated",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_ISOLATED_CPULIST,
		.flags = CFTYPE_ONLY_ON_ROOT,
	},

	{ }	/* terminate */
};
3361
3362
3363 /**
3364 * cpuset_css_alloc - Allocate a cpuset css
3365 * @parent_css: Parent css of the control group that the new cpuset will be
3366 * part of
3367 * Return: cpuset css on success, -ENOMEM on failure.
3368 *
3369 * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return
3370 * top cpuset css otherwise.
3371 */
3372 static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state * parent_css)3373 cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
3374 {
3375 struct cpuset *cs;
3376
3377 if (!parent_css)
3378 return &top_cpuset.css;
3379
3380 cs = dup_or_alloc_cpuset(NULL);
3381 if (!cs)
3382 return ERR_PTR(-ENOMEM);
3383
3384 __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
3385 cpuset1_init(cs);
3386
3387 /* Set CS_MEMORY_MIGRATE for default hierarchy */
3388 if (cpuset_v2())
3389 __set_bit(CS_MEMORY_MIGRATE, &cs->flags);
3390
3391 return &cs->css;
3392 }
3393
cpuset_css_online(struct cgroup_subsys_state * css)3394 static int cpuset_css_online(struct cgroup_subsys_state *css)
3395 {
3396 struct cpuset *cs = css_cs(css);
3397 struct cpuset *parent = parent_cs(cs);
3398
3399 if (!parent)
3400 return 0;
3401
3402 cpuset_full_lock();
3403 /*
3404 * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
3405 */
3406 if (cpuset_v2() && !is_sched_load_balance(parent))
3407 clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
3408
3409 cpuset_inc();
3410
3411 spin_lock_irq(&callback_lock);
3412 if (is_in_v2_mode()) {
3413 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
3414 cs->effective_mems = parent->effective_mems;
3415 }
3416 spin_unlock_irq(&callback_lock);
3417 cpuset1_online_css(css);
3418
3419 cpuset_full_unlock();
3420 return 0;
3421 }
3422
3423 /*
3424 * If the cpuset being removed has its flag 'sched_load_balance'
3425 * enabled, then simulate turning sched_load_balance off, which
3426 * will call rebuild_sched_domains_locked(). That is not needed
3427 * in the default hierarchy where only changes in partition
3428 * will cause repartitioning.
3429 */
cpuset_css_offline(struct cgroup_subsys_state * css)3430 static void cpuset_css_offline(struct cgroup_subsys_state *css)
3431 {
3432 struct cpuset *cs = css_cs(css);
3433
3434 cpuset_full_lock();
3435 if (!cpuset_v2() && is_sched_load_balance(cs))
3436 cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
3437
3438 cpuset_dec();
3439 cpuset_full_unlock();
3440 }
3441
3442 /*
3443 * If a dying cpuset has the 'cpus.partition' enabled, turn it off by
3444 * changing it back to member to free its exclusive CPUs back to the pool to
3445 * be used by other online cpusets.
3446 */
cpuset_css_killed(struct cgroup_subsys_state * css)3447 static void cpuset_css_killed(struct cgroup_subsys_state *css)
3448 {
3449 struct cpuset *cs = css_cs(css);
3450
3451 cpuset_full_lock();
3452 /* Reset valid partition back to member */
3453 if (is_partition_valid(cs))
3454 update_prstate(cs, PRS_MEMBER);
3455 cpuset_full_unlock();
3456 }
3457
/* Release the cpuset that embeds @css. */
static void cpuset_css_free(struct cgroup_subsys_state *css)
{
	free_cpuset(css_cs(css));
}
3464
cpuset_bind(struct cgroup_subsys_state * root_css)3465 static void cpuset_bind(struct cgroup_subsys_state *root_css)
3466 {
3467 mutex_lock(&cpuset_mutex);
3468 spin_lock_irq(&callback_lock);
3469
3470 if (is_in_v2_mode()) {
3471 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
3472 cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);
3473 top_cpuset.mems_allowed = node_possible_map;
3474 } else {
3475 cpumask_copy(top_cpuset.cpus_allowed,
3476 top_cpuset.effective_cpus);
3477 top_cpuset.mems_allowed = top_cpuset.effective_mems;
3478 }
3479
3480 spin_unlock_irq(&callback_lock);
3481 mutex_unlock(&cpuset_mutex);
3482 }
3483
3484 /*
3485 * In case the child is cloned into a cpuset different from its parent,
3486 * additional checks are done to see if the move is allowed.
3487 */
cpuset_can_fork(struct task_struct * task,struct css_set * cset)3488 static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
3489 {
3490 struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
3491 bool same_cs;
3492 int ret;
3493
3494 rcu_read_lock();
3495 same_cs = (cs == task_cs(current));
3496 rcu_read_unlock();
3497
3498 if (same_cs)
3499 return 0;
3500
3501 lockdep_assert_held(&cgroup_mutex);
3502 mutex_lock(&cpuset_mutex);
3503
3504 /* Check to see if task is allowed in the cpuset */
3505 ret = cpuset_can_attach_check(cs);
3506 if (ret)
3507 goto out_unlock;
3508
3509 ret = task_can_attach(task);
3510 if (ret)
3511 goto out_unlock;
3512
3513 ret = security_task_setscheduler(task);
3514 if (ret)
3515 goto out_unlock;
3516
3517 /*
3518 * Mark attach is in progress. This makes validate_change() fail
3519 * changes which zero cpus/mems_allowed.
3520 */
3521 cs->attach_in_progress++;
3522 out_unlock:
3523 mutex_unlock(&cpuset_mutex);
3524 return ret;
3525 }
3526
cpuset_cancel_fork(struct task_struct * task,struct css_set * cset)3527 static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
3528 {
3529 struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
3530 bool same_cs;
3531
3532 rcu_read_lock();
3533 same_cs = (cs == task_cs(current));
3534 rcu_read_unlock();
3535
3536 if (same_cs)
3537 return;
3538
3539 dec_attach_in_progress(cs);
3540 }
3541
3542 /*
3543 * Make sure the new task conform to the current state of its parent,
3544 * which could have been changed by cpuset just after it inherits the
3545 * state from the parent and before it sits on the cgroup's task list.
3546 */
cpuset_fork(struct task_struct * task)3547 static void cpuset_fork(struct task_struct *task)
3548 {
3549 struct cpuset *cs;
3550 bool same_cs;
3551
3552 rcu_read_lock();
3553 cs = task_cs(task);
3554 same_cs = (cs == task_cs(current));
3555 rcu_read_unlock();
3556
3557 if (same_cs) {
3558 if (cs == &top_cpuset)
3559 return;
3560
3561 set_cpus_allowed_ptr(task, current->cpus_ptr);
3562 task->mems_allowed = current->mems_allowed;
3563 return;
3564 }
3565
3566 /* CLONE_INTO_CGROUP */
3567 mutex_lock(&cpuset_mutex);
3568 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
3569 cpuset_attach_task(cs, task);
3570
3571 dec_attach_in_progress_locked(cs);
3572 mutex_unlock(&cpuset_mutex);
3573 }
3574
/*
 * cgroup subsystem descriptor for cpuset: wires the css lifecycle, task
 * migration, bind and fork callbacks defined in this file into the cgroup
 * core, together with the control-file tables. The v1 (legacy) file table
 * is only hooked up when CONFIG_CPUSETS_V1 is enabled.
 */
struct cgroup_subsys cpuset_cgrp_subsys = {
	/* css lifecycle */
	.css_alloc	= cpuset_css_alloc,
	.css_online	= cpuset_css_online,
	.css_offline	= cpuset_css_offline,
	.css_killed	= cpuset_css_killed,
	.css_free	= cpuset_css_free,
	/* task migration */
	.can_attach	= cpuset_can_attach,
	.cancel_attach	= cpuset_cancel_attach,
	.attach		= cpuset_attach,
	.bind		= cpuset_bind,
	/* fork handling */
	.can_fork	= cpuset_can_fork,
	.cancel_fork	= cpuset_cancel_fork,
	.fork		= cpuset_fork,
#ifdef CONFIG_CPUSETS_V1
	.legacy_cftypes	= cpuset1_files,
#endif
	.dfl_cftypes	= dfl_files,
	.early_init	= true,
	.threaded	= true,
};
3595
3596 /**
3597 * cpuset_init - initialize cpusets at system boot
3598 *
3599 * Description: Initialize top_cpuset
3600 **/
3601
cpuset_init(void)3602 int __init cpuset_init(void)
3603 {
3604 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
3605 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
3606 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
3607 BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
3608 BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
3609 BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
3610
3611 cpumask_setall(top_cpuset.cpus_allowed);
3612 nodes_setall(top_cpuset.mems_allowed);
3613 cpumask_setall(top_cpuset.effective_cpus);
3614 cpumask_setall(top_cpuset.effective_xcpus);
3615 cpumask_setall(top_cpuset.exclusive_cpus);
3616 nodes_setall(top_cpuset.effective_mems);
3617
3618 cpuset1_init(&top_cpuset);
3619
3620 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
3621
3622 if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT))
3623 cpumask_andnot(isolated_cpus, cpu_possible_mask,
3624 housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT));
3625
3626 return 0;
3627 }
3628
3629 static void
hotplug_update_tasks(struct cpuset * cs,struct cpumask * new_cpus,nodemask_t * new_mems,bool cpus_updated,bool mems_updated)3630 hotplug_update_tasks(struct cpuset *cs,
3631 struct cpumask *new_cpus, nodemask_t *new_mems,
3632 bool cpus_updated, bool mems_updated)
3633 {
3634 /* A partition root is allowed to have empty effective cpus */
3635 if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
3636 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
3637 if (nodes_empty(*new_mems))
3638 *new_mems = parent_cs(cs)->effective_mems;
3639
3640 spin_lock_irq(&callback_lock);
3641 cpumask_copy(cs->effective_cpus, new_cpus);
3642 cs->effective_mems = *new_mems;
3643 spin_unlock_irq(&callback_lock);
3644
3645 if (cpus_updated)
3646 cpuset_update_tasks_cpumask(cs, new_cpus);
3647 if (mems_updated)
3648 cpuset_update_tasks_nodemask(cs);
3649 }
3650
cpuset_force_rebuild(void)3651 void cpuset_force_rebuild(void)
3652 {
3653 force_sd_rebuild = true;
3654 }
3655
/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset in interest
 * @tmp: the tmpmasks structure pointer; may be NULL, in which case the
 *	 partition handling below is skipped
 *
 * Recompute @cs's effective cpu and mem masks from its parent's effective
 * masks and, if they differ from the current ones, update @cs and its tasks
 * accordingly. For partition roots this may also invalidate the partition
 * or bring a previously invalid one back.
 *
 * In v2 mode a cpuset left with no CPU (unless it is a valid partition
 * root) or no memory falls back to its parent's effective masks; the v1
 * handling is delegated to cpuset1_hotplug_update_tasks().
 *
 * Waits for any attach in progress on @cs before touching it.
 */
static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	/* static scratch masks; only accessed with cpuset_mutex held */
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated;
	bool mems_updated;
	bool remote;
	int partcmd = -1;	/* negative: no partition command to issue */
	struct cpuset *parent;
retry:
	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

	mutex_lock(&cpuset_mutex);

	/*
	 * We have raced with task attaching. We wait until attaching
	 * is finished, so we won't attach a task to an empty cpuset.
	 */
	if (cs->attach_in_progress) {
		mutex_unlock(&cpuset_mutex);
		goto retry;
	}

	parent = parent_cs(cs);
	compute_effective_cpumask(&new_cpus, cs, parent);
	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);

	/* plain member cpuset, or no scratch masks: no partition handling */
	if (!tmp || !cs->partition_root_state)
		goto update_tasks;

	/*
	 * Compute effective_cpus for valid partition root, may invalidate
	 * child partition roots if necessary.
	 */
	remote = is_remote_partition(cs);
	if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
		compute_partition_effective_cpumask(cs, &new_cpus);

	/*
	 * A remote partition is disabled when no subpartition CPUs remain or
	 * when it is populated but left without any effective CPU.
	 */
	if (remote && (cpumask_empty(subpartitions_cpus) ||
			(cpumask_empty(&new_cpus) &&
			 partition_is_populated(cs, NULL)))) {
		cs->prs_err = PERR_HOTPLUG;
		remote_partition_disable(cs, tmp);
		compute_effective_cpumask(&new_cpus, cs, parent);
		remote = false;
	}

	/*
	 * Force the partition to become invalid if either one of
	 * the following conditions hold:
	 * 1) empty effective cpus but not valid empty partition.
	 * 2) parent is invalid or doesn't grant any cpus to child
	 *    partitions.
	 * 3) subpartitions_cpus is empty.
	 */
	if (is_local_partition(cs) &&
	    (!is_partition_valid(parent) ||
	     tasks_nocpu_error(parent, cs, &new_cpus) ||
	     cpumask_empty(subpartitions_cpus)))
		partcmd = partcmd_invalidate;
	/*
	 * On the other hand, an invalid partition root may be transitioned
	 * back to a regular one with a non-empty effective xcpus.
	 */
	else if (is_partition_valid(parent) && is_partition_invalid(cs) &&
		 !cpumask_empty(cs->effective_xcpus))
		partcmd = partcmd_update;

	if (partcmd >= 0) {
		update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
		/* partition state changed: recompute and request sd rebuild */
		if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
			compute_partition_effective_cpumask(cs, &new_cpus);
			cpuset_force_rebuild();
		}
	}

update_tasks:
	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
	mems_updated = !nodes_equal(new_mems, cs->effective_mems);
	if (!cpus_updated && !mems_updated)
		goto unlock;  /* Hotplug doesn't affect this cpuset */

	if (mems_updated)
		check_insane_mems_config(&new_mems);

	if (is_in_v2_mode())
		hotplug_update_tasks(cs, &new_cpus, &new_mems,
				     cpus_updated, mems_updated);
	else
		cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems,
					    cpus_updated, mems_updated);

unlock:
	mutex_unlock(&cpuset_mutex);
}
3760
/**
 * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no affect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 *
 * CPU / memory hotplug is handled synchronously.
 */
static void cpuset_handle_hotplug(void)
{
	/* static scratch masks; only accessed with cpuset_mutex held */
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated, mems_updated;
	bool on_dfl = is_in_v2_mode();
	struct tmpmasks tmp, *ptmp = NULL;

	/*
	 * Scratch masks are only allocated in v2 mode. If the allocation
	 * fails ptmp stays NULL, which makes cpuset_hotplug_update_tasks()
	 * skip its partition handling.
	 */
	if (on_dfl && !alloc_tmpmasks(&tmp))
		ptmp = &tmp;

	lockdep_assert_cpus_held();
	mutex_lock(&cpuset_mutex);

	/* fetch the available cpus/mems and find out which changed how */
	cpumask_copy(&new_cpus, cpu_active_mask);
	new_mems = node_states[N_MEMORY];

	/*
	 * If subpartitions_cpus is populated, it is likely that the check
	 * below will produce a false positive on cpus_updated when the cpu
	 * list isn't changed. It is extra work, but it is better to be safe.
	 */
	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
		       !cpumask_empty(subpartitions_cpus);
	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

	/* For v1, synchronize cpus_allowed to cpu_active_mask */
	if (cpus_updated) {
		cpuset_force_rebuild();
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
		/*
		 * Make sure that CPUs allocated to child partitions
		 * do not show up in effective_cpus. If no CPU is left,
		 * we clear the subpartitions_cpus & let the child partitions
		 * fight for the CPUs again.
		 */
		if (!cpumask_empty(subpartitions_cpus)) {
			if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
				cpumask_clear(subpartitions_cpus);
			} else {
				cpumask_andnot(&new_cpus, &new_cpus,
					       subpartitions_cpus);
			}
		}
		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
		spin_unlock_irq(&callback_lock);
		/* we don't mess with cpumasks of tasks in top_cpuset */
	}

	/* synchronize mems_allowed to N_MEMORY */
	if (mems_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			top_cpuset.mems_allowed = new_mems;
		top_cpuset.effective_mems = new_mems;
		spin_unlock_irq(&callback_lock);
		cpuset_update_tasks_nodemask(&top_cpuset);
	}

	mutex_unlock(&cpuset_mutex);

	/* if cpus or mems changed, we need to propagate to descendants */
	if (cpus_updated || mems_updated) {
		struct cpuset *cs;
		struct cgroup_subsys_state *pos_css;

		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			if (cs == &top_cpuset || !css_tryget_online(&cs->css))
				continue;
			/*
			 * The per-cpuset update runs outside the RCU read
			 * section (it takes cpuset_mutex and may wait); the
			 * css reference taken above is held across it.
			 */
			rcu_read_unlock();

			cpuset_hotplug_update_tasks(cs, ptmp);

			rcu_read_lock();
			css_put(&cs->css);
		}
		rcu_read_unlock();
	}

	/* rebuild sched domains if necessary */
	if (force_sd_rebuild)
		rebuild_sched_domains_cpuslocked();

	/* ptmp may still be NULL here (v1 mode or failed allocation) */
	free_tmpmasks(ptmp);
}
3868
/*
 * Entry point for changes to the set of active CPUs.
 *
 * We're inside the cpu hotplug critical region here. The update is
 * processed synchronously by calling cpuset_handle_hotplug() directly;
 * it is not deferred to a work item.
 */
void cpuset_update_active_cpus(void)
{
	cpuset_handle_hotplug();
}
3878
/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 *
 * Notifier callback, registered from cpuset_init_smp().  @self, @action
 * and @arg are not examined: every notification runs
 * cpuset_handle_hotplug(), and NOTIFY_OK is always returned.
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
				unsigned long action, void *arg)
{
	cpuset_handle_hotplug();
	return NOTIFY_OK;
}
3890
/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 */
void __init cpuset_init_smp(void)
{
	/*
	 * cpus_allowed/mems_allowed set to v2 values in the initial
	 * cpuset_bind() call will be reset to v1 values in another
	 * cpuset_bind() call when v1 cpuset is mounted.
	 */
	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;

	/* Seed the effective masks from the active CPUs and N_MEMORY nodes. */
	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
	top_cpuset.effective_mems = node_states[N_MEMORY];

	/* Register cpuset_track_online_nodes() as a node hotplug notifier. */
	hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);

	/* The workqueue is mandatory: failing to allocate it is fatal. */
	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
	BUG_ON(!cpuset_migrate_mm_wq);
}
3913
3914 /*
3915 * Return cpus_allowed mask from a task's cpuset.
3916 */
__cpuset_cpus_allowed_locked(struct task_struct * tsk,struct cpumask * pmask)3917 static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
3918 {
3919 struct cpuset *cs;
3920
3921 cs = task_cs(tsk);
3922 if (cs != &top_cpuset)
3923 guarantee_active_cpus(tsk, pmask);
3924 /*
3925 * Tasks in the top cpuset won't get update to their cpumasks
3926 * when a hotplug online/offline event happens. So we include all
3927 * offline cpus in the allowed cpu list.
3928 */
3929 if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
3930 const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
3931
3932 /*
3933 * We first exclude cpus allocated to partitions. If there is no
3934 * allowable online cpu left, we fall back to all possible cpus.
3935 */
3936 cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
3937 if (!cpumask_intersects(pmask, cpu_active_mask))
3938 cpumask_copy(pmask, possible_mask);
3939 }
3940 }
3941
/**
 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a task's cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Similar to cpuset_cpus_allowed() except that the caller must have acquired
 * cpuset_mutex.
 */
void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
{
	/* Flag callers that did not take the cpuset lock first. */
	lockdep_assert_cpuset_lock_held();
	__cpuset_cpus_allowed_locked(tsk, pmask);
}
3955
3956 /**
3957 * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
3958 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
3959 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
3960 *
3961 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
3962 * attached to the specified @tsk. Guaranteed to return some non-empty
3963 * subset of cpu_active_mask, even if this means going outside the
3964 * tasks cpuset, except when the task is in the top cpuset.
3965 **/
3966
cpuset_cpus_allowed(struct task_struct * tsk,struct cpumask * pmask)3967 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
3968 {
3969 unsigned long flags;
3970
3971 spin_lock_irqsave(&callback_lock, flags);
3972 __cpuset_cpus_allowed_locked(tsk, pmask);
3973 spin_unlock_irqrestore(&callback_lock, flags);
3974 }
3975
3976 /**
3977 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
3978 * @tsk: pointer to task_struct with which the scheduler is struggling
3979 *
3980 * Description: In the case that the scheduler cannot find an allowed cpu in
3981 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
3982 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
3983 * which will not contain a sane cpumask during cases such as cpu hotplugging.
3984 * This is the absolute last resort for the scheduler and it is only used if
3985 * _every_ other avenue has been traveled.
3986 *
3987 * Returns true if the affinity of @tsk was changed, false otherwise.
3988 **/
3989
cpuset_cpus_allowed_fallback(struct task_struct * tsk)3990 bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
3991 {
3992 const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
3993 const struct cpumask *cs_mask;
3994 bool changed = false;
3995
3996 rcu_read_lock();
3997 cs_mask = task_cs(tsk)->cpus_allowed;
3998 if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
3999 set_cpus_allowed_force(tsk, cs_mask);
4000 changed = true;
4001 }
4002 rcu_read_unlock();
4003
4004 /*
4005 * We own tsk->cpus_allowed, nobody can change it under us.
4006 *
4007 * But we used cs && cs->cpus_allowed lockless and thus can
4008 * race with cgroup_attach_task() or update_cpumask() and get
4009 * the wrong tsk->cpus_allowed. However, both cases imply the
4010 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
4011 * which takes task_rq_lock().
4012 *
4013 * If we are called after it dropped the lock we must see all
4014 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
4015 * set any mask even if it is not right from task_cs() pov,
4016 * the pending set_cpus_allowed_ptr() will fix things.
4017 *
4018 * select_fallback_rq() will fix things ups and set cpu_possible_mask
4019 * if required.
4020 */
4021 return changed;
4022 }
4023
/*
 * Init-time (__init) helper: set every node in current->mems_allowed.
 */
void __init cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}
4028
4029 /**
4030 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
4031 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
4032 *
4033 * Description: Returns the nodemask_t mems_allowed of the cpuset
4034 * attached to the specified @tsk. Guaranteed to return some non-empty
4035 * subset of node_states[N_MEMORY], even if this means going outside the
4036 * tasks cpuset.
4037 **/
4038
cpuset_mems_allowed(struct task_struct * tsk)4039 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
4040 {
4041 nodemask_t mask;
4042 unsigned long flags;
4043
4044 spin_lock_irqsave(&callback_lock, flags);
4045 guarantee_online_mems(task_cs(tsk), &mask);
4046 spin_unlock_irqrestore(&callback_lock, flags);
4047
4048 return mask;
4049 }
4050
/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 *
 * Return: the result of nodes_intersects() on *@nodemask and
 * current->mems_allowed, i.e. nonzero when the two masks share a node.
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}
4061
/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_lock.  The walk starts at @cs itself, so @cs is returned when
 * it already carries one of the two flags.  If no ancestor is
 * mem_exclusive or mem_hardwall (an unusual configuration), then returns
 * the root cpuset.
 */
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
	for (;;) {
		struct cpuset *parent;

		if (is_mem_exclusive(cs) || is_mem_hardwall(cs))
			break;

		/* No parent left to climb to: stop here. */
		parent = parent_cs(cs);
		if (!parent)
			break;

		cs = parent;
	}

	return cs;
}
4074
4075 /*
4076 * cpuset_current_node_allowed - Can current task allocate on a memory node?
4077 * @node: is this an allowed node?
4078 * @gfp_mask: memory allocation flags
4079 *
4080 * If we're in interrupt, yes, we can always allocate. If @node is set in
4081 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
4082 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
4083 * yes. If current has access to memory reserves as an oom victim, yes.
4084 * Otherwise, no.
4085 *
4086 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
4087 * and do not allow allocations outside the current tasks cpuset
4088 * unless the task has been OOM killed.
4089 * GFP_KERNEL allocations are not so marked, so can escape to the
4090 * nearest enclosing hardwalled ancestor cpuset.
4091 *
4092 * Scanning up parent cpusets requires callback_lock. The
4093 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
4094 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
4095 * current tasks mems_allowed came up empty on the first pass over
4096 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
4097 * cpuset are short of memory, might require taking the callback_lock.
4098 *
4099 * The first call here from mm/page_alloc:get_page_from_freelist()
4100 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
4101 * so no allocation on a node outside the cpuset is allowed (unless
4102 * in interrupt, of course).
4103 *
4104 * The second pass through get_page_from_freelist() doesn't even call
4105 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
4106 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags.  That logic and the checks below have the combined
 * effect that:
4109 * in_interrupt - any node ok (current task context irrelevant)
4110 * GFP_ATOMIC - any node ok
4111 * tsk_is_oom_victim - any node ok
4112 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
4113 * GFP_USER - only nodes in current tasks mems allowed ok.
4114 */
cpuset_current_node_allowed(int node,gfp_t gfp_mask)4115 bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
4116 {
4117 struct cpuset *cs; /* current cpuset ancestors */
4118 bool allowed; /* is allocation in zone z allowed? */
4119 unsigned long flags;
4120
4121 if (in_interrupt())
4122 return true;
4123 if (node_isset(node, current->mems_allowed))
4124 return true;
4125 /*
4126 * Allow tasks that have access to memory reserves because they have
4127 * been OOM killed to get memory anywhere.
4128 */
4129 if (unlikely(tsk_is_oom_victim(current)))
4130 return true;
4131 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
4132 return false;
4133
4134 if (current->flags & PF_EXITING) /* Let dying task have memory */
4135 return true;
4136
4137 /* Not hardwall and node outside mems_allowed: scan up cpusets */
4138 spin_lock_irqsave(&callback_lock, flags);
4139
4140 cs = nearest_hardwall_ancestor(task_cs(current));
4141 allowed = node_isset(node, cs->mems_allowed);
4142
4143 spin_unlock_irqrestore(&callback_lock, flags);
4144 return allowed;
4145 }
4146
4147 /**
4148 * cpuset_nodes_allowed - return effective_mems mask from a cgroup cpuset.
4149 * @cgroup: pointer to struct cgroup.
4150 * @mask: pointer to struct nodemask_t to be returned.
4151 *
4152 * Returns effective_mems mask from a cgroup cpuset if it is cgroup v2 and
4153 * has cpuset subsys. Otherwise, returns node_states[N_MEMORY].
4154 *
4155 * This function intentionally avoids taking the cpuset_mutex or callback_lock
4156 * when accessing effective_mems. This is because the obtained effective_mems
4157 * is stale immediately after the query anyway (e.g., effective_mems is updated
4158 * immediately after releasing the lock but before returning).
4159 *
4160 * As a result, returned @mask may be empty because cs->effective_mems can be
4161 * rebound during this call. Besides, nodes in @mask are not guaranteed to be
4162 * online due to hot plugins. Callers should check the mask for validity on
4163 * return based on its subsequent use.
4164 **/
cpuset_nodes_allowed(struct cgroup * cgroup,nodemask_t * mask)4165 void cpuset_nodes_allowed(struct cgroup *cgroup, nodemask_t *mask)
4166 {
4167 struct cgroup_subsys_state *css;
4168 struct cpuset *cs;
4169
4170 /*
4171 * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy
4172 * and mems_allowed is likely to be empty even if we could get to it,
4173 * so return directly to avoid taking a global lock on the empty check.
4174 */
4175 if (!cgroup || !cpuset_v2()) {
4176 nodes_copy(*mask, node_states[N_MEMORY]);
4177 return;
4178 }
4179
4180 css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
4181 if (!css) {
4182 nodes_copy(*mask, node_states[N_MEMORY]);
4183 return;
4184 }
4185
4186 /*
4187 * The reference taken via cgroup_get_e_css is sufficient to
4188 * protect css, but it does not imply safe accesses to effective_mems.
4189 *
4190 * Normally, accessing effective_mems would require the cpuset_mutex
4191 * or callback_lock - but the correctness of this information is stale
4192 * immediately after the query anyway. We do not acquire the lock
4193 * during this process to save lock contention in exchange for racing
4194 * against mems_allowed rebinds.
4195 */
4196 cs = container_of(css, struct cpuset, css);
4197 nodes_copy(*mask, cs->effective_mems);
4198 css_put(css);
4199 }
4200
4201 /**
4202 * cpuset_spread_node() - On which node to begin search for a page
4203 * @rotor: round robin rotor
4204 *
4205 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
4206 * tasks in a cpuset with is_spread_page or is_spread_slab set),
4207 * and if the memory allocation used cpuset_mem_spread_node()
4208 * to determine on which node to start looking, as it will for
4209 * certain page cache or slab cache pages such as used for file
4210 * system buffers and inode caches, then instead of starting on the
4211 * local node to look for a free page, rather spread the starting
4212 * node around the tasks mems_allowed nodes.
4213 *
4214 * We don't have to worry about the returned node being offline
4215 * because "it can't happen", and even if it did, it would be ok.
4216 *
4217 * The routines calling guarantee_online_mems() are careful to
4218 * only set nodes in task->mems_allowed that are online. So it
4219 * should not be possible for the following code to return an
4220 * offline node. But if it did, that would be ok, as this routine
4221 * is not returning the node where the allocation must be, only
4222 * the node where the search should start. The zonelist passed to
4223 * __alloc_pages() will include all nodes. If the slab allocator
4224 * is passed an offline node, it will fall back to the local node.
4225 * See kmem_cache_alloc_node().
4226 */
cpuset_spread_node(int * rotor)4227 static int cpuset_spread_node(int *rotor)
4228 {
4229 return *rotor = next_node_in(*rotor, current->mems_allowed);
4230 }
4231
4232 /**
4233 * cpuset_mem_spread_node() - On which node to begin search for a file page
4234 */
cpuset_mem_spread_node(void)4235 int cpuset_mem_spread_node(void)
4236 {
4237 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
4238 current->cpuset_mem_spread_rotor =
4239 node_random(¤t->mems_allowed);
4240
4241 return cpuset_spread_node(¤t->cpuset_mem_spread_rotor);
4242 }
4243
/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 *
 * Only the mems_allowed stored in each task_struct is compared; no lock
 * is taken here.
 */
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
4260
4261 /**
4262 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
4263 *
4264 * Description: Prints current's name, cpuset name, and cached copy of its
4265 * mems_allowed to the kernel log.
4266 */
cpuset_print_current_mems_allowed(void)4267 void cpuset_print_current_mems_allowed(void)
4268 {
4269 struct cgroup *cgrp;
4270
4271 rcu_read_lock();
4272
4273 cgrp = task_cs(current)->css.cgroup;
4274 pr_cont(",cpuset=");
4275 pr_cont_cgroup_name(cgrp);
4276 pr_cont(",mems_allowed=%*pbl",
4277 nodemask_pr_args(¤t->mems_allowed));
4278
4279 rcu_read_unlock();
4280 }
4281
/*
 * Display task mems_allowed in /proc/<pid>/status file.
 *
 * Writes two lines to @m, both from @task->mems_allowed: "Mems_allowed:"
 * with the %*pb format and "Mems_allowed_list:" with the %*pbl format.
 */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
	seq_printf(m, "Mems_allowed:\t%*pb\n",
		   nodemask_pr_args(&task->mems_allowed));
	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
		   nodemask_pr_args(&task->mems_allowed));
}
4290