xref: /linux/kernel/cgroup/cpuset.c (revision d7484babd2c4dcfa1ca02e7e303fab3fab529d75)
1 /*
2  *  kernel/cpuset.c
3  *
4  *  Processor and Memory placement constraints for sets of tasks.
5  *
6  *  Copyright (C) 2003 BULL SA.
7  *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
8  *  Copyright (C) 2006 Google, Inc
9  *
10  *  Portions derived from Patrick Mochel's sysfs code.
11  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
12  *
13  *  2003-10-10 Written by Simon Derr.
14  *  2003-10-22 Updates by Stephen Hemminger.
15  *  2004 May-July Rework by Paul Jackson.
16  *  2006 Rework by Paul Menage to use generic cgroups
17  *  2008 Rework of the scheduler domains and CPU hotplug handling
18  *       by Max Krasnyansky
19  *
20  *  This file is subject to the terms and conditions of the GNU General Public
21  *  License.  See the file COPYING in the main directory of the Linux
22  *  distribution for more details.
23  */
24 #include "cpuset-internal.h"
25 
26 #include <linux/init.h>
27 #include <linux/interrupt.h>
28 #include <linux/kernel.h>
29 #include <linux/mempolicy.h>
30 #include <linux/mm.h>
31 #include <linux/memory.h>
32 #include <linux/export.h>
33 #include <linux/rcupdate.h>
34 #include <linux/sched.h>
35 #include <linux/sched/deadline.h>
36 #include <linux/sched/mm.h>
37 #include <linux/sched/task.h>
38 #include <linux/security.h>
39 #include <linux/oom.h>
40 #include <linux/sched/isolation.h>
41 #include <linux/wait.h>
42 #include <linux/workqueue.h>
43 
44 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
45 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
46 
47 /*
48  * There could be abnormal cpuset configurations for cpu or memory
49  * node binding; this key provides a quick, low-cost check for
50  * that situation.
51  */
52 DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
53 
54 static const char * const perr_strings[] = {
55 	[PERR_INVCPUS]   = "Invalid cpu list in cpuset.cpus.exclusive",
56 	[PERR_INVPARENT] = "Parent is an invalid partition root",
57 	[PERR_NOTPART]   = "Parent is not a partition root",
58 	[PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
59 	[PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
60 	[PERR_HOTPLUG]   = "No cpu available due to hotplug",
61 	[PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
62 	[PERR_HKEEPING]  = "partition config conflicts with housekeeping setup",
63 	[PERR_ACCESS]    = "Enable partition not permitted",
64 };
65 
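/*
 * Editor's note: an illustrative, hedged sketch (not part of this file).
 * The reason strings above are reported to userspace through the
 * "cpuset.cpus.partition" cgroup file as "root invalid (<reason>)" or
 * "isolated invalid (<reason>)". The minimal userspace snippet below,
 * using a hypothetical cgroup path, shows one way to read that reason.
 */
#if 0	/* userspace example, not kernel code */
#include <stdio.h>

int main(void)
{
	/* Hypothetical cgroup path; adjust to the cgroup being inspected. */
	FILE *f = fopen("/sys/fs/cgroup/mypart/cpuset.cpus.partition", "r");
	char buf[128];

	if (f && fgets(buf, sizeof(buf), f))
		printf("partition state: %s", buf);	/* e.g. "root invalid (...)" */
	if (f)
		fclose(f);
	return 0;
}
#endif
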
66 /*
67  * Exclusive CPUs distributed out to sub-partitions of top_cpuset
68  */
69 static cpumask_var_t	subpartitions_cpus;
70 
71 /*
72  * Exclusive CPUs in isolated partitions
73  */
74 static cpumask_var_t	isolated_cpus;
75 
76 /*
77  * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot
78  */
79 static cpumask_var_t	boot_hk_cpus;
80 static bool		have_boot_isolcpus;
81 
82 /* List of remote partition root children */
83 static struct list_head remote_children;
84 
85 /*
86  * A flag to force sched domain rebuild at the end of an operation.
87  * It can be set in
88  *  - update_partition_sd_lb()
89  *  - remote_partition_check()
90  *  - update_cpumasks_hier()
91  *  - cpuset_update_flag()
92  *  - cpuset_hotplug_update_tasks()
93  *  - cpuset_handle_hotplug()
94  *
95  * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
96  *
97  * Note that update_relax_domain_level() in cpuset-v1.c can still call
98  * rebuild_sched_domains_locked() directly without using this flag.
99  */
100 static bool force_sd_rebuild;
101 
102 /*
103  * Partition root states:
104  *
105  *   0 - member (not a partition root)
106  *   1 - partition root
107  *   2 - partition root without load balancing (isolated)
108  *  -1 - invalid partition root
109  *  -2 - invalid isolated partition root
110  *
111  *  There are 2 types of partitions - local or remote. Local partitions are
112  *  those whose parents are partition roots themselves. Setting
113  *  cpuset.cpus.exclusive is optional when setting up local partitions.
114  *  Remote partitions are those whose parents are not partition roots. Passing
115  *  down exclusive CPUs by setting cpuset.cpus.exclusive along the ancestor
116  *  nodes is mandatory when creating a remote partition.
117  *
118  *  For simplicity, a local partition can be created under a local or remote
119  *  partition but a remote partition cannot have any partition root in its
120  *  ancestor chain except the cgroup root.
121  */
122 #define PRS_MEMBER		0
123 #define PRS_ROOT		1
124 #define PRS_ISOLATED		2
125 #define PRS_INVALID_ROOT	-1
126 #define PRS_INVALID_ISOLATED	-2
127 
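/*
 * Editor's note: an illustrative, hedged sketch (not part of this file) of
 * one plausible userspace sequence for the two partition types described
 * above, assuming a mounted cgroup v2 hierarchy and hypothetical cgroup
 * names. A local partition only needs "root" (or "isolated") written to a
 * child of an existing partition root; a remote partition additionally
 * needs cpuset.cpus.exclusive populated along the ancestor chain.
 */
#if 0	/* userspace example, not kernel code */
#include <stdio.h>

/* Write a value to a cgroup control file (error handling elided). */
static void cg_write(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (f) {
		fputs(val, f);
		fclose(f);
	}
}

int main(void)
{
	/* Local partition: the cgroup root is already a partition root. */
	cg_write("/sys/fs/cgroup/lpart/cpuset.cpus", "2-3");
	cg_write("/sys/fs/cgroup/lpart/cpuset.cpus.partition", "root");

	/* Remote partition: pass exclusive CPUs down through a non-partition parent. */
	cg_write("/sys/fs/cgroup/mid/cpuset.cpus.exclusive", "4-5");
	cg_write("/sys/fs/cgroup/mid/rpart/cpuset.cpus", "4-5");
	cg_write("/sys/fs/cgroup/mid/rpart/cpuset.cpus.exclusive", "4-5");
	cg_write("/sys/fs/cgroup/mid/rpart/cpuset.cpus.partition", "isolated");
	return 0;
}
#endif
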
128 static inline bool is_prs_invalid(int prs_state)
129 {
130 	return prs_state < 0;
131 }
132 
133 /*
134  * Temporary cpumasks for working with partitions that are passed among
135  * functions to avoid memory allocation in inner functions.
136  */
137 struct tmpmasks {
138 	cpumask_var_t addmask, delmask;	/* For partition root */
139 	cpumask_var_t new_cpus;		/* For update_cpumasks_hier() */
140 };
141 
142 void inc_dl_tasks_cs(struct task_struct *p)
143 {
144 	struct cpuset *cs = task_cs(p);
145 
146 	cs->nr_deadline_tasks++;
147 }
148 
149 void dec_dl_tasks_cs(struct task_struct *p)
150 {
151 	struct cpuset *cs = task_cs(p);
152 
153 	cs->nr_deadline_tasks--;
154 }
155 
156 static inline int is_partition_valid(const struct cpuset *cs)
157 {
158 	return cs->partition_root_state > 0;
159 }
160 
161 static inline int is_partition_invalid(const struct cpuset *cs)
162 {
163 	return cs->partition_root_state < 0;
164 }
165 
166 /*
167  * Callers should hold callback_lock to modify partition_root_state.
168  */
169 static inline void make_partition_invalid(struct cpuset *cs)
170 {
171 	if (cs->partition_root_state > 0)
172 		cs->partition_root_state = -cs->partition_root_state;
173 }
174 
175 /*
176  * Send a notification event whenever partition_root_state changes.
177  */
178 static inline void notify_partition_change(struct cpuset *cs, int old_prs)
179 {
180 	if (old_prs == cs->partition_root_state)
181 		return;
182 	cgroup_file_notify(&cs->partition_file);
183 
184 	/* Reset prs_err if not invalid */
185 	if (is_partition_valid(cs))
186 		WRITE_ONCE(cs->prs_err, PERR_NONE);
187 }
188 
189 static struct cpuset top_cpuset = {
190 	.flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
191 		 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
192 	.partition_root_state = PRS_ROOT,
193 	.relax_domain_level = -1,
194 	.remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
195 };
196 
197 /*
198  * There are two global locks guarding cpuset structures - cpuset_mutex and
199  * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
200  * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
201  * structures. Note that cpuset_mutex needs to be a mutex as it is used in
202  * paths that rely on priority inheritance (e.g. scheduler - on RT) for
203  * correctness.
204  *
205  * A task must hold both locks to modify cpusets.  If a task holds
206  * cpuset_mutex, it blocks others, ensuring that it is the only task able to
207  * also acquire callback_lock and be able to modify cpusets.  It can perform
208  * various checks on the cpuset structure first, knowing nothing will change.
209  * It can also allocate memory while just holding cpuset_mutex.  While it is
210  * performing these checks, various callback routines can briefly acquire
211  * callback_lock to query cpusets.  Once it is ready to make the changes, it
212  * takes callback_lock, blocking everyone else.
213  *
214  * Calls to the kernel memory allocator can not be made while holding
215  * callback_lock, as that would risk double tripping on callback_lock
216  * from one of the callbacks into the cpuset code from within
217  * __alloc_pages().
218  *
219  * If a task is only holding callback_lock, then it has read-only
220  * access to cpusets.
221  *
222  * Since the task_struct fields mems_allowed and mempolicy may be changed
223  * by another task, we use alloc_lock in the task_struct to protect
224  * them.
225  *
226  * The cpuset_common_seq_show() handlers only hold callback_lock across
227  * small pieces of code, such as when reading out possibly multi-word
228  * cpumasks and nodemasks.
229  */
230 
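/*
 * Editor's note: an illustrative, hedged sketch (not part of this file) of
 * the update pattern described above - do checks and allocations while
 * holding only cpuset_mutex, then publish the change under callback_lock.
 * The function below is a placeholder, not an actual cpuset operation.
 */
#if 0	/* illustrative only */
static int example_update_effective(struct cpuset *cs, const struct cpumask *newmask)
{
	cpumask_var_t scratch;

	mutex_lock(&cpuset_mutex);		/* exclude other cpuset writers */

	/* Memory allocation is fine while only cpuset_mutex is held. */
	if (!zalloc_cpumask_var(&scratch, GFP_KERNEL)) {
		mutex_unlock(&cpuset_mutex);
		return -ENOMEM;
	}
	cpumask_and(scratch, newmask, cpu_online_mask);

	spin_lock_irq(&callback_lock);		/* briefly block readers */
	cpumask_copy(cs->effective_cpus, scratch);
	spin_unlock_irq(&callback_lock);

	mutex_unlock(&cpuset_mutex);
	free_cpumask_var(scratch);
	return 0;
}
#endif
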
231 static DEFINE_MUTEX(cpuset_mutex);
232 
233 void cpuset_lock(void)
234 {
235 	mutex_lock(&cpuset_mutex);
236 }
237 
238 void cpuset_unlock(void)
239 {
240 	mutex_unlock(&cpuset_mutex);
241 }
242 
243 static DEFINE_SPINLOCK(callback_lock);
244 
245 void cpuset_callback_lock_irq(void)
246 {
247 	spin_lock_irq(&callback_lock);
248 }
249 
250 void cpuset_callback_unlock_irq(void)
251 {
252 	spin_unlock_irq(&callback_lock);
253 }
254 
255 static struct workqueue_struct *cpuset_migrate_mm_wq;
256 
257 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
258 
259 static inline void check_insane_mems_config(nodemask_t *nodes)
260 {
261 	if (!cpusets_insane_config() &&
262 		movable_only_nodes(nodes)) {
263 		static_branch_enable(&cpusets_insane_config_key);
264 		pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
265 			"Cpuset allocations might fail even with a lot of memory available.\n",
266 			nodemask_pr_args(nodes));
267 	}
268 }
269 
270 /*
271  * Decrease cs->attach_in_progress and wake up cpuset_attach_wq
272  * when cs->attach_in_progress drops to 0.
273  */
274 static inline void dec_attach_in_progress_locked(struct cpuset *cs)
275 {
276 	lockdep_assert_held(&cpuset_mutex);
277 
278 	cs->attach_in_progress--;
279 	if (!cs->attach_in_progress)
280 		wake_up(&cpuset_attach_wq);
281 }
282 
283 static inline void dec_attach_in_progress(struct cpuset *cs)
284 {
285 	mutex_lock(&cpuset_mutex);
286 	dec_attach_in_progress_locked(cs);
287 	mutex_unlock(&cpuset_mutex);
288 }
289 
290 static inline bool cpuset_v2(void)
291 {
292 	return !IS_ENABLED(CONFIG_CPUSETS_V1) ||
293 		cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
294 }
295 
296 /*
297  * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
298  * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
299  * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
300  * With v2 behavior, "cpus" and "mems" are always what the users have
301  * requested and won't be changed by hotplug events. Only the effective
302  * cpus or mems will be affected.
303  */
304 static inline bool is_in_v2_mode(void)
305 {
306 	return cpuset_v2() ||
307 	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
308 }
309 
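/*
 * Editor's note: an illustrative, hedged sketch (not part of this file).
 * With v2 behavior, "cpuset.cpus" keeps the user's request while
 * "cpuset.cpus.effective" reflects what is actually usable after hotplug
 * and hierarchy constraints. The snippet below, using a hypothetical
 * cgroup path, just prints both files side by side.
 */
#if 0	/* userspace example, not kernel code */
#include <stdio.h>

static void dump(const char *path)
{
	char buf[256];
	FILE *f = fopen(path, "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	if (f)
		fclose(f);
}

int main(void)
{
	dump("/sys/fs/cgroup/mygrp/cpuset.cpus");		/* requested */
	dump("/sys/fs/cgroup/mygrp/cpuset.cpus.effective");	/* effective */
	return 0;
}
#endif
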
310 /**
311  * partition_is_populated - check if partition has tasks
312  * @cs: partition root to be checked
313  * @excluded_child: a child cpuset to be excluded in task checking
314  * Return: true if there are tasks, false otherwise
315  *
316  * It is assumed that @cs is a valid partition root. @excluded_child should
317  * be non-NULL when this cpuset is going to become a partition itself.
318  */
319 static inline bool partition_is_populated(struct cpuset *cs,
320 					  struct cpuset *excluded_child)
321 {
322 	struct cgroup_subsys_state *css;
323 	struct cpuset *child;
324 
325 	if (cs->css.cgroup->nr_populated_csets)
326 		return true;
327 	if (!excluded_child && !cs->nr_subparts)
328 		return cgroup_is_populated(cs->css.cgroup);
329 
330 	rcu_read_lock();
331 	cpuset_for_each_child(child, css, cs) {
332 		if (child == excluded_child)
333 			continue;
334 		if (is_partition_valid(child))
335 			continue;
336 		if (cgroup_is_populated(child->css.cgroup)) {
337 			rcu_read_unlock();
338 			return true;
339 		}
340 	}
341 	rcu_read_unlock();
342 	return false;
343 }
344 
345 /*
346  * Return in pmask the portion of a task's cpuset's cpus_allowed that
347  * are online and are capable of running the task.  If none are found,
348  * walk up the cpuset hierarchy until we find one that does have some
349  * appropriate cpus.
350  *
351  * One way or another, we guarantee to return some non-empty subset
352  * of cpu_online_mask.
353  *
354  * Call with callback_lock or cpuset_mutex held.
355  */
356 static void guarantee_online_cpus(struct task_struct *tsk,
357 				  struct cpumask *pmask)
358 {
359 	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
360 	struct cpuset *cs;
361 
362 	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
363 		cpumask_copy(pmask, cpu_online_mask);
364 
365 	rcu_read_lock();
366 	cs = task_cs(tsk);
367 
368 	while (!cpumask_intersects(cs->effective_cpus, pmask))
369 		cs = parent_cs(cs);
370 
371 	cpumask_and(pmask, pmask, cs->effective_cpus);
372 	rcu_read_unlock();
373 }
374 
375 /*
376  * Return in *pmask the portion of a cpuset's mems_allowed that
377  * are online, with memory.  If none are online with memory, walk
378  * up the cpuset hierarchy until we find one that does have some
379  * online mems.  The top cpuset always has some mems online.
380  *
381  * One way or another, we guarantee to return some non-empty subset
382  * of node_states[N_MEMORY].
383  *
384  * Call with callback_lock or cpuset_mutex held.
385  */
386 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
387 {
388 	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
389 		cs = parent_cs(cs);
390 	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
391 }
392 
393 /**
394  * alloc_cpumasks - allocate cpumasks for a cpuset or a tmpmasks structure
395  * @cs:  the cpuset that has cpumasks to be allocated.
396  * @tmp: the tmpmasks structure pointer
397  * Return: 0 if successful, -ENOMEM otherwise.
398  *
399  * Only one of the two input arguments should be non-NULL.
400  */
401 static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
402 {
403 	cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4;
404 
405 	if (cs) {
406 		pmask1 = &cs->cpus_allowed;
407 		pmask2 = &cs->effective_cpus;
408 		pmask3 = &cs->effective_xcpus;
409 		pmask4 = &cs->exclusive_cpus;
410 	} else {
411 		pmask1 = &tmp->new_cpus;
412 		pmask2 = &tmp->addmask;
413 		pmask3 = &tmp->delmask;
414 		pmask4 = NULL;
415 	}
416 
417 	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
418 		return -ENOMEM;
419 
420 	if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
421 		goto free_one;
422 
423 	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
424 		goto free_two;
425 
426 	if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
427 		goto free_three;
428 
429 
430 	return 0;
431 
432 free_three:
433 	free_cpumask_var(*pmask3);
434 free_two:
435 	free_cpumask_var(*pmask2);
436 free_one:
437 	free_cpumask_var(*pmask1);
438 	return -ENOMEM;
439 }
440 
441 /**
442  * free_cpumasks - free cpumasks in a tmpmasks structure
443  * @cs:  the cpuset that have cpumasks to be free.
444  * @tmp: the tmpmasks structure pointer
445  */
446 static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
447 {
448 	if (cs) {
449 		free_cpumask_var(cs->cpus_allowed);
450 		free_cpumask_var(cs->effective_cpus);
451 		free_cpumask_var(cs->effective_xcpus);
452 		free_cpumask_var(cs->exclusive_cpus);
453 	}
454 	if (tmp) {
455 		free_cpumask_var(tmp->new_cpus);
456 		free_cpumask_var(tmp->addmask);
457 		free_cpumask_var(tmp->delmask);
458 	}
459 }
460 
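/*
 * Editor's note: an illustrative, hedged sketch (not part of this file) of
 * the intended alloc_cpumasks()/free_cpumasks() pairing for the tmpmasks
 * case (cs == NULL), mirroring how callers in this file use temporary
 * masks to avoid allocations in inner functions.
 */
#if 0	/* illustrative only */
static int example_with_tmpmasks(void)
{
	struct tmpmasks tmp;

	if (alloc_cpumasks(NULL, &tmp))		/* allocates new_cpus, addmask, delmask */
		return -ENOMEM;

	/* ... use tmp.new_cpus / tmp.addmask / tmp.delmask here ... */

	free_cpumasks(NULL, &tmp);		/* tmp path frees the three masks */
	return 0;
}
#endif
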
461 /**
462  * alloc_trial_cpuset - allocate a trial cpuset
463  * @cs: the cpuset that the trial cpuset duplicates
464  */
465 static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
466 {
467 	struct cpuset *trial;
468 
469 	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
470 	if (!trial)
471 		return NULL;
472 
473 	if (alloc_cpumasks(trial, NULL)) {
474 		kfree(trial);
475 		return NULL;
476 	}
477 
478 	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
479 	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
480 	cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
481 	cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
482 	return trial;
483 }
484 
485 /**
486  * free_cpuset - free the cpuset
487  * @cs: the cpuset to be freed
488  */
489 static inline void free_cpuset(struct cpuset *cs)
490 {
491 	free_cpumasks(cs, NULL);
492 	kfree(cs);
493 }
494 
495 /* Return user specified exclusive CPUs */
496 static inline struct cpumask *user_xcpus(struct cpuset *cs)
497 {
498 	return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
499 						 : cs->exclusive_cpus;
500 }
501 
502 static inline bool xcpus_empty(struct cpuset *cs)
503 {
504 	return cpumask_empty(cs->cpus_allowed) &&
505 	       cpumask_empty(cs->exclusive_cpus);
506 }
507 
508 /*
509  * cpusets_are_exclusive() - check if two cpusets are exclusive
510  *
511  * Return true if exclusive, false if not
512  */
513 static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
514 {
515 	struct cpumask *xcpus1 = user_xcpus(cs1);
516 	struct cpumask *xcpus2 = user_xcpus(cs2);
517 
518 	if (cpumask_intersects(xcpus1, xcpus2))
519 		return false;
520 	return true;
521 }
522 
523 /*
524  * validate_change() - Used to validate that any proposed cpuset change
525  *		       follows the structural rules for cpusets.
526  *
527  * If we replaced the flag and mask values of the current cpuset
528  * (cur) with those values in the trial cpuset (trial), would
529  * our various subset and exclusive rules still be valid?  Presumes
530  * cpuset_mutex held.
531  *
532  * 'cur' is the address of an actual, in-use cpuset.  Operations
533  * such as list traversal that depend on the actual address of the
534  * cpuset in the list must use cur below, not trial.
535  *
536  * 'trial' is the address of a bulk structure copy of cur, with
537  * perhaps one or more of the fields cpus_allowed, mems_allowed,
538  * or flags changed to new, trial values.
539  *
540  * Return 0 if valid, -errno if not.
541  */
542 
543 static int validate_change(struct cpuset *cur, struct cpuset *trial)
544 {
545 	struct cgroup_subsys_state *css;
546 	struct cpuset *c, *par;
547 	int ret = 0;
548 
549 	rcu_read_lock();
550 
551 	if (!is_in_v2_mode())
552 		ret = cpuset1_validate_change(cur, trial);
553 	if (ret)
554 		goto out;
555 
556 	/* Remaining checks don't apply to root cpuset */
557 	if (cur == &top_cpuset)
558 		goto out;
559 
560 	par = parent_cs(cur);
561 
562 	/*
563 	 * Cpusets with tasks - existing or newly being attached - can't
564 	 * be changed to have empty cpus_allowed or mems_allowed.
565 	 */
566 	ret = -ENOSPC;
567 	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
568 		if (!cpumask_empty(cur->cpus_allowed) &&
569 		    cpumask_empty(trial->cpus_allowed))
570 			goto out;
571 		if (!nodes_empty(cur->mems_allowed) &&
572 		    nodes_empty(trial->mems_allowed))
573 			goto out;
574 	}
575 
576 	/*
577 	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
578 	 * tasks. This check is not done when scheduling is disabled as the
579 	 * users should know what they are doing.
580 	 *
581 	 * For v1, effective_cpus == cpus_allowed & user_xcpus() returns
582 	 * cpus_allowed.
583 	 *
584 	 * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only
585 	 * for non-isolated partition root. At this point, the target
586 	 * effective_cpus isn't computed yet. user_xcpus() is the best
587 	 * approximation.
588 	 *
589 	 * TBD: May need to precompute the real effective_cpus here in case
590 	 * incorrect scheduling of SCHED_DEADLINE tasks in a partition
591 	 * becomes an issue.
592 	 */
593 	ret = -EBUSY;
594 	if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&
595 	    !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))
596 		goto out;
597 
598 	/*
599 	 * If either I or some sibling (!= me) is exclusive, we can't
600 	 * overlap. The exclusive_cpus masks, if set, cannot overlap with each other.
601 	 */
602 	ret = -EINVAL;
603 	cpuset_for_each_child(c, css, par) {
604 		bool txset, cxset;	/* Are exclusive_cpus set? */
605 
606 		if (c == cur)
607 			continue;
608 
609 		txset = !cpumask_empty(trial->exclusive_cpus);
610 		cxset = !cpumask_empty(c->exclusive_cpus);
611 		if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) ||
612 		    (txset && cxset)) {
613 			if (!cpusets_are_exclusive(trial, c))
614 				goto out;
615 		} else if (txset || cxset) {
616 			struct cpumask *xcpus, *acpus;
617 
618 			/*
619 			 * When just one of the exclusive_cpus masks is set,
620 			 * the cpus_allowed of the other cpuset, if set, cannot be
621 			 * a subset of it; otherwise none of those CPUs will be
622 			 * available when these exclusive CPUs are activated.
623 			 */
624 			if (txset) {
625 				xcpus = trial->exclusive_cpus;
626 				acpus = c->cpus_allowed;
627 			} else {
628 				xcpus = c->exclusive_cpus;
629 				acpus = trial->cpus_allowed;
630 			}
631 			if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus))
632 				goto out;
633 		}
634 		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
635 		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
636 			goto out;
637 	}
638 
639 	ret = 0;
640 out:
641 	rcu_read_unlock();
642 	return ret;
643 }
644 
645 #ifdef CONFIG_SMP
646 /*
647  * Helper routine for generate_sched_domains().
648  * Do cpusets a, b have overlapping effective cpus_allowed masks?
649  */
650 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
651 {
652 	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
653 }
654 
655 static void
656 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
657 {
658 	if (dattr->relax_domain_level < c->relax_domain_level)
659 		dattr->relax_domain_level = c->relax_domain_level;
660 	return;
661 }
662 
663 static void update_domain_attr_tree(struct sched_domain_attr *dattr,
664 				    struct cpuset *root_cs)
665 {
666 	struct cpuset *cp;
667 	struct cgroup_subsys_state *pos_css;
668 
669 	rcu_read_lock();
670 	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
671 		/* skip the whole subtree if @cp doesn't have any CPU */
672 		if (cpumask_empty(cp->cpus_allowed)) {
673 			pos_css = css_rightmost_descendant(pos_css);
674 			continue;
675 		}
676 
677 		if (is_sched_load_balance(cp))
678 			update_domain_attr(dattr, cp);
679 	}
680 	rcu_read_unlock();
681 }
682 
683 /* Must be called with cpuset_mutex held.  */
684 static inline int nr_cpusets(void)
685 {
686 	/* jump label reference count + the top-level cpuset */
687 	return static_key_count(&cpusets_enabled_key.key) + 1;
688 }
689 
690 /*
691  * generate_sched_domains()
692  *
693  * This function builds a partial partition of the system's CPUs.
694  * A 'partial partition' is a set of non-overlapping subsets whose
695  * union is a subset of the system's CPUs.
696  * The output of this function needs to be passed to kernel/sched/core.c
697  * partition_sched_domains() routine, which will rebuild the scheduler's
698  * load balancing domains (sched domains) as specified by that partial
699  * partition.
700  *
701  * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
702  * for a background explanation of this.
703  *
704  * Does not return errors, on the theory that the callers of this
705  * routine would rather not worry about failures to rebuild sched
706  * domains when operating in the severe memory shortage situations
707  * that could cause allocation failures below.
708  *
709  * Must be called with cpuset_mutex held.
710  *
711  * The three key local variables below are:
712  *    cp - cpuset pointer, used (together with pos_css) to perform a
713  *	   top-down scan of all cpusets. For our purposes, rebuilding
714  *	   the scheduler's sched domains, we can ignore !is_sched_load_
715  *	   balance cpusets.
716  *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
717  *	   that need to be load balanced, for convenient iterative
718  *	   access by the subsequent code that finds the best partition,
719  *	   i.e. the set of domains (subsets) of CPUs such that the
720  *	   cpus_allowed of every cpuset marked is_sched_load_balance
721  *	   is a subset of one of these domains, while there are as
722  *	   many such domains as possible, each as small as possible.
723  * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
724  *	   the kernel/sched/core.c routine partition_sched_domains() in a
725  *	   convenient format, that can be easily compared to the prior
726  *	   value to determine what partition elements (sched domains)
727  *	   were changed (added or removed.)
728  *
729  * Finding the best partition (set of domains):
730  *	The double nested loops below over i, j scan over the load
731  *	balanced cpusets (using the array of cpuset pointers in csa[])
732  *	looking for pairs of cpusets that have overlapping cpus_allowed
733  *	and merging them using a union-find algorithm.
734  *
735  *	The union of the cpus_allowed masks from the set of all cpusets
736  *	having the same root then forms one element of the partition
737  *	(one sched domain) to be passed to partition_sched_domains().
738  *
739  */
740 static int generate_sched_domains(cpumask_var_t **domains,
741 			struct sched_domain_attr **attributes)
742 {
743 	struct cpuset *cp;	/* top-down scan of cpusets */
744 	struct cpuset **csa;	/* array of all cpuset ptrs */
745 	int csn;		/* how many cpuset ptrs in csa so far */
746 	int i, j;		/* indices for partition finding loops */
747 	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
748 	struct sched_domain_attr *dattr;  /* attributes for custom domains */
749 	int ndoms = 0;		/* number of sched domains in result */
750 	int nslot;		/* next empty doms[] struct cpumask slot */
751 	struct cgroup_subsys_state *pos_css;
752 	bool root_load_balance = is_sched_load_balance(&top_cpuset);
753 	bool cgrpv2 = cpuset_v2();
754 	int nslot_update;
755 
756 	doms = NULL;
757 	dattr = NULL;
758 	csa = NULL;
759 
760 	/* Special case for the 99% of systems with one, full, sched domain */
761 	if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
762 single_root_domain:
763 		ndoms = 1;
764 		doms = alloc_sched_domains(ndoms);
765 		if (!doms)
766 			goto done;
767 
768 		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
769 		if (dattr) {
770 			*dattr = SD_ATTR_INIT;
771 			update_domain_attr_tree(dattr, &top_cpuset);
772 		}
773 		cpumask_and(doms[0], top_cpuset.effective_cpus,
774 			    housekeeping_cpumask(HK_TYPE_DOMAIN));
775 
776 		goto done;
777 	}
778 
779 	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
780 	if (!csa)
781 		goto done;
782 	csn = 0;
783 
784 	rcu_read_lock();
785 	if (root_load_balance)
786 		csa[csn++] = &top_cpuset;
787 	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
788 		if (cp == &top_cpuset)
789 			continue;
790 
791 		if (cgrpv2)
792 			goto v2;
793 
794 		/*
795 		 * v1:
796 		 * Continue traversing beyond @cp iff @cp has some CPUs and
797 		 * isn't load balancing.  The former is obvious.  The
798 		 * latter: All child cpusets contain a subset of the
799 		 * parent's cpus, so just skip them, and then we call
800 		 * update_domain_attr_tree() to calc relax_domain_level of
801 		 * the corresponding sched domain.
802 		 */
803 		if (!cpumask_empty(cp->cpus_allowed) &&
804 		    !(is_sched_load_balance(cp) &&
805 		      cpumask_intersects(cp->cpus_allowed,
806 					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
807 			continue;
808 
809 		if (is_sched_load_balance(cp) &&
810 		    !cpumask_empty(cp->effective_cpus))
811 			csa[csn++] = cp;
812 
813 		/* skip @cp's subtree */
814 		pos_css = css_rightmost_descendant(pos_css);
815 		continue;
816 
817 v2:
818 		/*
819 		 * Only valid partition roots that are not isolated and with
820 		 * non-empty effective_cpus will be saved into csa[].
821 		 */
822 		if ((cp->partition_root_state == PRS_ROOT) &&
823 		    !cpumask_empty(cp->effective_cpus))
824 			csa[csn++] = cp;
825 
826 		/*
827 		 * Skip @cp's subtree if not a partition root and has no
828 		 * exclusive CPUs to be granted to child cpusets.
829 		 */
830 		if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
831 			pos_css = css_rightmost_descendant(pos_css);
832 	}
833 	rcu_read_unlock();
834 
835 	/*
836 	 * If there are only isolated partitions underneath the cgroup root,
837 	 * we can optimize out unneeded sched domains scanning.
838 	 */
839 	if (root_load_balance && (csn == 1))
840 		goto single_root_domain;
841 
842 	for (i = 0; i < csn; i++)
843 		uf_node_init(&csa[i]->node);
844 
845 	/* Merge overlapping cpusets */
846 	for (i = 0; i < csn; i++) {
847 		for (j = i + 1; j < csn; j++) {
848 			if (cpusets_overlap(csa[i], csa[j])) {
849 				/*
850 				 * Cgroup v2 shouldn't pass down overlapping
851 				 * partition root cpusets.
852 				 */
853 				WARN_ON_ONCE(cgrpv2);
854 				uf_union(&csa[i]->node, &csa[j]->node);
855 			}
856 		}
857 	}
858 
859 	/* Count the total number of domains */
860 	for (i = 0; i < csn; i++) {
861 		if (uf_find(&csa[i]->node) == &csa[i]->node)
862 			ndoms++;
863 	}
864 
865 	/*
866 	 * Now we know how many domains to create.
867 	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
868 	 */
869 	doms = alloc_sched_domains(ndoms);
870 	if (!doms)
871 		goto done;
872 
873 	/*
874 	 * The rest of the code, including the scheduler, can deal with
875 	 * dattr==NULL case. No need to abort if alloc fails.
876 	 */
877 	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
878 			      GFP_KERNEL);
879 
880 	/*
881 	 * Cgroup v2 doesn't support domain attributes, just set all of them
882 	 * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
883 	 * subset of HK_TYPE_DOMAIN housekeeping CPUs.
884 	 */
885 	if (cgrpv2) {
886 		for (i = 0; i < ndoms; i++) {
887 			/*
888 			 * The top cpuset may contain some boot time isolated
889 			 * CPUs that need to be excluded from the sched domain.
890 			 */
891 			if (csa[i] == &top_cpuset)
892 				cpumask_and(doms[i], csa[i]->effective_cpus,
893 					    housekeeping_cpumask(HK_TYPE_DOMAIN));
894 			else
895 				cpumask_copy(doms[i], csa[i]->effective_cpus);
896 			if (dattr)
897 				dattr[i] = SD_ATTR_INIT;
898 		}
899 		goto done;
900 	}
901 
902 	for (nslot = 0, i = 0; i < csn; i++) {
903 		nslot_update = 0;
904 		for (j = i; j < csn; j++) {
905 			if (uf_find(&csa[j]->node) == &csa[i]->node) {
906 				struct cpumask *dp = doms[nslot];
907 
908 				if (i == j) {
909 					nslot_update = 1;
910 					cpumask_clear(dp);
911 					if (dattr)
912 						*(dattr + nslot) = SD_ATTR_INIT;
913 				}
914 				cpumask_or(dp, dp, csa[j]->effective_cpus);
915 				cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
916 				if (dattr)
917 					update_domain_attr_tree(dattr + nslot, csa[j]);
918 			}
919 		}
920 		if (nslot_update)
921 			nslot++;
922 	}
923 	BUG_ON(nslot != ndoms);
924 
925 done:
926 	kfree(csa);
927 
928 	/*
929 	 * Fallback to the default domain if kmalloc() failed.
930 	 * See comments in partition_sched_domains().
931 	 */
932 	if (doms == NULL)
933 		ndoms = 1;
934 
935 	*domains    = doms;
936 	*attributes = dattr;
937 	return ndoms;
938 }
939 
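/*
 * Editor's note: an illustrative, hedged sketch (not part of this file).
 * It mimics, in plain userspace C with int bitmasks, the merging step
 * described above generate_sched_domains(): cpusets whose CPU masks
 * overlap end up in the same sched domain. The kernel itself uses
 * cpumasks and the union-find helpers (uf_node_init/uf_union/uf_find).
 */
#if 0	/* userspace example, not kernel code */
#include <stdio.h>

#define NSETS 3

static int root[NSETS];

static int find(int i)
{
	while (root[i] != i)
		i = root[i] = root[root[i]];	/* path halving */
	return i;
}

int main(void)
{
	/* cpus_allowed as bitmasks: A = CPUs 0-3, B = 2-5, C = 6-7 */
	unsigned int cpus[NSETS] = { 0x0f, 0x3c, 0xc0 };
	unsigned int dom[NSETS] = { 0 };
	int i, j, ndoms = 0;

	for (i = 0; i < NSETS; i++)
		root[i] = i;

	/* Merge overlapping cpusets, as the double loop in the kernel does. */
	for (i = 0; i < NSETS; i++)
		for (j = i + 1; j < NSETS; j++)
			if (cpus[i] & cpus[j])
				root[find(j)] = find(i);

	/* Each union-find root becomes one sched domain (union of members). */
	for (i = 0; i < NSETS; i++)
		dom[find(i)] |= cpus[i];
	for (i = 0; i < NSETS; i++)
		if (find(i) == i)
			printf("domain %d: mask 0x%02x\n", ndoms++, dom[i]);

	/* Prints two domains: 0x3f (CPUs 0-5) and 0xc0 (CPUs 6-7). */
	return 0;
}
#endif
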
940 static void dl_update_tasks_root_domain(struct cpuset *cs)
941 {
942 	struct css_task_iter it;
943 	struct task_struct *task;
944 
945 	if (cs->nr_deadline_tasks == 0)
946 		return;
947 
948 	css_task_iter_start(&cs->css, 0, &it);
949 
950 	while ((task = css_task_iter_next(&it)))
951 		dl_add_task_root_domain(task);
952 
953 	css_task_iter_end(&it);
954 }
955 
956 void dl_rebuild_rd_accounting(void)
957 {
958 	struct cpuset *cs = NULL;
959 	struct cgroup_subsys_state *pos_css;
960 	int cpu;
961 	u64 cookie = ++dl_cookie;
962 
963 	lockdep_assert_held(&cpuset_mutex);
964 	lockdep_assert_cpus_held();
965 	lockdep_assert_held(&sched_domains_mutex);
966 
967 	rcu_read_lock();
968 
969 	for_each_possible_cpu(cpu) {
970 		if (dl_bw_visited(cpu, cookie))
971 			continue;
972 
973 		dl_clear_root_domain_cpu(cpu);
974 	}
975 
976 	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
977 
978 		if (cpumask_empty(cs->effective_cpus)) {
979 			pos_css = css_rightmost_descendant(pos_css);
980 			continue;
981 		}
982 
983 		css_get(&cs->css);
984 
985 		rcu_read_unlock();
986 
987 		dl_update_tasks_root_domain(cs);
988 
989 		rcu_read_lock();
990 		css_put(&cs->css);
991 	}
992 	rcu_read_unlock();
993 }
994 
995 /*
996  * Rebuild scheduler domains.
997  *
998  * If the flag 'sched_load_balance' of any cpuset with non-empty
999  * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
1000  * which has that flag enabled, or if any cpuset with a non-empty
1001  * 'cpus' is removed, then call this routine to rebuild the
1002  * scheduler's dynamic sched domains.
1003  *
1004  * Call with both cpuset_mutex and cpus_read_lock() held.
1005  */
1006 void rebuild_sched_domains_locked(void)
1007 {
1008 	struct cgroup_subsys_state *pos_css;
1009 	struct sched_domain_attr *attr;
1010 	cpumask_var_t *doms;
1011 	struct cpuset *cs;
1012 	int ndoms;
1013 
1014 	lockdep_assert_cpus_held();
1015 	lockdep_assert_held(&cpuset_mutex);
1016 	force_sd_rebuild = false;
1017 
1018 	/*
1019 	 * If we have raced with CPU hotplug, return early to avoid
1020 	 * passing doms with offlined cpu to partition_sched_domains().
1021 	 * Anyway, cpuset_handle_hotplug() will rebuild sched domains.
1022 	 *
1023 	 * With no CPUs in any subpartitions, top_cpuset's effective CPUs
1024 	 * should be the same as the active CPUs, so checking only top_cpuset
1025 	 * is enough to detect racing CPU offlines.
1026 	 */
1027 	if (cpumask_empty(subpartitions_cpus) &&
1028 	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
1029 		return;
1030 
1031 	/*
1032 	 * With subpartition CPUs, however, the effective CPUs of a partition
1033 	 * root should be only a subset of the active CPUs.  Since a CPU in any
1034 	 * partition root could be offlined, all must be checked.
1035 	 */
1036 	if (!cpumask_empty(subpartitions_cpus)) {
1037 		rcu_read_lock();
1038 		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
1039 			if (!is_partition_valid(cs)) {
1040 				pos_css = css_rightmost_descendant(pos_css);
1041 				continue;
1042 			}
1043 			if (!cpumask_subset(cs->effective_cpus,
1044 					    cpu_active_mask)) {
1045 				rcu_read_unlock();
1046 				return;
1047 			}
1048 		}
1049 		rcu_read_unlock();
1050 	}
1051 
1052 	/* Generate domain masks and attrs */
1053 	ndoms = generate_sched_domains(&doms, &attr);
1054 
1055 	/* Have scheduler rebuild the domains */
1056 	partition_sched_domains(ndoms, doms, attr);
1057 }
1058 #else /* !CONFIG_SMP */
1059 void rebuild_sched_domains_locked(void)
1060 {
1061 }
1062 #endif /* CONFIG_SMP */
1063 
1064 static void rebuild_sched_domains_cpuslocked(void)
1065 {
1066 	mutex_lock(&cpuset_mutex);
1067 	rebuild_sched_domains_locked();
1068 	mutex_unlock(&cpuset_mutex);
1069 }
1070 
1071 void rebuild_sched_domains(void)
1072 {
1073 	cpus_read_lock();
1074 	rebuild_sched_domains_cpuslocked();
1075 	cpus_read_unlock();
1076 }
1077 
1078 void cpuset_reset_sched_domains(void)
1079 {
1080 	mutex_lock(&cpuset_mutex);
1081 	partition_sched_domains(1, NULL, NULL);
1082 	mutex_unlock(&cpuset_mutex);
1083 }
1084 
1085 /**
1086  * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
1087  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
1088  * @new_cpus: the temp variable for the new effective_cpus mask
1089  *
1090  * Iterate through each task of @cs updating its cpus_allowed to the
1091  * effective cpuset's.  As this function is called with cpuset_mutex held,
1092  * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
1093  * is used instead of effective_cpus to make sure all offline CPUs are also
1094  * included as hotplug code won't update cpumasks for tasks in top_cpuset.
1095  */
1096 void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
1097 {
1098 	struct css_task_iter it;
1099 	struct task_struct *task;
1100 	bool top_cs = cs == &top_cpuset;
1101 
1102 	css_task_iter_start(&cs->css, 0, &it);
1103 	while ((task = css_task_iter_next(&it))) {
1104 		const struct cpumask *possible_mask = task_cpu_possible_mask(task);
1105 
1106 		if (top_cs) {
1107 			/*
1108 			 * Percpu kthreads in top_cpuset are ignored
1109 			 */
1110 			if (kthread_is_per_cpu(task))
1111 				continue;
1112 			cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
1113 		} else {
1114 			cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
1115 		}
1116 		set_cpus_allowed_ptr(task, new_cpus);
1117 	}
1118 	css_task_iter_end(&it);
1119 }
1120 
1121 /**
1122  * compute_effective_cpumask - Compute the effective cpumask of the cpuset
1123  * @new_cpus: the temp variable for the new effective_cpus mask
1124  * @cs: the cpuset whose new effective_cpus mask needs to be recomputed
1125  * @parent: the parent cpuset
1126  *
1127  * The result is valid only if the given cpuset isn't a partition root.
1128  */
1129 static void compute_effective_cpumask(struct cpumask *new_cpus,
1130 				      struct cpuset *cs, struct cpuset *parent)
1131 {
1132 	cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
1133 }
1134 
1135 /*
1136  * Commands for update_parent_effective_cpumask
1137  */
1138 enum partition_cmd {
1139 	partcmd_enable,		/* Enable partition root	  */
1140 	partcmd_enablei,	/* Enable isolated partition root */
1141 	partcmd_disable,	/* Disable partition root	  */
1142 	partcmd_update,		/* Update parent's effective_cpus */
1143 	partcmd_invalidate,	/* Make partition invalid	  */
1144 };
1145 
1146 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
1147 				    struct tmpmasks *tmp);
1148 
1149 /*
1150  * Update partition exclusive flag
1151  *
1152  * Return: 0 if successful, an error code otherwise
1153  */
1154 static int update_partition_exclusive(struct cpuset *cs, int new_prs)
1155 {
1156 	bool exclusive = (new_prs > PRS_MEMBER);
1157 
1158 	if (exclusive && !is_cpu_exclusive(cs)) {
1159 		if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1))
1160 			return PERR_NOTEXCL;
1161 	} else if (!exclusive && is_cpu_exclusive(cs)) {
1162 		/* Turning off CS_CPU_EXCLUSIVE will not return error */
1163 		cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0);
1164 	}
1165 	return 0;
1166 }
1167 
1168 /*
1169  * Update partition load balance flag and/or rebuild sched domain
1170  *
1171  * Changing the load balance flag will automatically force a sched
1172  * domain rebuild at the end of the current operation.
1173  * This function is for cgroup v2 only.
1174  */
1175 static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
1176 {
1177 	int new_prs = cs->partition_root_state;
1178 	bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
1179 	bool new_lb;
1180 
1181 	/*
1182 	 * If cs is not a valid partition root, the load balance state
1183 	 * will follow its parent.
1184 	 */
1185 	if (new_prs > 0) {
1186 		new_lb = (new_prs != PRS_ISOLATED);
1187 	} else {
1188 		new_lb = is_sched_load_balance(parent_cs(cs));
1189 	}
1190 	if (new_lb != !!is_sched_load_balance(cs)) {
1191 		rebuild_domains = true;
1192 		if (new_lb)
1193 			set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1194 		else
1195 			clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1196 	}
1197 
1198 	if (rebuild_domains)
1199 		cpuset_force_rebuild();
1200 }
1201 
1202 /*
1203  * tasks_nocpu_error - Return true if tasks will have no effective_cpus
1204  */
1205 static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
1206 			      struct cpumask *xcpus)
1207 {
1208 	/*
1209 	 * A populated partition (cs or parent) can't have empty effective_cpus
1210 	 */
1211 	return (cpumask_subset(parent->effective_cpus, xcpus) &&
1212 		partition_is_populated(parent, cs)) ||
1213 	       (!cpumask_intersects(xcpus, cpu_active_mask) &&
1214 		partition_is_populated(cs, NULL));
1215 }
1216 
1217 static void reset_partition_data(struct cpuset *cs)
1218 {
1219 	struct cpuset *parent = parent_cs(cs);
1220 
1221 	if (!cpuset_v2())
1222 		return;
1223 
1224 	lockdep_assert_held(&callback_lock);
1225 
1226 	cs->nr_subparts = 0;
1227 	if (cpumask_empty(cs->exclusive_cpus)) {
1228 		cpumask_clear(cs->effective_xcpus);
1229 		if (is_cpu_exclusive(cs))
1230 			clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
1231 	}
1232 	if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed))
1233 		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1234 }
1235 
1236 /*
1237  * partition_xcpus_newstate - Exclusive CPUs state change
1238  * @old_prs: old partition_root_state
1239  * @new_prs: new partition_root_state
1240  * @xcpus: exclusive CPUs with state change
1241  */
1242 static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
1243 {
1244 	WARN_ON_ONCE(old_prs == new_prs);
1245 	if (new_prs == PRS_ISOLATED)
1246 		cpumask_or(isolated_cpus, isolated_cpus, xcpus);
1247 	else
1248 		cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
1249 }
1250 
1251 /*
1252  * partition_xcpus_add - Add new exclusive CPUs to partition
1253  * @new_prs: new partition_root_state
1254  * @parent: parent cpuset
1255  * @xcpus: exclusive CPUs to be added
1256  * Return: true if isolated_cpus modified, false otherwise
1257  *
1258  * Remote partition if parent == NULL
1259  */
1260 static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
1261 				struct cpumask *xcpus)
1262 {
1263 	bool isolcpus_updated;
1264 
1265 	WARN_ON_ONCE(new_prs < 0);
1266 	lockdep_assert_held(&callback_lock);
1267 	if (!parent)
1268 		parent = &top_cpuset;
1269 
1270 
1271 	if (parent == &top_cpuset)
1272 		cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
1273 
1274 	isolcpus_updated = (new_prs != parent->partition_root_state);
1275 	if (isolcpus_updated)
1276 		partition_xcpus_newstate(parent->partition_root_state, new_prs,
1277 					 xcpus);
1278 
1279 	cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
1280 	return isolcpus_updated;
1281 }
1282 
1283 /*
1284  * partition_xcpus_del - Remove exclusive CPUs from partition
1285  * @old_prs: old partition_root_state
1286  * @parent: parent cpuset
1287  * @xcpus: exclusive CPUs to be removed
1288  * Return: true if isolated_cpus modified, false otherwise
1289  *
1290  * Remote partition if parent == NULL
1291  */
1292 static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
1293 				struct cpumask *xcpus)
1294 {
1295 	bool isolcpus_updated;
1296 
1297 	WARN_ON_ONCE(old_prs < 0);
1298 	lockdep_assert_held(&callback_lock);
1299 	if (!parent)
1300 		parent = &top_cpuset;
1301 
1302 	if (parent == &top_cpuset)
1303 		cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
1304 
1305 	isolcpus_updated = (old_prs != parent->partition_root_state);
1306 	if (isolcpus_updated)
1307 		partition_xcpus_newstate(old_prs, parent->partition_root_state,
1308 					 xcpus);
1309 
1310 	cpumask_and(xcpus, xcpus, cpu_active_mask);
1311 	cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
1312 	return isolcpus_updated;
1313 }
1314 
1315 static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
1316 {
1317 	int ret;
1318 
1319 	lockdep_assert_cpus_held();
1320 
1321 	if (!isolcpus_updated)
1322 		return;
1323 
1324 	ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
1325 	WARN_ON_ONCE(ret < 0);
1326 }
1327 
1328 /**
1329  * cpuset_cpu_is_isolated - Check if the given CPU is isolated
1330  * @cpu: the CPU number to be checked
1331  * Return: true if CPU is used in an isolated partition, false otherwise
1332  */
1333 bool cpuset_cpu_is_isolated(int cpu)
1334 {
1335 	return cpumask_test_cpu(cpu, isolated_cpus);
1336 }
1337 EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
1338 
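/*
 * Editor's note: an illustrative, hedged sketch (not part of this file) of
 * how an out-of-tree or other kernel user might consume the exported
 * cpuset_cpu_is_isolated() helper, e.g. to skip isolated CPUs when
 * spreading periodic work. The function name below is a placeholder.
 */
#if 0	/* illustrative only */
static void example_spread_work(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		if (cpuset_cpu_is_isolated(cpu))
			continue;	/* leave isolated partition CPUs alone */

		/* ... queue or account per-CPU work for @cpu here ... */
	}
}
#endif
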
1339 /*
1340  * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
1341  * @cs: cpuset
1342  * @xcpus: effective exclusive CPUs value to be set
1343  * Return: true if xcpus is not empty, false otherwise.
1344  *
1345  * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
1346  * it must be a subset of parent's effective_xcpus.
1347  */
1348 static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
1349 						struct cpumask *xcpus)
1350 {
1351 	struct cpuset *parent = parent_cs(cs);
1352 
1353 	if (!xcpus)
1354 		xcpus = cs->effective_xcpus;
1355 
1356 	return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
1357 }
1358 
1359 static inline bool is_remote_partition(struct cpuset *cs)
1360 {
1361 	return !list_empty(&cs->remote_sibling);
1362 }
1363 
1364 static inline bool is_local_partition(struct cpuset *cs)
1365 {
1366 	return is_partition_valid(cs) && !is_remote_partition(cs);
1367 }
1368 
1369 /*
1370  * remote_partition_enable - Enable current cpuset as a remote partition root
1371  * @cs: the cpuset to update
1372  * @new_prs: new partition_root_state
1373  * @tmp: temporary masks
1374  * Return: 0 if successful, errcode if error
1375  *
1376  * Enable the current cpuset to become a remote partition root taking CPUs
1377  * directly from the top cpuset. cpuset_mutex must be held by the caller.
1378  */
1379 static int remote_partition_enable(struct cpuset *cs, int new_prs,
1380 				   struct tmpmasks *tmp)
1381 {
1382 	bool isolcpus_updated;
1383 
1384 	/*
1385 	 * The user must have sysadmin privilege.
1386 	 */
1387 	if (!capable(CAP_SYS_ADMIN))
1388 		return PERR_ACCESS;
1389 
1390 	/*
1391 	 * The requested exclusive_cpus must not be allocated to other
1392 	 * partitions and it can't use up all the root's effective_cpus.
1393 	 *
1394 	 * Note that if there is any local partition root above it or
1395 	 * remote partition root underneath it, its exclusive_cpus must
1396 	 * have overlapped with subpartitions_cpus.
1397 	 */
1398 	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
1399 	if (cpumask_empty(tmp->new_cpus) ||
1400 	    cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
1401 	    cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
1402 		return PERR_INVCPUS;
1403 
1404 	spin_lock_irq(&callback_lock);
1405 	isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
1406 	list_add(&cs->remote_sibling, &remote_children);
1407 	spin_unlock_irq(&callback_lock);
1408 	update_unbound_workqueue_cpumask(isolcpus_updated);
1409 
1410 	/*
1411 	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
1412 	 */
1413 	cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
1414 	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1415 	return 0;
1416 }
1417 
1418 /*
1419  * remote_partition_disable - Remove current cpuset from remote partition list
1420  * @cs: the cpuset to update
1421  * @tmp: temporary masks
1422  *
1423  * The effective_cpus is also updated.
1424  *
1425  * cpuset_mutex must be held by the caller.
1426  */
1427 static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
1428 {
1429 	bool isolcpus_updated;
1430 
1431 	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
1432 	WARN_ON_ONCE(!is_remote_partition(cs));
1433 	WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
1434 
1435 	spin_lock_irq(&callback_lock);
1436 	list_del_init(&cs->remote_sibling);
1437 	isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
1438 					       NULL, tmp->new_cpus);
1439 	cs->partition_root_state = -cs->partition_root_state;
1440 	if (!cs->prs_err)
1441 		cs->prs_err = PERR_INVCPUS;
1442 	reset_partition_data(cs);
1443 	spin_unlock_irq(&callback_lock);
1444 	update_unbound_workqueue_cpumask(isolcpus_updated);
1445 
1446 	/*
1447 	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
1448 	 */
1449 	cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
1450 	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1451 }
1452 
1453 /*
1454  * remote_cpus_update - cpus_exclusive change of remote partition
1455  * @cs: the cpuset to be updated
1456  * @newmask: the new effective_xcpus mask
1457  * @tmp: temporary masks
1458  *
1459  * top_cpuset and subpartitions_cpus will be updated or partition can be
1460  * invalidated.
1461  */
1462 static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
1463 			       struct tmpmasks *tmp)
1464 {
1465 	bool adding, deleting;
1466 	int prs = cs->partition_root_state;
1467 	int isolcpus_updated = 0;
1468 
1469 	if (WARN_ON_ONCE(!is_remote_partition(cs)))
1470 		return;
1471 
1472 	WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
1473 
1474 	if (cpumask_empty(newmask))
1475 		goto invalidate;
1476 
1477 	adding   = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
1478 	deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);
1479 
1480 	/*
1481 	 * Adding remote CPUs is only allowed if those CPUs are
1482 	 * not allocated to other partitions and there are effective_cpus
1483 	 * left in the top cpuset.
1484 	 */
1485 	if (adding && (!capable(CAP_SYS_ADMIN) ||
1486 		       cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
1487 		       cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)))
1488 		goto invalidate;
1489 
1490 	spin_lock_irq(&callback_lock);
1491 	if (adding)
1492 		isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
1493 	if (deleting)
1494 		isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
1495 	spin_unlock_irq(&callback_lock);
1496 	update_unbound_workqueue_cpumask(isolcpus_updated);
1497 
1498 	/*
1499 	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
1500 	 */
1501 	cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
1502 	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1503 	return;
1504 
1505 invalidate:
1506 	remote_partition_disable(cs, tmp);
1507 }
1508 
1509 /*
1510  * remote_partition_check - check if a child remote partition needs update
1511  * @cs: the cpuset to be updated
1512  * @newmask: the new effective_xcpus mask
1513  * @delmask: temporary mask for deletion (not in tmp)
1514  * @tmp: temporary masks
1515  *
1516  * This should be called before the given cs has updated its cpus_allowed
1517  * and/or effective_xcpus.
1518  */
1519 static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
1520 				   struct cpumask *delmask, struct tmpmasks *tmp)
1521 {
1522 	struct cpuset *child, *next;
1523 	int disable_cnt = 0;
1524 
1525 	/*
1526 	 * Compute the effective exclusive CPUs that will be deleted.
1527 	 */
1528 	if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) ||
1529 	    !cpumask_intersects(delmask, subpartitions_cpus))
1530 		return;	/* No deletion of exclusive CPUs in partitions */
1531 
1532 	/*
1533 	 * Search the remote children list for those that will
1534 	 * be impacted by the deletion of exclusive CPUs.
1535 	 *
1536 	 * Since a cpuset must be removed from the remote children list
1537 	 * before it can go offline, and holding cpuset_mutex prevents
1538 	 * any change in cpuset status, the RCU read lock isn't needed.
1539 	 */
1540 	lockdep_assert_held(&cpuset_mutex);
1541 	list_for_each_entry_safe(child, next, &remote_children, remote_sibling)
1542 		if (cpumask_intersects(child->effective_cpus, delmask)) {
1543 			remote_partition_disable(child, tmp);
1544 			disable_cnt++;
1545 		}
1546 	if (disable_cnt)
1547 		cpuset_force_rebuild();
1548 }
1549 
1550 /*
1551  * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
1552  * @prstate: partition root state to be checked
1553  * @new_cpus: cpu mask
1554  * Return: true if there is conflict, false otherwise
1555  *
1556  * CPUs outside of boot_hk_cpus, if defined, can only be used in an
1557  * isolated partition.
1558  */
1559 static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
1560 {
1561 	if (!have_boot_isolcpus)
1562 		return false;
1563 
1564 	if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus))
1565 		return true;
1566 
1567 	return false;
1568 }
1569 
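/*
 * Editor's note: an illustrative, hedged sketch (not part of this file).
 * When the kernel is booted with an "isolcpus=" list, those CPUs fall
 * outside boot_hk_cpus and, per the check above, can only be placed in an
 * isolated partition. The userspace snippet below, with hypothetical CPU
 * numbers and cgroup path, shows the arrangement that avoids the
 * PERR_HKEEPING conflict.
 */
#if 0	/* userspace example, not kernel code */
#include <stdio.h>

static void cg_write(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (f) {
		fputs(val, f);
		fclose(f);
	}
}

int main(void)
{
	/* Assume boot with isolcpus=6-7: CPUs 6-7 are not housekeeping CPUs. */
	cg_write("/sys/fs/cgroup/rt/cpuset.cpus", "6-7");
	/* "root" would conflict with housekeeping; "isolated" is accepted. */
	cg_write("/sys/fs/cgroup/rt/cpuset.cpus.partition", "isolated");
	return 0;
}
#endif
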
1570 /**
1571  * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
1572  * @cs:      The cpuset that requests change in partition root state
1573  * @cmd:     Partition root state change command
1574  * @newmask: Optional new cpumask for partcmd_update
1575  * @tmp:     Temporary addmask and delmask
1576  * Return:   0 or a partition root state error code
1577  *
1578  * For partcmd_enable*, the cpuset is being transformed from a non-partition
1579  * root to a partition root. The effective_xcpus (cpus_allowed if
1580  * effective_xcpus not set) mask of the given cpuset will be taken away from
1581  * parent's effective_cpus. The function will return 0 if all the CPUs listed
1582  * in effective_xcpus can be granted or an error code will be returned.
1583  *
1584  * For partcmd_disable, the cpuset is being transformed from a partition
1585  * root back to a non-partition root. Any CPUs in effective_xcpus will be
1586  * given back to parent's effective_cpus. 0 will always be returned.
1587  *
1588  * For partcmd_update, if the optional newmask is specified, the cpu list is
1589  * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
1590  * assumed to remain the same. The cpuset should either be a valid or invalid
1591  * partition root. The partition root state may change from valid to invalid
1592  * or vice versa. An error code will be returned if transitioning from
1593  * invalid to valid violates the exclusivity rule.
1594  *
1595  * For partcmd_invalidate, the current partition will be made invalid.
1596  *
1597  * The partcmd_enable* and partcmd_disable commands are used by
1598  * update_prstate(). An error code may be returned and the caller will check
1599  * for error.
1600  *
1601  * The partcmd_update command is used by update_cpumasks_hier() with newmask
1602  * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
1603  * by update_cpumask() with NULL newmask. In both cases, the callers won't
1604  * check for error and so partition_root_state and prs_error will be updated
1605  * directly.
1606  */
1607 static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
1608 					   struct cpumask *newmask,
1609 					   struct tmpmasks *tmp)
1610 {
1611 	struct cpuset *parent = parent_cs(cs);
1612 	int adding;	/* Adding cpus to parent's effective_cpus	*/
1613 	int deleting;	/* Deleting cpus from parent's effective_cpus	*/
1614 	int old_prs, new_prs;
1615 	int part_error = PERR_NONE;	/* Partition error? */
1616 	int subparts_delta = 0;
1617 	struct cpumask *xcpus;		/* cs effective_xcpus */
1618 	int isolcpus_updated = 0;
1619 	bool nocpu;
1620 
1621 	lockdep_assert_held(&cpuset_mutex);
1622 
1623 	/*
1624 	 * new_prs will only be changed for the partcmd_update and
1625 	 * partcmd_invalidate commands.
1626 	 */
1627 	adding = deleting = false;
1628 	old_prs = new_prs = cs->partition_root_state;
1629 	xcpus = user_xcpus(cs);
1630 
1631 	if (cmd == partcmd_invalidate) {
1632 		if (is_prs_invalid(old_prs))
1633 			return 0;
1634 
1635 		/*
1636 		 * Make the current partition invalid.
1637 		 */
1638 		if (is_partition_valid(parent))
1639 			adding = cpumask_and(tmp->addmask,
1640 					     xcpus, parent->effective_xcpus);
1641 		if (old_prs > 0) {
1642 			new_prs = -old_prs;
1643 			subparts_delta--;
1644 		}
1645 		goto write_error;
1646 	}
1647 
1648 	/*
1649 	 * The parent must be a partition root.
1650 	 * The new cpumask, if present, or the current cpus_allowed must
1651 	 * not be empty.
1652 	 */
1653 	if (!is_partition_valid(parent)) {
1654 		return is_partition_invalid(parent)
1655 		       ? PERR_INVPARENT : PERR_NOTPART;
1656 	}
1657 	if (!newmask && xcpus_empty(cs))
1658 		return PERR_CPUSEMPTY;
1659 
1660 	nocpu = tasks_nocpu_error(parent, cs, xcpus);
1661 
1662 	if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
1663 		/*
1664 		 * Enabling partition root is not allowed if its
1665 		 * effective_xcpus is empty or doesn't overlap with
1666 		 * parent's effective_xcpus.
1667 		 */
1668 		if (cpumask_empty(xcpus) ||
1669 		    !cpumask_intersects(xcpus, parent->effective_xcpus))
1670 			return PERR_INVCPUS;
1671 
1672 		if (prstate_housekeeping_conflict(new_prs, xcpus))
1673 			return PERR_HKEEPING;
1674 
1675 		/*
1676 		 * A parent can be left with no CPU as long as there is no
1677 		 * task directly associated with the parent partition.
1678 		 */
1679 		if (nocpu)
1680 			return PERR_NOCPUS;
1681 
1682 		cpumask_copy(tmp->delmask, xcpus);
1683 		deleting = true;
1684 		subparts_delta++;
1685 		new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
1686 	} else if (cmd == partcmd_disable) {
1687 		/*
1688 		 * May need to add cpus to parent's effective_cpus for
1689 		 * valid partition root.
1690 		 */
1691 		adding = !is_prs_invalid(old_prs) &&
1692 			  cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
1693 		if (adding)
1694 			subparts_delta--;
1695 		new_prs = PRS_MEMBER;
1696 	} else if (newmask) {
1697 		/*
1698 		 * Empty cpumask is not allowed
1699 		 */
1700 		if (cpumask_empty(newmask)) {
1701 			part_error = PERR_CPUSEMPTY;
1702 			goto write_error;
1703 		}
1704 		/* Check again whether CPUs are available for parent/cs with newmask */
1705 		nocpu |= tasks_nocpu_error(parent, cs, newmask);
1706 
1707 		/*
1708 		 * partcmd_update with newmask:
1709 		 *
1710 		 * Compute add/delete mask to/from effective_cpus
1711 		 *
1712 		 * For valid partition:
1713 		 *   addmask = exclusive_cpus & ~newmask
1714 		 *			      & parent->effective_xcpus
1715 		 *   delmask = newmask & ~exclusive_cpus
1716 		 *		       & parent->effective_xcpus
1717 		 *
1718 		 * For invalid partition:
1719 		 *   delmask = newmask & parent->effective_xcpus
1720 		 */
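		/*
		 * Illustrative example with hypothetical masks: if the current
		 * exclusive CPUs are 0-3, newmask is 2-5 and
		 * parent->effective_xcpus is 0-7, a valid partition gets
		 * addmask = 0-1 (returned to the parent) and delmask = 4-5
		 * (newly taken from the parent).
		 */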
1721 		if (is_prs_invalid(old_prs)) {
1722 			adding = false;
1723 			deleting = cpumask_and(tmp->delmask,
1724 					newmask, parent->effective_xcpus);
1725 		} else {
1726 			cpumask_andnot(tmp->addmask, xcpus, newmask);
1727 			adding = cpumask_and(tmp->addmask, tmp->addmask,
1728 					     parent->effective_xcpus);
1729 
1730 			cpumask_andnot(tmp->delmask, newmask, xcpus);
1731 			deleting = cpumask_and(tmp->delmask, tmp->delmask,
1732 					       parent->effective_xcpus);
1733 		}
1734 		/*
1735 		 * Make partition invalid if parent's effective_cpus could
1736 		 * become empty and there are tasks in the parent.
1737 		 */
1738 		if (nocpu && (!adding ||
1739 		    !cpumask_intersects(tmp->addmask, cpu_active_mask))) {
1740 			part_error = PERR_NOCPUS;
1741 			deleting = false;
1742 			adding = cpumask_and(tmp->addmask,
1743 					     xcpus, parent->effective_xcpus);
1744 		}
1745 	} else {
1746 		/*
1747 		 * partcmd_update w/o newmask
1748 		 *
1749 		 * delmask = effective_xcpus & parent->effective_cpus
1750 		 *
1751 		 * This can be called from:
1752 		 * 1) update_cpumasks_hier()
1753 		 * 2) cpuset_hotplug_update_tasks()
1754 		 *
1755 		 * Check to see if it can be transitioned from valid to
1756 		 * invalid partition or vice versa.
1757 		 *
1758 		 * A partition error happens when parent has tasks and all
1759 		 * its effective CPUs will have to be distributed out.
1760 		 */
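		/*
		 * For illustration: when a hotplug event leaves a populated
		 * parent with no CPU to spare, part_error becomes PERR_NOCPUS
		 * and a valid child partition is transitioned to invalid;
		 * once CPUs are available again and the exclusivity check
		 * below passes, the reverse transition can happen.
		 */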
1761 		WARN_ON_ONCE(!is_partition_valid(parent));
1762 		if (nocpu) {
1763 			part_error = PERR_NOCPUS;
1764 			if (is_partition_valid(cs))
1765 				adding = cpumask_and(tmp->addmask,
1766 						xcpus, parent->effective_xcpus);
1767 		} else if (is_partition_invalid(cs) &&
1768 			   cpumask_subset(xcpus, parent->effective_xcpus)) {
1769 			struct cgroup_subsys_state *css;
1770 			struct cpuset *child;
1771 			bool exclusive = true;
1772 
1773 			/*
1774 			 * Converting an invalid partition to valid has to
1775 			 * pass the cpu exclusivity test.
1776 			 */
1777 			rcu_read_lock();
1778 			cpuset_for_each_child(child, css, parent) {
1779 				if (child == cs)
1780 					continue;
1781 				if (!cpusets_are_exclusive(cs, child)) {
1782 					exclusive = false;
1783 					break;
1784 				}
1785 			}
1786 			rcu_read_unlock();
1787 			if (exclusive)
1788 				deleting = cpumask_and(tmp->delmask,
1789 						xcpus, parent->effective_cpus);
1790 			else
1791 				part_error = PERR_NOTEXCL;
1792 		}
1793 	}
1794 
1795 write_error:
1796 	if (part_error)
1797 		WRITE_ONCE(cs->prs_err, part_error);
1798 
1799 	if (cmd == partcmd_update) {
1800 		/*
1801 		 * Check for possible transition between valid and invalid
1802 		 * partition root.
1803 		 */
1804 		switch (cs->partition_root_state) {
1805 		case PRS_ROOT:
1806 		case PRS_ISOLATED:
1807 			if (part_error) {
1808 				new_prs = -old_prs;
1809 				subparts_delta--;
1810 			}
1811 			break;
1812 		case PRS_INVALID_ROOT:
1813 		case PRS_INVALID_ISOLATED:
1814 			if (!part_error) {
1815 				new_prs = -old_prs;
1816 				subparts_delta++;
1817 			}
1818 			break;
1819 		}
1820 	}
1821 
1822 	if (!adding && !deleting && (new_prs == old_prs))
1823 		return 0;
1824 
1825 	/*
1826 	 * Transitioning between invalid to valid or vice versa may require
1827 	 * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,
1828 	 * validate_change() has already been successfully called and
1829 	 * CPU lists in cs haven't been updated yet. So defer it to later.
1830 	 */
1831 	if ((old_prs != new_prs) && (cmd != partcmd_update))  {
1832 		int err = update_partition_exclusive(cs, new_prs);
1833 
1834 		if (err)
1835 			return err;
1836 	}
1837 
1838 	/*
1839 	 * Change the parent's effective_cpus & effective_xcpus (top cpuset
1840 	 * only).
1841 	 *
1842 	 * Newly added CPUs will be removed from effective_cpus and
1843 	 * newly deleted ones will be added back to effective_cpus.
1844 	 */
1845 	spin_lock_irq(&callback_lock);
1846 	if (old_prs != new_prs) {
1847 		cs->partition_root_state = new_prs;
1848 		if (new_prs <= 0)
1849 			cs->nr_subparts = 0;
1850 	}
1851 	/*
1852 	 * Adding to parent's effective_cpus means deleting CPUs from cs
1853 	 * and vice versa.
1854 	 */
1855 	if (adding)
1856 		isolcpus_updated += partition_xcpus_del(old_prs, parent,
1857 							tmp->addmask);
1858 	if (deleting)
1859 		isolcpus_updated += partition_xcpus_add(new_prs, parent,
1860 							tmp->delmask);
1861 
1862 	if (is_partition_valid(parent)) {
1863 		parent->nr_subparts += subparts_delta;
1864 		WARN_ON_ONCE(parent->nr_subparts < 0);
1865 	}
1866 	spin_unlock_irq(&callback_lock);
1867 	update_unbound_workqueue_cpumask(isolcpus_updated);
1868 
1869 	if ((old_prs != new_prs) && (cmd == partcmd_update))
1870 		update_partition_exclusive(cs, new_prs);
1871 
1872 	if (adding || deleting) {
1873 		cpuset_update_tasks_cpumask(parent, tmp->addmask);
1874 		update_sibling_cpumasks(parent, cs, tmp);
1875 	}
1876 
1877 	/*
1878 	 * For partcmd_update without newmask, it is being called from
1879 	 * cpuset_handle_hotplug(). Update the load balance flag and
1880 	 * scheduling domain accordingly.
1881 	 */
1882 	if ((cmd == partcmd_update) && !newmask)
1883 		update_partition_sd_lb(cs, old_prs);
1884 
1885 	notify_partition_change(cs, old_prs);
1886 	return 0;
1887 }
1888 
1889 /**
1890  * compute_partition_effective_cpumask - compute effective_cpus for partition
1891  * @cs: partition root cpuset
1892  * @new_ecpus: previously computed effective_cpus to be updated
1893  *
1894  * Compute the effective_cpus of a partition root by scanning effective_xcpus
1895  * of child partition roots and excluding their effective_xcpus.
1896  *
1897  * This has the side effect of invalidating valid child partition roots,
1898  * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
1899  * or update_cpumasks_hier() where parent and children are modified
1900  * successively, we don't need to call update_parent_effective_cpumask()
1901  * and the child's effective_cpus will be updated in later iterations.
1902  *
1903  * Note that rcu_read_lock() is assumed to be held.
1904  */
1905 static void compute_partition_effective_cpumask(struct cpuset *cs,
1906 						struct cpumask *new_ecpus)
1907 {
1908 	struct cgroup_subsys_state *css;
1909 	struct cpuset *child;
1910 	bool populated = partition_is_populated(cs, NULL);
1911 
1912 	/*
1913 	 * Check child partition roots to see if they should be
1914 	 * invalidated when
1915 	 *  1) child effective_xcpus is not a subset of the new
1916 	 *     exclusive_cpus
1917 	 *  2) all the effective_cpus will be used up and cs
1918 	 *     has tasks
1919 	 */
1920 	compute_effective_exclusive_cpumask(cs, new_ecpus);
1921 	cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
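	/*
	 * Illustrative example with hypothetical masks: if cs has exclusive
	 * CPUs 0-7 (all online) and one valid child partition owns
	 * effective_xcpus 2-3, the loop below ends with new_ecpus = 0-1,4-7.
	 */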
1922 
1923 	rcu_read_lock();
1924 	cpuset_for_each_child(child, css, cs) {
1925 		if (!is_partition_valid(child))
1926 			continue;
1927 
1928 		child->prs_err = 0;
1929 		if (!cpumask_subset(child->effective_xcpus,
1930 				    cs->effective_xcpus))
1931 			child->prs_err = PERR_INVCPUS;
1932 		else if (populated &&
1933 			 cpumask_subset(new_ecpus, child->effective_xcpus))
1934 			child->prs_err = PERR_NOCPUS;
1935 
1936 		if (child->prs_err) {
1937 			int old_prs = child->partition_root_state;
1938 
1939 			/*
1940 			 * Invalidate child partition
1941 			 */
1942 			spin_lock_irq(&callback_lock);
1943 			make_partition_invalid(child);
1944 			cs->nr_subparts--;
1945 			child->nr_subparts = 0;
1946 			spin_unlock_irq(&callback_lock);
1947 			notify_partition_change(child, old_prs);
1948 			continue;
1949 		}
1950 		cpumask_andnot(new_ecpus, new_ecpus,
1951 			       child->effective_xcpus);
1952 	}
1953 	rcu_read_unlock();
1954 }
1955 
1956 /*
1957  * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
1958  * @cs:  the cpuset to consider
1959  * @tmp: temp variables for calculating effective_cpus & partition setup
1960  * @force: don't skip any descendant cpusets if set
1961  *
1962  * When configured cpumask is changed, the effective cpumasks of this cpuset
1963  * and all its descendants need to be updated.
1964  *
1965  * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
1966  *
1967  * Called with cpuset_mutex held
1968  */
1969 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
1970 				 bool force)
1971 {
1972 	struct cpuset *cp;
1973 	struct cgroup_subsys_state *pos_css;
1974 	bool need_rebuild_sched_domains = false;
1975 	int old_prs, new_prs;
1976 
1977 	rcu_read_lock();
1978 	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1979 		struct cpuset *parent = parent_cs(cp);
1980 		bool remote = is_remote_partition(cp);
1981 		bool update_parent = false;
1982 
1983 		/*
1984 		 * Skip a descendant remote partition that acquires CPUs
1985 		 * directly from top cpuset unless it is cs.
1986 		 */
1987 		if (remote && (cp != cs)) {
1988 			pos_css = css_rightmost_descendant(pos_css);
1989 			continue;
1990 		}
1991 
1992 		/*
1993 		 * Update effective_xcpus if exclusive_cpus set.
1994 		 * The case when exclusive_cpus isn't set is handled later.
1995 		 */
1996 		if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) {
1997 			spin_lock_irq(&callback_lock);
1998 			compute_effective_exclusive_cpumask(cp, NULL);
1999 			spin_unlock_irq(&callback_lock);
2000 		}
2001 
2002 		old_prs = new_prs = cp->partition_root_state;
2003 		if (remote || (is_partition_valid(parent) &&
2004 			       is_partition_valid(cp)))
2005 			compute_partition_effective_cpumask(cp, tmp->new_cpus);
2006 		else
2007 			compute_effective_cpumask(tmp->new_cpus, cp, parent);
2008 
2009 		/*
2010 		 * A partition with no effective_cpus is allowed as long as
2011 		 * there is no task associated with it. Call
2012 		 * update_parent_effective_cpumask() to check it.
2013 		 */
2014 		if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {
2015 			update_parent = true;
2016 			goto update_parent_effective;
2017 		}
2018 
2019 		/*
2020 		 * If it becomes empty, inherit the effective mask of the
2021 		 * parent, which is guaranteed to have some CPUs unless
2022 		 * it is a partition root that has explicitly distributed
2023 		 * out all its CPUs.
2024 		 */
2025 		if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus))
2026 			cpumask_copy(tmp->new_cpus, parent->effective_cpus);
2027 
2028 		if (remote)
2029 			goto get_css;
2030 
2031 		/*
2032 		 * Skip the whole subtree if
2033 		 * 1) the cpumask remains the same,
2034 		 * 2) has no partition root state,
2035 		 * 3) force flag not set, and
2036 		 * 4) for v2 load balance state same as its parent.
2037 		 */
2038 		if (!cp->partition_root_state && !force &&
2039 		    cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
2040 		    (!cpuset_v2() ||
2041 		    (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
2042 			pos_css = css_rightmost_descendant(pos_css);
2043 			continue;
2044 		}
2045 
2046 update_parent_effective:
2047 		/*
2048 		 * update_parent_effective_cpumask() should have been called
2049 		 * for cs already in update_cpumask(). We should also call
2050 		 * cpuset_update_tasks_cpumask() again for tasks in the parent
2051 		 * cpuset if the parent's effective_cpus changes.
2052 		 */
2053 		if ((cp != cs) && old_prs) {
2054 			switch (parent->partition_root_state) {
2055 			case PRS_ROOT:
2056 			case PRS_ISOLATED:
2057 				update_parent = true;
2058 				break;
2059 
2060 			default:
2061 				/*
2062 				 * When parent is not a partition root or is
2063 				 * invalid, child partition roots become
2064 				 * invalid too.
2065 				 */
2066 				if (is_partition_valid(cp))
2067 					new_prs = -cp->partition_root_state;
2068 				WRITE_ONCE(cp->prs_err,
2069 					   is_partition_invalid(parent)
2070 					   ? PERR_INVPARENT : PERR_NOTPART);
2071 				break;
2072 			}
2073 		}
2074 get_css:
2075 		if (!css_tryget_online(&cp->css))
2076 			continue;
2077 		rcu_read_unlock();
2078 
2079 		if (update_parent) {
2080 			update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);
2081 			/*
2082 			 * The cpuset partition_root_state may become
2083 			 * invalid. Capture it.
2084 			 */
2085 			new_prs = cp->partition_root_state;
2086 		}
2087 
2088 		spin_lock_irq(&callback_lock);
2089 		cpumask_copy(cp->effective_cpus, tmp->new_cpus);
2090 		cp->partition_root_state = new_prs;
2091 		/*
2092 		 * Make sure effective_xcpus is properly set for a valid
2093 		 * partition root.
2094 		 */
2095 		if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
2096 			cpumask_and(cp->effective_xcpus,
2097 				    cp->cpus_allowed, parent->effective_xcpus);
2098 		else if (new_prs < 0)
2099 			reset_partition_data(cp);
2100 		spin_unlock_irq(&callback_lock);
2101 
2102 		notify_partition_change(cp, old_prs);
2103 
2104 		WARN_ON(!is_in_v2_mode() &&
2105 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
2106 
2107 		cpuset_update_tasks_cpumask(cp, cp->effective_cpus);
2108 
2109 		/*
2110 		 * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
2111 		 * from parent if current cpuset isn't a valid partition root
2112 		 * and their load balance states differ.
2113 		 */
2114 		if (cpuset_v2() && !is_partition_valid(cp) &&
2115 		    (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
2116 			if (is_sched_load_balance(parent))
2117 				set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
2118 			else
2119 				clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
2120 		}
2121 
2122 		/*
2123 		 * On legacy hierarchy, if the effective cpumask of any non-
2124 		 * empty cpuset is changed, we need to rebuild sched domains.
2125 		 * On default hierarchy, the cpuset needs to be a partition
2126 		 * root as well.
2127 		 */
2128 		if (!cpumask_empty(cp->cpus_allowed) &&
2129 		    is_sched_load_balance(cp) &&
2130 		   (!cpuset_v2() || is_partition_valid(cp)))
2131 			need_rebuild_sched_domains = true;
2132 
2133 		rcu_read_lock();
2134 		css_put(&cp->css);
2135 	}
2136 	rcu_read_unlock();
2137 
2138 	if (need_rebuild_sched_domains)
2139 		cpuset_force_rebuild();
2140 }
2141 
2142 /**
2143  * update_sibling_cpumasks - Update siblings' cpumasks
2144  * @parent:  Parent cpuset
2145  * @cs:      Current cpuset
2146  * @tmp:     Temp variables
2147  */
2148 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
2149 				    struct tmpmasks *tmp)
2150 {
2151 	struct cpuset *sibling;
2152 	struct cgroup_subsys_state *pos_css;
2153 
2154 	lockdep_assert_held(&cpuset_mutex);
2155 
2156 	/*
2157 	 * Check all its siblings and call update_cpumasks_hier()
2158 	 * if their effective_cpus will need to be changed.
2159 	 *
2160 	 * It is possible that a change in the parent's effective_cpus, due
2161 	 * to a change in a child partition's effective_xcpus, will impact
2162 	 * its siblings even if they do not inherit the parent's
2163 	 * effective_cpus directly.
2164 	 *
2165 	 * The update_cpumasks_hier() function may sleep. So we have to
2166 	 * release the RCU read lock before calling it.
2167 	 */
2168 	rcu_read_lock();
2169 	cpuset_for_each_child(sibling, pos_css, parent) {
2170 		if (sibling == cs)
2171 			continue;
2172 		if (!is_partition_valid(sibling)) {
2173 			compute_effective_cpumask(tmp->new_cpus, sibling,
2174 						  parent);
2175 			if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
2176 				continue;
2177 		}
2178 		if (!css_tryget_online(&sibling->css))
2179 			continue;
2180 
2181 		rcu_read_unlock();
2182 		update_cpumasks_hier(sibling, tmp, false);
2183 		rcu_read_lock();
2184 		css_put(&sibling->css);
2185 	}
2186 	rcu_read_unlock();
2187 }
2188 
2189 /**
2190  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
2191  * @cs: the cpuset to consider
2192  * @trialcs: trial cpuset
2193  * @buf: buffer of cpu numbers written to this cpuset
2194  */
2195 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
2196 			  const char *buf)
2197 {
2198 	int retval;
2199 	struct tmpmasks tmp;
2200 	struct cpuset *parent = parent_cs(cs);
2201 	bool invalidate = false;
2202 	bool force = false;
2203 	int old_prs = cs->partition_root_state;
2204 
2205 	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
2206 	if (cs == &top_cpuset)
2207 		return -EACCES;
2208 
2209 	/*
2210 	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
2211 	 * Since cpulist_parse() fails on an empty mask, we special case
2212 	 * that parsing.  The validate_change() call ensures that cpusets
2213 	 * with tasks have cpus.
2214 	 */
2215 	if (!*buf) {
2216 		cpumask_clear(trialcs->cpus_allowed);
2217 		if (cpumask_empty(trialcs->exclusive_cpus))
2218 			cpumask_clear(trialcs->effective_xcpus);
2219 	} else {
2220 		retval = cpulist_parse(buf, trialcs->cpus_allowed);
2221 		if (retval < 0)
2222 			return retval;
2223 
2224 		if (!cpumask_subset(trialcs->cpus_allowed,
2225 				    top_cpuset.cpus_allowed))
2226 			return -EINVAL;
2227 
2228 		/*
2229 		 * When exclusive_cpus isn't explicitly set, it is constrained
2230 		 * by cpus_allowed and parent's effective_xcpus. Otherwise,
2231 		 * trialcs->effective_xcpus is used as a temporary cpumask
2232 		 * for checking validity of the partition root.
2233 		 */
2234 		if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
2235 			compute_effective_exclusive_cpumask(trialcs, NULL);
2236 	}
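	/*
	 * For example (illustrative), writing "0-3,8" to cpuset.cpus is parsed
	 * by cpulist_parse() above into cpus_allowed = { 0, 1, 2, 3, 8 }.
	 */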
2237 
2238 	/* Nothing to do if the cpus didn't change */
2239 	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
2240 		return 0;
2241 
2242 	if (alloc_cpumasks(NULL, &tmp))
2243 		return -ENOMEM;
2244 
2245 	if (old_prs) {
2246 		if (is_partition_valid(cs) &&
2247 		    cpumask_empty(trialcs->effective_xcpus)) {
2248 			invalidate = true;
2249 			cs->prs_err = PERR_INVCPUS;
2250 		} else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
2251 			invalidate = true;
2252 			cs->prs_err = PERR_HKEEPING;
2253 		} else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
2254 			invalidate = true;
2255 			cs->prs_err = PERR_NOCPUS;
2256 		}
2257 	}
2258 
2259 	/*
2260 	 * Check all the descendants in update_cpumasks_hier() if
2261 	 * effective_xcpus is to be changed.
2262 	 */
2263 	force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
2264 
2265 	retval = validate_change(cs, trialcs);
2266 
2267 	if ((retval == -EINVAL) && cpuset_v2()) {
2268 		struct cgroup_subsys_state *css;
2269 		struct cpuset *cp;
2270 
2271 		/*
2272 		 * The -EINVAL error code indicates that the partition sibling
2273 		 * CPU exclusivity rule has been violated. We still allow
2274 		 * the cpumask change to proceed while invalidating the
2275 		 * partition. However, any conflicting sibling partitions
2276 		 * have to be marked as invalid too.
2277 		 */
2278 		invalidate = true;
2279 		rcu_read_lock();
2280 		cpuset_for_each_child(cp, css, parent) {
2281 			struct cpumask *xcpus = user_xcpus(trialcs);
2282 
2283 			if (is_partition_valid(cp) &&
2284 			    cpumask_intersects(xcpus, cp->effective_xcpus)) {
2285 				rcu_read_unlock();
2286 				update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
2287 				rcu_read_lock();
2288 			}
2289 		}
2290 		rcu_read_unlock();
2291 		retval = 0;
2292 	}
2293 
2294 	if (retval < 0)
2295 		goto out_free;
2296 
2297 	if (is_partition_valid(cs) ||
2298 	   (is_partition_invalid(cs) && !invalidate)) {
2299 		struct cpumask *xcpus = trialcs->effective_xcpus;
2300 
2301 		if (cpumask_empty(xcpus) && is_partition_invalid(cs))
2302 			xcpus = trialcs->cpus_allowed;
2303 
2304 		/*
2305 		 * Call remote_cpus_update() to handle valid remote partition
2306 		 */
2307 		if (is_remote_partition(cs))
2308 			remote_cpus_update(cs, xcpus, &tmp);
2309 		else if (invalidate)
2310 			update_parent_effective_cpumask(cs, partcmd_invalidate,
2311 							NULL, &tmp);
2312 		else
2313 			update_parent_effective_cpumask(cs, partcmd_update,
2314 							xcpus, &tmp);
2315 	} else if (!cpumask_empty(cs->exclusive_cpus)) {
2316 		/*
2317 		 * Use trialcs->effective_cpus as a temp cpumask
2318 		 */
2319 		remote_partition_check(cs, trialcs->effective_xcpus,
2320 				       trialcs->effective_cpus, &tmp);
2321 	}
2322 
2323 	spin_lock_irq(&callback_lock);
2324 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
2325 	cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
2326 	if ((old_prs > 0) && !is_partition_valid(cs))
2327 		reset_partition_data(cs);
2328 	spin_unlock_irq(&callback_lock);
2329 
2330 	/* effective_cpus/effective_xcpus will be updated here */
2331 	update_cpumasks_hier(cs, &tmp, force);
2332 
2333 	/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
2334 	if (cs->partition_root_state)
2335 		update_partition_sd_lb(cs, old_prs);
2336 out_free:
2337 	free_cpumasks(NULL, &tmp);
2338 	return retval;
2339 }
2340 
2341 /**
2342  * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset
2343  * @cs: the cpuset to consider
2344  * @trialcs: trial cpuset
2345  * @buf: buffer of cpu numbers written to this cpuset
2346  *
2347  * The tasks' cpumask will be updated if cs is a valid partition root.
2348  */
2349 static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
2350 				    const char *buf)
2351 {
2352 	int retval;
2353 	struct tmpmasks tmp;
2354 	struct cpuset *parent = parent_cs(cs);
2355 	bool invalidate = false;
2356 	bool force = false;
2357 	int old_prs = cs->partition_root_state;
2358 
2359 	if (!*buf) {
2360 		cpumask_clear(trialcs->exclusive_cpus);
2361 		cpumask_clear(trialcs->effective_xcpus);
2362 	} else {
2363 		retval = cpulist_parse(buf, trialcs->exclusive_cpus);
2364 		if (retval < 0)
2365 			return retval;
2366 	}
2367 
2368 	/* Nothing to do if the CPUs didn't change */
2369 	if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
2370 		return 0;
2371 
2372 	if (*buf)
2373 		compute_effective_exclusive_cpumask(trialcs, NULL);
2374 
2375 	/*
2376 	 * Check all the descendants in update_cpumasks_hier() if
2377 	 * effective_xcpus is to be changed.
2378 	 */
2379 	force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
2380 
2381 	retval = validate_change(cs, trialcs);
2382 	if (retval)
2383 		return retval;
2384 
2385 	if (alloc_cpumasks(NULL, &tmp))
2386 		return -ENOMEM;
2387 
2388 	if (old_prs) {
2389 		if (cpumask_empty(trialcs->effective_xcpus)) {
2390 			invalidate = true;
2391 			cs->prs_err = PERR_INVCPUS;
2392 		} else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
2393 			invalidate = true;
2394 			cs->prs_err = PERR_HKEEPING;
2395 		} else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
2396 			invalidate = true;
2397 			cs->prs_err = PERR_NOCPUS;
2398 		}
2399 
2400 		if (is_remote_partition(cs)) {
2401 			if (invalidate)
2402 				remote_partition_disable(cs, &tmp);
2403 			else
2404 				remote_cpus_update(cs, trialcs->effective_xcpus,
2405 						   &tmp);
2406 		} else if (invalidate) {
2407 			update_parent_effective_cpumask(cs, partcmd_invalidate,
2408 							NULL, &tmp);
2409 		} else {
2410 			update_parent_effective_cpumask(cs, partcmd_update,
2411 						trialcs->effective_xcpus, &tmp);
2412 		}
2413 	} else if (!cpumask_empty(trialcs->exclusive_cpus)) {
2414 		/*
2415 		 * Use trialcs->effective_cpus as a temp cpumask
2416 		 */
2417 		remote_partition_check(cs, trialcs->effective_xcpus,
2418 				       trialcs->effective_cpus, &tmp);
2419 	}
2420 	spin_lock_irq(&callback_lock);
2421 	cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
2422 	cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
2423 	if ((old_prs > 0) && !is_partition_valid(cs))
2424 		reset_partition_data(cs);
2425 	spin_unlock_irq(&callback_lock);
2426 
2427 	/*
2428 	 * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus
2429 	 * of the subtree when it is a valid partition root or effective_xcpus
2430 	 * is updated.
2431 	 */
2432 	if (is_partition_valid(cs) || force)
2433 		update_cpumasks_hier(cs, &tmp, force);
2434 
2435 	/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
2436 	if (cs->partition_root_state)
2437 		update_partition_sd_lb(cs, old_prs);
2438 
2439 	free_cpumasks(NULL, &tmp);
2440 	return 0;
2441 }
2442 
2443 /*
2444  * Migrate memory region from one set of nodes to another.  This is
2445  * performed asynchronously as it can be called from process migration path
2446  * holding locks involved in process management.  All mm migrations are
2447  * performed in the queued order and can be waited for by flushing
2448  * cpuset_migrate_mm_wq.
2449  */
2450 
2451 struct cpuset_migrate_mm_work {
2452 	struct work_struct	work;
2453 	struct mm_struct	*mm;
2454 	nodemask_t		from;
2455 	nodemask_t		to;
2456 };
2457 
2458 static void cpuset_migrate_mm_workfn(struct work_struct *work)
2459 {
2460 	struct cpuset_migrate_mm_work *mwork =
2461 		container_of(work, struct cpuset_migrate_mm_work, work);
2462 
2463 	/* on a wq worker, no need to worry about %current's mems_allowed */
2464 	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
2465 	mmput(mwork->mm);
2466 	kfree(mwork);
2467 }
2468 
2469 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
2470 							const nodemask_t *to)
2471 {
2472 	struct cpuset_migrate_mm_work *mwork;
2473 
2474 	if (nodes_equal(*from, *to)) {
2475 		mmput(mm);
2476 		return;
2477 	}
2478 
2479 	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
2480 	if (mwork) {
2481 		mwork->mm = mm;
2482 		mwork->from = *from;
2483 		mwork->to = *to;
2484 		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
2485 		queue_work(cpuset_migrate_mm_wq, &mwork->work);
2486 	} else {
2487 		mmput(mm);
2488 	}
2489 }
2490 
2491 static void cpuset_post_attach(void)
2492 {
2493 	flush_workqueue(cpuset_migrate_mm_wq);
2494 }
2495 
2496 /*
2497  * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
2498  * @tsk: the task to change
2499  * @newmems: new nodes that the task will be set
2500  *
2501  * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
2502  * and rebind the task's mempolicy, if any. If the task is allocating in
2503  * parallel, it might temporarily see an empty intersection, which results in
2504  * a seqlock check and retry before OOM or allocation failure.
2505  */
2506 static void cpuset_change_task_nodemask(struct task_struct *tsk,
2507 					nodemask_t *newmems)
2508 {
2509 	task_lock(tsk);
2510 
2511 	local_irq_disable();
2512 	write_seqcount_begin(&tsk->mems_allowed_seq);
2513 
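	/*
	 * Grow the mask first (nodes_or) and only shrink it to *newmems with
	 * the assignment below, so a concurrent allocator is never left to
	 * observe a transiently empty mems_allowed between the two stores.
	 */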
2514 	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
2515 	mpol_rebind_task(tsk, newmems);
2516 	tsk->mems_allowed = *newmems;
2517 
2518 	write_seqcount_end(&tsk->mems_allowed_seq);
2519 	local_irq_enable();
2520 
2521 	task_unlock(tsk);
2522 }
2523 
2524 static void *cpuset_being_rebound;
2525 
2526 /**
2527  * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
2528  * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
2529  *
2530  * Iterate through each task of @cs updating its mems_allowed to the
2531  * effective cpuset's.  As this function is called with cpuset_mutex held,
2532  * cpuset membership stays stable.
2533  */
2534 void cpuset_update_tasks_nodemask(struct cpuset *cs)
2535 {
2536 	static nodemask_t newmems;	/* protected by cpuset_mutex */
2537 	struct css_task_iter it;
2538 	struct task_struct *task;
2539 
2540 	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */
2541 
2542 	guarantee_online_mems(cs, &newmems);
2543 
2544 	/*
2545 	 * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
2546 	 * take while holding tasklist_lock.  Forks can happen - the
2547 	 * mpol_dup() cpuset_being_rebound check will catch such forks,
2548 	 * and rebind their vma mempolicies too.  Because we still hold
2549 	 * the global cpuset_mutex, we know that no other rebind effort
2550 	 * will be contending for the global variable cpuset_being_rebound.
2551 	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
2552 	 * is idempotent.  Also migrate pages in each mm to new nodes.
2553 	 */
2554 	css_task_iter_start(&cs->css, 0, &it);
2555 	while ((task = css_task_iter_next(&it))) {
2556 		struct mm_struct *mm;
2557 		bool migrate;
2558 
2559 		cpuset_change_task_nodemask(task, &newmems);
2560 
2561 		mm = get_task_mm(task);
2562 		if (!mm)
2563 			continue;
2564 
2565 		migrate = is_memory_migrate(cs);
2566 
2567 		mpol_rebind_mm(mm, &cs->mems_allowed);
2568 		if (migrate)
2569 			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
2570 		else
2571 			mmput(mm);
2572 	}
2573 	css_task_iter_end(&it);
2574 
2575 	/*
2576 	 * All the tasks' nodemasks have been updated, update
2577 	 * cs->old_mems_allowed.
2578 	 */
2579 	cs->old_mems_allowed = newmems;
2580 
2581 	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
2582 	cpuset_being_rebound = NULL;
2583 }
2584 
2585 /*
2586  * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
2587  * @cs: the cpuset to consider
2588  * @new_mems: a temp variable for calculating new effective_mems
2589  *
2590  * When configured nodemask is changed, the effective nodemasks of this cpuset
2591  * and all its descendants need to be updated.
2592  *
2593  * On legacy hierarchy, effective_mems will be the same as mems_allowed.
2594  *
2595  * Called with cpuset_mutex held
2596  */
2597 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
2598 {
2599 	struct cpuset *cp;
2600 	struct cgroup_subsys_state *pos_css;
2601 
2602 	rcu_read_lock();
2603 	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
2604 		struct cpuset *parent = parent_cs(cp);
2605 
2606 		nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
2607 
2608 		/*
2609 		 * If it becomes empty, inherit the effective mask of the
2610 		 * parent, which is guaranteed to have some MEMs.
2611 		 */
2612 		if (is_in_v2_mode() && nodes_empty(*new_mems))
2613 			*new_mems = parent->effective_mems;
2614 
2615 		/* Skip the whole subtree if the nodemask remains the same. */
2616 		if (nodes_equal(*new_mems, cp->effective_mems)) {
2617 			pos_css = css_rightmost_descendant(pos_css);
2618 			continue;
2619 		}
2620 
2621 		if (!css_tryget_online(&cp->css))
2622 			continue;
2623 		rcu_read_unlock();
2624 
2625 		spin_lock_irq(&callback_lock);
2626 		cp->effective_mems = *new_mems;
2627 		spin_unlock_irq(&callback_lock);
2628 
2629 		WARN_ON(!is_in_v2_mode() &&
2630 			!nodes_equal(cp->mems_allowed, cp->effective_mems));
2631 
2632 		cpuset_update_tasks_nodemask(cp);
2633 
2634 		rcu_read_lock();
2635 		css_put(&cp->css);
2636 	}
2637 	rcu_read_unlock();
2638 }
2639 
2640 /*
2641  * Handle user request to change the 'mems' memory placement
2642  * of a cpuset.  Needs to validate the request, update the
2643  * cpusets mems_allowed, and for each task in the cpuset,
2644  * update mems_allowed and rebind task's mempolicy and any vma
2645  * mempolicies and if the cpuset is marked 'memory_migrate',
2646  * migrate the tasks pages to the new memory.
2647  *
2648  * Call with cpuset_mutex held. May take callback_lock during call.
2649  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
2650  * lock each such task's mm->mmap_lock, scan its vma's and rebind
2651  * their mempolicies to the cpusets new mems_allowed.
2652  */
2653 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
2654 			   const char *buf)
2655 {
2656 	int retval;
2657 
2658 	/*
2659 	 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
2660 	 * it's read-only
2661 	 */
2662 	if (cs == &top_cpuset) {
2663 		retval = -EACCES;
2664 		goto done;
2665 	}
2666 
2667 	/*
2668 	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
2669 	 * Since nodelist_parse() fails on an empty mask, we special case
2670 	 * that parsing.  The validate_change() call ensures that cpusets
2671 	 * with tasks have memory.
2672 	 */
2673 	if (!*buf) {
2674 		nodes_clear(trialcs->mems_allowed);
2675 	} else {
2676 		retval = nodelist_parse(buf, trialcs->mems_allowed);
2677 		if (retval < 0)
2678 			goto done;
2679 
2680 		if (!nodes_subset(trialcs->mems_allowed,
2681 				  top_cpuset.mems_allowed)) {
2682 			retval = -EINVAL;
2683 			goto done;
2684 		}
2685 	}
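	/*
	 * For example (illustrative), writing "0-1" to cpuset.mems is parsed
	 * by nodelist_parse() above into mems_allowed = { 0, 1 }.
	 */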
2686 
2687 	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
2688 		retval = 0;		/* Too easy - nothing to do */
2689 		goto done;
2690 	}
2691 	retval = validate_change(cs, trialcs);
2692 	if (retval < 0)
2693 		goto done;
2694 
2695 	check_insane_mems_config(&trialcs->mems_allowed);
2696 
2697 	spin_lock_irq(&callback_lock);
2698 	cs->mems_allowed = trialcs->mems_allowed;
2699 	spin_unlock_irq(&callback_lock);
2700 
2701 	/* use trialcs->mems_allowed as a temp variable */
2702 	update_nodemasks_hier(cs, &trialcs->mems_allowed);
2703 done:
2704 	return retval;
2705 }
2706 
2707 bool current_cpuset_is_being_rebound(void)
2708 {
2709 	bool ret;
2710 
2711 	rcu_read_lock();
2712 	ret = task_cs(current) == cpuset_being_rebound;
2713 	rcu_read_unlock();
2714 
2715 	return ret;
2716 }
2717 
2718 /*
2719  * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag
2720  * bit:		the bit to update (see cpuset_flagbits_t)
2721  * cs:		the cpuset to update
2722  * turning_on: 	whether the flag is being set or cleared
2723  *
2724  * Call with cpuset_mutex held.
2725  */
2726 
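/*
 * For example (illustrative), clearing cpuset.sched_load_balance on a v1
 * hierarchy ends up here with bit == CS_SCHED_LOAD_BALANCE and
 * turning_on == 0.
 */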
2727 int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
2728 		       int turning_on)
2729 {
2730 	struct cpuset *trialcs;
2731 	int balance_flag_changed;
2732 	int spread_flag_changed;
2733 	int err;
2734 
2735 	trialcs = alloc_trial_cpuset(cs);
2736 	if (!trialcs)
2737 		return -ENOMEM;
2738 
2739 	if (turning_on)
2740 		set_bit(bit, &trialcs->flags);
2741 	else
2742 		clear_bit(bit, &trialcs->flags);
2743 
2744 	err = validate_change(cs, trialcs);
2745 	if (err < 0)
2746 		goto out;
2747 
2748 	balance_flag_changed = (is_sched_load_balance(cs) !=
2749 				is_sched_load_balance(trialcs));
2750 
2751 	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
2752 			|| (is_spread_page(cs) != is_spread_page(trialcs)));
2753 
2754 	spin_lock_irq(&callback_lock);
2755 	cs->flags = trialcs->flags;
2756 	spin_unlock_irq(&callback_lock);
2757 
2758 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {
2759 		if (cpuset_v2())
2760 			cpuset_force_rebuild();
2761 		else
2762 			rebuild_sched_domains_locked();
2763 	}
2764 
2765 	if (spread_flag_changed)
2766 		cpuset1_update_tasks_flags(cs);
2767 out:
2768 	free_cpuset(trialcs);
2769 	return err;
2770 }
2771 
2772 /**
2773  * update_prstate - update partition_root_state
2774  * @cs: the cpuset to update
2775  * @new_prs: new partition root state
2776  * Return: 0 if successful, != 0 if error
2777  *
2778  * Call with cpuset_mutex held.
2779  */
2780 static int update_prstate(struct cpuset *cs, int new_prs)
2781 {
2782 	int err = PERR_NONE, old_prs = cs->partition_root_state;
2783 	struct cpuset *parent = parent_cs(cs);
2784 	struct tmpmasks tmpmask;
2785 	bool new_xcpus_state = false;
2786 
2787 	if (old_prs == new_prs)
2788 		return 0;
2789 
2790 	/*
2791 	 * Treat a previously invalid partition root as if it is a "member".
2792 	 */
2793 	if (new_prs && is_prs_invalid(old_prs))
2794 		old_prs = PRS_MEMBER;
2795 
2796 	if (alloc_cpumasks(NULL, &tmpmask))
2797 		return -ENOMEM;
2798 
2799 	/*
2800 	 * Set up effective_xcpus if not properly set yet; it will be cleared
2801 	 * later if the partition becomes invalid.
2802 	 */
2803 	if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) {
2804 		spin_lock_irq(&callback_lock);
2805 		cpumask_and(cs->effective_xcpus,
2806 			    cs->cpus_allowed, parent->effective_xcpus);
2807 		spin_unlock_irq(&callback_lock);
2808 	}
2809 
2810 	err = update_partition_exclusive(cs, new_prs);
2811 	if (err)
2812 		goto out;
2813 
2814 	if (!old_prs) {
2815 		/*
2816 		 * cpus_allowed and exclusive_cpus cannot be both empty.
2817 		 */
2818 		if (xcpus_empty(cs)) {
2819 			err = PERR_CPUSEMPTY;
2820 			goto out;
2821 		}
2822 
2823 		/*
2824 		 * If the parent is a valid partition root, enable a local partition.
2825 		 * Otherwise, enable a remote partition.
2826 		 */
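		/*
		 * For illustration: a cpuset under a valid partition root
		 * becomes a local partition here, while one whose parent is
		 * an ordinary member may still become a remote partition
		 * drawing CPUs directly from the top cpuset.
		 */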
2827 		if (is_partition_valid(parent)) {
2828 			enum partition_cmd cmd = (new_prs == PRS_ROOT)
2829 					       ? partcmd_enable : partcmd_enablei;
2830 
2831 			err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
2832 		} else {
2833 			err = remote_partition_enable(cs, new_prs, &tmpmask);
2834 		}
2835 	} else if (old_prs && new_prs) {
2836 		/*
2837 		 * A change in load balance state only, no change in cpumasks.
2838 		 */
2839 		new_xcpus_state = true;
2840 	} else {
2841 		/*
2842 		 * Switching back to member is always allowed even if it
2843 		 * disables child partitions.
2844 		 */
2845 		if (is_remote_partition(cs))
2846 			remote_partition_disable(cs, &tmpmask);
2847 		else
2848 			update_parent_effective_cpumask(cs, partcmd_disable,
2849 							NULL, &tmpmask);
2850 
2851 		/*
2852 		 * Invalidation of child partitions will be done in
2853 		 * update_cpumasks_hier().
2854 		 */
2855 	}
2856 out:
2857 	/*
2858 	 * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
2859 	 * happens.
2860 	 */
2861 	if (err) {
2862 		new_prs = -new_prs;
2863 		update_partition_exclusive(cs, new_prs);
2864 	}
2865 
2866 	spin_lock_irq(&callback_lock);
2867 	cs->partition_root_state = new_prs;
2868 	WRITE_ONCE(cs->prs_err, err);
2869 	if (!is_partition_valid(cs))
2870 		reset_partition_data(cs);
2871 	else if (new_xcpus_state)
2872 		partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
2873 	spin_unlock_irq(&callback_lock);
2874 	update_unbound_workqueue_cpumask(new_xcpus_state);
2875 
2876 	/* Force update if switching back to member */
2877 	update_cpumasks_hier(cs, &tmpmask, !new_prs);
2878 
2879 	/* Update sched domains and load balance flag */
2880 	update_partition_sd_lb(cs, old_prs);
2881 
2882 	notify_partition_change(cs, old_prs);
2883 	if (force_sd_rebuild)
2884 		rebuild_sched_domains_locked();
2885 	free_cpumasks(NULL, &tmpmask);
2886 	return 0;
2887 }
2888 
2889 static struct cpuset *cpuset_attach_old_cs;
2890 
2891 /*
2892  * Check to see if a cpuset can accept a new task
2893  * For v1, cpus_allowed and mems_allowed can't be empty.
2894  * For v2, effective_cpus can't be empty.
2895  * Note that in v1, effective_cpus = cpus_allowed.
2896  */
2897 static int cpuset_can_attach_check(struct cpuset *cs)
2898 {
2899 	if (cpumask_empty(cs->effective_cpus) ||
2900 	   (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
2901 		return -ENOSPC;
2902 	return 0;
2903 }
2904 
2905 static void reset_migrate_dl_data(struct cpuset *cs)
2906 {
2907 	cs->nr_migrate_dl_tasks = 0;
2908 	cs->sum_migrate_dl_bw = 0;
2909 }
2910 
2911 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
2912 static int cpuset_can_attach(struct cgroup_taskset *tset)
2913 {
2914 	struct cgroup_subsys_state *css;
2915 	struct cpuset *cs, *oldcs;
2916 	struct task_struct *task;
2917 	bool cpus_updated, mems_updated;
2918 	int ret;
2919 
2920 	/* used later by cpuset_attach() */
2921 	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
2922 	oldcs = cpuset_attach_old_cs;
2923 	cs = css_cs(css);
2924 
2925 	mutex_lock(&cpuset_mutex);
2926 
2927 	/* Check to see if task is allowed in the cpuset */
2928 	ret = cpuset_can_attach_check(cs);
2929 	if (ret)
2930 		goto out_unlock;
2931 
2932 	cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
2933 	mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
2934 
2935 	cgroup_taskset_for_each(task, css, tset) {
2936 		ret = task_can_attach(task);
2937 		if (ret)
2938 			goto out_unlock;
2939 
2940 		/*
2941 		 * Skip the rights-over-task check in v2 when nothing changes;
2942 		 * migration permission derives from hierarchy ownership in
2943 		 * cgroup_procs_write_permission().
2944 		 */
2945 		if (!cpuset_v2() || (cpus_updated || mems_updated)) {
2946 			ret = security_task_setscheduler(task);
2947 			if (ret)
2948 				goto out_unlock;
2949 		}
2950 
2951 		if (dl_task(task)) {
2952 			cs->nr_migrate_dl_tasks++;
2953 			cs->sum_migrate_dl_bw += task->dl.dl_bw;
2954 		}
2955 	}
2956 
2957 	if (!cs->nr_migrate_dl_tasks)
2958 		goto out_success;
2959 
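	/*
	 * Moving deadline tasks to a disjoint set of CPUs requires reserving
	 * their combined bandwidth in the destination root domain; any active
	 * CPU in cs->effective_cpus is enough to identify that domain for
	 * dl_bw_alloc().
	 */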
2960 	if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
2961 		int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
2962 
2963 		if (unlikely(cpu >= nr_cpu_ids)) {
2964 			reset_migrate_dl_data(cs);
2965 			ret = -EINVAL;
2966 			goto out_unlock;
2967 		}
2968 
2969 		ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
2970 		if (ret) {
2971 			reset_migrate_dl_data(cs);
2972 			goto out_unlock;
2973 		}
2974 	}
2975 
2976 out_success:
2977 	/*
2978 	 * Mark attach is in progress.  This makes validate_change() fail
2979 	 * changes which zero cpus/mems_allowed.
2980 	 */
2981 	cs->attach_in_progress++;
2982 out_unlock:
2983 	mutex_unlock(&cpuset_mutex);
2984 	return ret;
2985 }
2986 
2987 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
2988 {
2989 	struct cgroup_subsys_state *css;
2990 	struct cpuset *cs;
2991 
2992 	cgroup_taskset_first(tset, &css);
2993 	cs = css_cs(css);
2994 
2995 	mutex_lock(&cpuset_mutex);
2996 	dec_attach_in_progress_locked(cs);
2997 
2998 	if (cs->nr_migrate_dl_tasks) {
2999 		int cpu = cpumask_any(cs->effective_cpus);
3000 
3001 		dl_bw_free(cpu, cs->sum_migrate_dl_bw);
3002 		reset_migrate_dl_data(cs);
3003 	}
3004 
3005 	mutex_unlock(&cpuset_mutex);
3006 }
3007 
3008 /*
3009  * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
3010  * but we can't allocate it dynamically there.  Define it globally and
3011  * allocate it from cpuset_init().
3012  */
3013 static cpumask_var_t cpus_attach;
3014 static nodemask_t cpuset_attach_nodemask_to;
3015 
3016 static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
3017 {
3018 	lockdep_assert_held(&cpuset_mutex);
3019 
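	/*
	 * Tasks moving into the top cpuset may use any possible CPU except
	 * those handed out exclusively to sub-partitions; otherwise they are
	 * confined to the cpuset's online effective CPUs.
	 */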
3020 	if (cs != &top_cpuset)
3021 		guarantee_online_cpus(task, cpus_attach);
3022 	else
3023 		cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
3024 			       subpartitions_cpus);
3025 	/*
3026 	 * can_attach beforehand should guarantee that this doesn't
3027 	 * fail.  TODO: have a better way to handle failure here
3028 	 */
3029 	WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
3030 
3031 	cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
3032 	cpuset1_update_task_spread_flags(cs, task);
3033 }
3034 
3035 static void cpuset_attach(struct cgroup_taskset *tset)
3036 {
3037 	struct task_struct *task;
3038 	struct task_struct *leader;
3039 	struct cgroup_subsys_state *css;
3040 	struct cpuset *cs;
3041 	struct cpuset *oldcs = cpuset_attach_old_cs;
3042 	bool cpus_updated, mems_updated;
3043 
3044 	cgroup_taskset_first(tset, &css);
3045 	cs = css_cs(css);
3046 
3047 	lockdep_assert_cpus_held();	/* see cgroup_attach_lock() */
3048 	mutex_lock(&cpuset_mutex);
3049 	cpus_updated = !cpumask_equal(cs->effective_cpus,
3050 				      oldcs->effective_cpus);
3051 	mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
3052 
3053 	/*
3054 	 * In the default hierarchy, enabling cpuset in the child cgroups
3055 	 * will trigger a number of cpuset_attach() calls with no change
3056 	 * in effective cpus and mems. In that case, we can optimize out
3057 	 * by skipping the task iteration and update.
3058 	 */
3059 	if (cpuset_v2() && !cpus_updated && !mems_updated) {
3060 		cpuset_attach_nodemask_to = cs->effective_mems;
3061 		goto out;
3062 	}
3063 
3064 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
3065 
3066 	cgroup_taskset_for_each(task, css, tset)
3067 		cpuset_attach_task(cs, task);
3068 
3069 	/*
3070 	 * Change mm for all threadgroup leaders. This is expensive and may
3071 	 * sleep and should be moved outside migration path proper. Skip it
3072 	 * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
3073 	 * not set.
3074 	 */
3075 	cpuset_attach_nodemask_to = cs->effective_mems;
3076 	if (!is_memory_migrate(cs) && !mems_updated)
3077 		goto out;
3078 
3079 	cgroup_taskset_for_each_leader(leader, css, tset) {
3080 		struct mm_struct *mm = get_task_mm(leader);
3081 
3082 		if (mm) {
3083 			mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
3084 
3085 			/*
3086 			 * old_mems_allowed is the same as mems_allowed
3087 			 * here, except if this task is being moved
3088 			 * automatically due to hotplug.  In that case
3089 			 * @mems_allowed has been updated and is empty, so
3090 			 * @old_mems_allowed is the right nodesets that we
3091 			 * migrate mm from.
3092 			 */
3093 			if (is_memory_migrate(cs))
3094 				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
3095 						  &cpuset_attach_nodemask_to);
3096 			else
3097 				mmput(mm);
3098 		}
3099 	}
3100 
3101 out:
3102 	cs->old_mems_allowed = cpuset_attach_nodemask_to;
3103 
3104 	if (cs->nr_migrate_dl_tasks) {
3105 		cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
3106 		oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
3107 		reset_migrate_dl_data(cs);
3108 	}
3109 
3110 	dec_attach_in_progress_locked(cs);
3111 
3112 	mutex_unlock(&cpuset_mutex);
3113 }
3114 
3115 /*
3116  * Common handling for a write to a "cpus" or "mems" file.
3117  */
3118 ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
3119 				    char *buf, size_t nbytes, loff_t off)
3120 {
3121 	struct cpuset *cs = css_cs(of_css(of));
3122 	struct cpuset *trialcs;
3123 	int retval = -ENODEV;
3124 
3125 	buf = strstrip(buf);
3126 	cpus_read_lock();
3127 	mutex_lock(&cpuset_mutex);
3128 	if (!is_cpuset_online(cs))
3129 		goto out_unlock;
3130 
3131 	trialcs = alloc_trial_cpuset(cs);
3132 	if (!trialcs) {
3133 		retval = -ENOMEM;
3134 		goto out_unlock;
3135 	}
3136 
3137 	switch (of_cft(of)->private) {
3138 	case FILE_CPULIST:
3139 		retval = update_cpumask(cs, trialcs, buf);
3140 		break;
3141 	case FILE_EXCLUSIVE_CPULIST:
3142 		retval = update_exclusive_cpumask(cs, trialcs, buf);
3143 		break;
3144 	case FILE_MEMLIST:
3145 		retval = update_nodemask(cs, trialcs, buf);
3146 		break;
3147 	default:
3148 		retval = -EINVAL;
3149 		break;
3150 	}
3151 
3152 	free_cpuset(trialcs);
3153 	if (force_sd_rebuild)
3154 		rebuild_sched_domains_locked();
3155 out_unlock:
3156 	mutex_unlock(&cpuset_mutex);
3157 	cpus_read_unlock();
3158 	flush_workqueue(cpuset_migrate_mm_wq);
3159 	return retval ?: nbytes;
3160 }
3161 
3162 /*
3163  * These ascii lists should be read in a single call, by using a user
3164  * buffer large enough to hold the entire map.  If read in smaller
3165  * chunks, there is no guarantee of atomicity.  Since the display format
3166  * used (a list of ranges of sequential numbers) is variable length,
3167  * and since these maps can change value dynamically, one could read
3168  * gibberish by doing partial reads while a list was changing.
3169  */
3170 int cpuset_common_seq_show(struct seq_file *sf, void *v)
3171 {
3172 	struct cpuset *cs = css_cs(seq_css(sf));
3173 	cpuset_filetype_t type = seq_cft(sf)->private;
3174 	int ret = 0;
3175 
3176 	spin_lock_irq(&callback_lock);
3177 
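	/* "%*pbl" prints a bitmap as a ranged list, e.g. "0-3,8-11". */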
3178 	switch (type) {
3179 	case FILE_CPULIST:
3180 		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
3181 		break;
3182 	case FILE_MEMLIST:
3183 		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
3184 		break;
3185 	case FILE_EFFECTIVE_CPULIST:
3186 		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
3187 		break;
3188 	case FILE_EFFECTIVE_MEMLIST:
3189 		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
3190 		break;
3191 	case FILE_EXCLUSIVE_CPULIST:
3192 		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
3193 		break;
3194 	case FILE_EFFECTIVE_XCPULIST:
3195 		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
3196 		break;
3197 	case FILE_SUBPARTS_CPULIST:
3198 		seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
3199 		break;
3200 	case FILE_ISOLATED_CPULIST:
3201 		seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
3202 		break;
3203 	default:
3204 		ret = -EINVAL;
3205 	}
3206 
3207 	spin_unlock_irq(&callback_lock);
3208 	return ret;
3209 }
3210 
3211 static int sched_partition_show(struct seq_file *seq, void *v)
3212 {
3213 	struct cpuset *cs = css_cs(seq_css(seq));
3214 	const char *err, *type = NULL;
3215 
3216 	switch (cs->partition_root_state) {
3217 	case PRS_ROOT:
3218 		seq_puts(seq, "root\n");
3219 		break;
3220 	case PRS_ISOLATED:
3221 		seq_puts(seq, "isolated\n");
3222 		break;
3223 	case PRS_MEMBER:
3224 		seq_puts(seq, "member\n");
3225 		break;
3226 	case PRS_INVALID_ROOT:
3227 		type = "root";
3228 		fallthrough;
3229 	case PRS_INVALID_ISOLATED:
3230 		if (!type)
3231 			type = "isolated";
3232 		err = perr_strings[READ_ONCE(cs->prs_err)];
3233 		if (err)
3234 			seq_printf(seq, "%s invalid (%s)\n", type, err);
3235 		else
3236 			seq_printf(seq, "%s invalid\n", type);
3237 		break;
3238 	}
3239 	return 0;
3240 }
3241 
3242 static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
3243 				     size_t nbytes, loff_t off)
3244 {
3245 	struct cpuset *cs = css_cs(of_css(of));
3246 	int val;
3247 	int retval = -ENODEV;
3248 
3249 	buf = strstrip(buf);
3250 
3251 	if (!strcmp(buf, "root"))
3252 		val = PRS_ROOT;
3253 	else if (!strcmp(buf, "member"))
3254 		val = PRS_MEMBER;
3255 	else if (!strcmp(buf, "isolated"))
3256 		val = PRS_ISOLATED;
3257 	else
3258 		return -EINVAL;
3259 
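	/*
	 * Illustrative usage: "echo isolated > cpuset.cpus.partition" arrives
	 * here with val == PRS_ISOLATED and is applied by update_prstate()
	 * below.
	 */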
3260 	css_get(&cs->css);
3261 	cpus_read_lock();
3262 	mutex_lock(&cpuset_mutex);
3263 	if (!is_cpuset_online(cs))
3264 		goto out_unlock;
3265 
3266 	retval = update_prstate(cs, val);
3267 out_unlock:
3268 	mutex_unlock(&cpuset_mutex);
3269 	cpus_read_unlock();
3270 	css_put(&cs->css);
3271 	return retval ?: nbytes;
3272 }
3273 
3274 /*
3275  * This is currently a minimal set for the default hierarchy. It can be
3276  * expanded later on by migrating more features and control files from v1.
3277  */
3278 static struct cftype dfl_files[] = {
3279 	{
3280 		.name = "cpus",
3281 		.seq_show = cpuset_common_seq_show,
3282 		.write = cpuset_write_resmask,
3283 		.max_write_len = (100U + 6 * NR_CPUS),
3284 		.private = FILE_CPULIST,
3285 		.flags = CFTYPE_NOT_ON_ROOT,
3286 	},
3287 
3288 	{
3289 		.name = "mems",
3290 		.seq_show = cpuset_common_seq_show,
3291 		.write = cpuset_write_resmask,
3292 		.max_write_len = (100U + 6 * MAX_NUMNODES),
3293 		.private = FILE_MEMLIST,
3294 		.flags = CFTYPE_NOT_ON_ROOT,
3295 	},
3296 
3297 	{
3298 		.name = "cpus.effective",
3299 		.seq_show = cpuset_common_seq_show,
3300 		.private = FILE_EFFECTIVE_CPULIST,
3301 	},
3302 
3303 	{
3304 		.name = "mems.effective",
3305 		.seq_show = cpuset_common_seq_show,
3306 		.private = FILE_EFFECTIVE_MEMLIST,
3307 	},
3308 
3309 	{
3310 		.name = "cpus.partition",
3311 		.seq_show = sched_partition_show,
3312 		.write = sched_partition_write,
3313 		.private = FILE_PARTITION_ROOT,
3314 		.flags = CFTYPE_NOT_ON_ROOT,
3315 		.file_offset = offsetof(struct cpuset, partition_file),
3316 	},
3317 
3318 	{
3319 		.name = "cpus.exclusive",
3320 		.seq_show = cpuset_common_seq_show,
3321 		.write = cpuset_write_resmask,
3322 		.max_write_len = (100U + 6 * NR_CPUS),
3323 		.private = FILE_EXCLUSIVE_CPULIST,
3324 		.flags = CFTYPE_NOT_ON_ROOT,
3325 	},
3326 
3327 	{
3328 		.name = "cpus.exclusive.effective",
3329 		.seq_show = cpuset_common_seq_show,
3330 		.private = FILE_EFFECTIVE_XCPULIST,
3331 		.flags = CFTYPE_NOT_ON_ROOT,
3332 	},
3333 
3334 	{
3335 		.name = "cpus.subpartitions",
3336 		.seq_show = cpuset_common_seq_show,
3337 		.private = FILE_SUBPARTS_CPULIST,
3338 		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
3339 	},
3340 
3341 	{
3342 		.name = "cpus.isolated",
3343 		.seq_show = cpuset_common_seq_show,
3344 		.private = FILE_ISOLATED_CPULIST,
3345 		.flags = CFTYPE_ONLY_ON_ROOT,
3346 	},
3347 
3348 	{ }	/* terminate */
3349 };
3350 
3351 
3352 /**
3353  * cpuset_css_alloc - Allocate a cpuset css
3354  * @parent_css: Parent css of the control group that the new cpuset will be
3355  *              part of
3356  * Return: cpuset css on success, -ENOMEM on failure.
3357  *
3358  * Allocate and initialize a new cpuset css for non-NULL @parent_css; return
3359  * the top cpuset css otherwise.
3360  */
3361 static struct cgroup_subsys_state *
3362 cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
3363 {
3364 	struct cpuset *cs;
3365 
3366 	if (!parent_css)
3367 		return &top_cpuset.css;
3368 
3369 	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
3370 	if (!cs)
3371 		return ERR_PTR(-ENOMEM);
3372 
3373 	if (alloc_cpumasks(cs, NULL)) {
3374 		kfree(cs);
3375 		return ERR_PTR(-ENOMEM);
3376 	}
3377 
3378 	__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
3379 	fmeter_init(&cs->fmeter);
3380 	cs->relax_domain_level = -1;
3381 	INIT_LIST_HEAD(&cs->remote_sibling);
3382 
3383 	/* Set CS_MEMORY_MIGRATE for default hierarchy */
3384 	if (cpuset_v2())
3385 		__set_bit(CS_MEMORY_MIGRATE, &cs->flags);
3386 
3387 	return &cs->css;
3388 }
3389 
3390 static int cpuset_css_online(struct cgroup_subsys_state *css)
3391 {
3392 	struct cpuset *cs = css_cs(css);
3393 	struct cpuset *parent = parent_cs(cs);
3394 	struct cpuset *tmp_cs;
3395 	struct cgroup_subsys_state *pos_css;
3396 
3397 	if (!parent)
3398 		return 0;
3399 
3400 	cpus_read_lock();
3401 	mutex_lock(&cpuset_mutex);
3402 
3403 	set_bit(CS_ONLINE, &cs->flags);
3404 	if (is_spread_page(parent))
3405 		set_bit(CS_SPREAD_PAGE, &cs->flags);
3406 	if (is_spread_slab(parent))
3407 		set_bit(CS_SPREAD_SLAB, &cs->flags);
3408 	/*
3409 	 * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
3410 	 */
3411 	if (cpuset_v2() && !is_sched_load_balance(parent))
3412 		clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
3413 
3414 	cpuset_inc();
3415 
3416 	spin_lock_irq(&callback_lock);
3417 	if (is_in_v2_mode()) {
3418 		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
3419 		cs->effective_mems = parent->effective_mems;
3420 	}
3421 	spin_unlock_irq(&callback_lock);
3422 
3423 	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
3424 		goto out_unlock;
3425 
3426 	/*
3427 	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
3428 	 * set.  This flag handling is implemented in cgroup core for
3429 	 * historical reasons - the flag may be specified during mount.
3430 	 *
3431 	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
3432 	 * refuse to clone the configuration - thereby refusing to let the task
3433 	 * be entered, and as a result refusing the sys_unshare() or
3434 	 * clone() which initiated it.  If this becomes a problem for some
3435 	 * users who wish to allow that scenario, then this could be
3436 	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
3437 	 * (and likewise for mems) to the new cgroup.
3438 	 */
3439 	rcu_read_lock();
3440 	cpuset_for_each_child(tmp_cs, pos_css, parent) {
3441 		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
3442 			rcu_read_unlock();
3443 			goto out_unlock;
3444 		}
3445 	}
3446 	rcu_read_unlock();
3447 
3448 	spin_lock_irq(&callback_lock);
3449 	cs->mems_allowed = parent->mems_allowed;
3450 	cs->effective_mems = parent->mems_allowed;
3451 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
3452 	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
3453 	spin_unlock_irq(&callback_lock);
3454 out_unlock:
3455 	mutex_unlock(&cpuset_mutex);
3456 	cpus_read_unlock();
3457 	return 0;
3458 }
3459 
3460 /*
3461  * If the cpuset being removed has its flag 'sched_load_balance'
3462  * enabled, then simulate turning sched_load_balance off, which
3463  * will call rebuild_sched_domains_locked(). That is not needed
3464  * in the default hierarchy where only changes in partition
3465  * will cause repartitioning.
3466  *
3467  * If the cpuset has the 'sched.partition' flag enabled, simulate
3468  * turning 'sched.partition' off.
3469  */
3470 
3471 static void cpuset_css_offline(struct cgroup_subsys_state *css)
3472 {
3473 	struct cpuset *cs = css_cs(css);
3474 
3475 	cpus_read_lock();
3476 	mutex_lock(&cpuset_mutex);
3477 
3478 	if (is_partition_valid(cs))
3479 		update_prstate(cs, 0);
3480 
3481 	if (!cpuset_v2() && is_sched_load_balance(cs))
3482 		cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
3483 
3484 	cpuset_dec();
3485 	clear_bit(CS_ONLINE, &cs->flags);
3486 
3487 	mutex_unlock(&cpuset_mutex);
3488 	cpus_read_unlock();
3489 }
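
/*
 * The update_prstate(cs, 0) call above resets a valid partition when its
 * cpuset is removed; the same state machine is driven from userspace by
 * the v2 cpuset.cpus.partition file.  A minimal userspace sketch, not part
 * of this file, assuming cgroup2 mounted at /sys/fs/cgroup, the cpuset
 * controller enabled in the parent's cgroup.subtree_control, and a made-up
 * child named "demo" whose requested CPUs can be granted exclusively:
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static void write_str(const char *path, const char *val)
 *	{
 *		int fd = open(path, O_WRONLY);
 *
 *		if (fd < 0) {
 *			perror(path);
 *			return;
 *		}
 *		if (write(fd, val, strlen(val)) < 0)
 *			perror(path);
 *		close(fd);
 *	}
 *
 *	int main(void)
 *	{
 *		// Give the candidate partition some CPUs of its own first.
 *		write_str("/sys/fs/cgroup/demo/cpuset.cpus", "2-3");
 *
 *		// Turn the cpuset into a partition root, then back into an
 *		// ordinary member.  Removing a valid partition with rmdir
 *		// ends up in the update_prstate(cs, 0) call above.
 *		write_str("/sys/fs/cgroup/demo/cpuset.cpus.partition", "root");
 *		write_str("/sys/fs/cgroup/demo/cpuset.cpus.partition", "member");
 *		return 0;
 *	}
 */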
3490 
3491 static void cpuset_css_free(struct cgroup_subsys_state *css)
3492 {
3493 	struct cpuset *cs = css_cs(css);
3494 
3495 	free_cpuset(cs);
3496 }
3497 
3498 static void cpuset_bind(struct cgroup_subsys_state *root_css)
3499 {
3500 	mutex_lock(&cpuset_mutex);
3501 	spin_lock_irq(&callback_lock);
3502 
3503 	if (is_in_v2_mode()) {
3504 		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
3505 		cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);
3506 		top_cpuset.mems_allowed = node_possible_map;
3507 	} else {
3508 		cpumask_copy(top_cpuset.cpus_allowed,
3509 			     top_cpuset.effective_cpus);
3510 		top_cpuset.mems_allowed = top_cpuset.effective_mems;
3511 	}
3512 
3513 	spin_unlock_irq(&callback_lock);
3514 	mutex_unlock(&cpuset_mutex);
3515 }
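
/*
 * cpuset_bind() runs when the cpuset controller is bound to a hierarchy
 * root: once at boot for the default hierarchy and again when a legacy
 * (v1) cpuset hierarchy is mounted (see the comment in cpuset_init_smp()
 * below).  A minimal userspace sketch, not part of this file, assuming
 * CONFIG_CPUSETS_V1, CAP_SYS_ADMIN and an arbitrary mount point
 * /mnt/cpuset:
 *
 *	#include <stdio.h>
 *	#include <sys/mount.h>
 *	#include <sys/stat.h>
 *
 *	int main(void)
 *	{
 *		mkdir("/mnt/cpuset", 0755);
 *
 *		// Mounting a v1 hierarchy with only the cpuset controller
 *		// rebinds the subsystem and ends up calling cpuset_bind().
 *		if (mount("cpuset", "/mnt/cpuset", "cgroup", 0, "cpuset"))
 *			perror("mount");
 *		return 0;
 *	}
 */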
3516 
3517 /*
3518  * In case the child is cloned into a cpuset different from that of the
3519  * forking task, additional checks are done to see if the move is allowed.
3520  */
3521 static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
3522 {
3523 	struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
3524 	bool same_cs;
3525 	int ret;
3526 
3527 	rcu_read_lock();
3528 	same_cs = (cs == task_cs(current));
3529 	rcu_read_unlock();
3530 
3531 	if (same_cs)
3532 		return 0;
3533 
3534 	lockdep_assert_held(&cgroup_mutex);
3535 	mutex_lock(&cpuset_mutex);
3536 
3537 	/* Check to see if task is allowed in the cpuset */
3538 	ret = cpuset_can_attach_check(cs);
3539 	if (ret)
3540 		goto out_unlock;
3541 
3542 	ret = task_can_attach(task);
3543 	if (ret)
3544 		goto out_unlock;
3545 
3546 	ret = security_task_setscheduler(task);
3547 	if (ret)
3548 		goto out_unlock;
3549 
3550 	/*
3551 	 * Mark attach is in progress.  This makes validate_change() fail
3552 	 * changes which zero cpus/mems_allowed.
3553 	 */
3554 	cs->attach_in_progress++;
3555 out_unlock:
3556 	mutex_unlock(&cpuset_mutex);
3557 	return ret;
3558 }
3559 
3560 static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
3561 {
3562 	struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
3563 	bool same_cs;
3564 
3565 	rcu_read_lock();
3566 	same_cs = (cs == task_cs(current));
3567 	rcu_read_unlock();
3568 
3569 	if (same_cs)
3570 		return;
3571 
3572 	dec_attach_in_progress(cs);
3573 }
3574 
3575 /*
3576  * Make sure the new task conforms to the current state of its parent,
3577  * which could have been changed by cpuset operations after the task
3578  * inherited that state and before it was added to the cgroup's task list.
3579  */
3580 static void cpuset_fork(struct task_struct *task)
3581 {
3582 	struct cpuset *cs;
3583 	bool same_cs;
3584 
3585 	rcu_read_lock();
3586 	cs = task_cs(task);
3587 	same_cs = (cs == task_cs(current));
3588 	rcu_read_unlock();
3589 
3590 	if (same_cs) {
3591 		if (cs == &top_cpuset)
3592 			return;
3593 
3594 		set_cpus_allowed_ptr(task, current->cpus_ptr);
3595 		task->mems_allowed = current->mems_allowed;
3596 		return;
3597 	}
3598 
3599 	/* CLONE_INTO_CGROUP */
3600 	mutex_lock(&cpuset_mutex);
3601 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
3602 	cpuset_attach_task(cs, task);
3603 
3604 	dec_attach_in_progress_locked(cs);
3605 	mutex_unlock(&cpuset_mutex);
3606 }
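
/*
 * The CLONE_INTO_CGROUP branch above is reached when a task is created
 * directly inside a different cpuset via clone3().  A minimal userspace
 * sketch, not part of this file, assuming Linux 5.7+ headers, permission
 * to the target cgroup, and a made-up v2 cgroup path /sys/fs/cgroup/demo
 * that is a valid, non-empty cpuset:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <linux/sched.h>	// struct clone_args, CLONE_INTO_CGROUP
 *	#include <signal.h>
 *	#include <stdio.h>
 *	#include <sys/syscall.h>
 *	#include <sys/wait.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int cgfd = open("/sys/fs/cgroup/demo", O_RDONLY | O_DIRECTORY);
 *		struct clone_args args = {
 *			.flags		= CLONE_INTO_CGROUP,
 *			.exit_signal	= SIGCHLD,
 *			.cgroup		= (unsigned long long)cgfd,
 *		};
 *		long pid;
 *
 *		if (cgfd < 0) {
 *			perror("open cgroup");
 *			return 1;
 *		}
 *
 *		// The child starts life in the target cgroup, so it takes the
 *		// cpuset_can_fork()/cpuset_fork() path above instead of a
 *		// separate attach after fork().
 *		pid = syscall(SYS_clone3, &args, sizeof(args));
 *		if (pid == 0)
 *			_exit(0);		// child
 *		if (pid < 0)
 *			perror("clone3");
 *		else
 *			waitpid(pid, NULL, 0);
 *		return 0;
 *	}
 */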
3607 
3608 struct cgroup_subsys cpuset_cgrp_subsys = {
3609 	.css_alloc	= cpuset_css_alloc,
3610 	.css_online	= cpuset_css_online,
3611 	.css_offline	= cpuset_css_offline,
3612 	.css_free	= cpuset_css_free,
3613 	.can_attach	= cpuset_can_attach,
3614 	.cancel_attach	= cpuset_cancel_attach,
3615 	.attach		= cpuset_attach,
3616 	.post_attach	= cpuset_post_attach,
3617 	.bind		= cpuset_bind,
3618 	.can_fork	= cpuset_can_fork,
3619 	.cancel_fork	= cpuset_cancel_fork,
3620 	.fork		= cpuset_fork,
3621 #ifdef CONFIG_CPUSETS_V1
3622 	.legacy_cftypes	= cpuset1_files,
3623 #endif
3624 	.dfl_cftypes	= dfl_files,
3625 	.early_init	= true,
3626 	.threaded	= true,
3627 };
3628 
3629 /**
3630  * cpuset_init - initialize cpusets at system boot
3631  *
3632  * Description: Initialize top_cpuset
3633  **/
3634 
3635 int __init cpuset_init(void)
3636 {
3637 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
3638 	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
3639 	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
3640 	BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
3641 	BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
3642 	BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
3643 
3644 	cpumask_setall(top_cpuset.cpus_allowed);
3645 	nodes_setall(top_cpuset.mems_allowed);
3646 	cpumask_setall(top_cpuset.effective_cpus);
3647 	cpumask_setall(top_cpuset.effective_xcpus);
3648 	cpumask_setall(top_cpuset.exclusive_cpus);
3649 	nodes_setall(top_cpuset.effective_mems);
3650 
3651 	fmeter_init(&top_cpuset.fmeter);
3652 	INIT_LIST_HEAD(&remote_children);
3653 
3654 	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
3655 
3656 	have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN);
3657 	if (have_boot_isolcpus) {
3658 		BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL));
3659 		cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN));
3660 		cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus);
3661 	}
3662 
3663 	return 0;
3664 }
3665 
3666 static void
3667 hotplug_update_tasks(struct cpuset *cs,
3668 		     struct cpumask *new_cpus, nodemask_t *new_mems,
3669 		     bool cpus_updated, bool mems_updated)
3670 {
3671 	/* A partition root is allowed to have empty effective cpus */
3672 	if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
3673 		cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
3674 	if (nodes_empty(*new_mems))
3675 		*new_mems = parent_cs(cs)->effective_mems;
3676 
3677 	spin_lock_irq(&callback_lock);
3678 	cpumask_copy(cs->effective_cpus, new_cpus);
3679 	cs->effective_mems = *new_mems;
3680 	spin_unlock_irq(&callback_lock);
3681 
3682 	if (cpus_updated)
3683 		cpuset_update_tasks_cpumask(cs, new_cpus);
3684 	if (mems_updated)
3685 		cpuset_update_tasks_nodemask(cs);
3686 }
3687 
3688 void cpuset_force_rebuild(void)
3689 {
3690 	force_sd_rebuild = true;
3691 }
3692 
3693 /**
3694  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
3695  * @cs: cpuset in interest
3696  * @tmp: the tmpmasks structure pointer
3697  *
3698  * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
3699  * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
3700  * all its tasks are moved to the nearest ancestor with both resources.
3701  */
3702 static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
3703 {
3704 	static cpumask_t new_cpus;
3705 	static nodemask_t new_mems;
3706 	bool cpus_updated;
3707 	bool mems_updated;
3708 	bool remote;
3709 	int partcmd = -1;
3710 	struct cpuset *parent;
3711 retry:
3712 	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
3713 
3714 	mutex_lock(&cpuset_mutex);
3715 
3716 	/*
3717 	 * We have raced with task attaching. We wait until attaching
3718 	 * is finished, so we won't attach a task to an empty cpuset.
3719 	 */
3720 	if (cs->attach_in_progress) {
3721 		mutex_unlock(&cpuset_mutex);
3722 		goto retry;
3723 	}
3724 
3725 	parent = parent_cs(cs);
3726 	compute_effective_cpumask(&new_cpus, cs, parent);
3727 	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
3728 
3729 	if (!tmp || !cs->partition_root_state)
3730 		goto update_tasks;
3731 
3732 	/*
3733 	 * Compute effective_cpus for a valid partition root; this may
3734 	 * invalidate child partition roots if necessary.
3735 	 */
3736 	remote = is_remote_partition(cs);
3737 	if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
3738 		compute_partition_effective_cpumask(cs, &new_cpus);
3739 
3740 	if (remote && cpumask_empty(&new_cpus) &&
3741 	    partition_is_populated(cs, NULL)) {
3742 		remote_partition_disable(cs, tmp);
3743 		compute_effective_cpumask(&new_cpus, cs, parent);
3744 		remote = false;
3745 		cpuset_force_rebuild();
3746 	}
3747 
3748 	/*
3749 	 * Force the partition to become invalid if either one of
3750 	 * the following conditions holds:
3751 	 * 1) empty effective cpus but not valid empty partition.
3752 	 * 2) parent is invalid or doesn't grant any cpus to child
3753 	 *    partitions.
3754 	 */
3755 	if (is_local_partition(cs) && (!is_partition_valid(parent) ||
3756 				tasks_nocpu_error(parent, cs, &new_cpus)))
3757 		partcmd = partcmd_invalidate;
3758 	/*
3759 	 * On the other hand, an invalid partition root may be transitioned
3760 	 * back to a regular one.
3761 	 */
3762 	else if (is_partition_valid(parent) && is_partition_invalid(cs))
3763 		partcmd = partcmd_update;
3764 
3765 	if (partcmd >= 0) {
3766 		update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
3767 		if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
3768 			compute_partition_effective_cpumask(cs, &new_cpus);
3769 			cpuset_force_rebuild();
3770 		}
3771 	}
3772 
3773 update_tasks:
3774 	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
3775 	mems_updated = !nodes_equal(new_mems, cs->effective_mems);
3776 	if (!cpus_updated && !mems_updated)
3777 		goto unlock;	/* Hotplug doesn't affect this cpuset */
3778 
3779 	if (mems_updated)
3780 		check_insane_mems_config(&new_mems);
3781 
3782 	if (is_in_v2_mode())
3783 		hotplug_update_tasks(cs, &new_cpus, &new_mems,
3784 				     cpus_updated, mems_updated);
3785 	else
3786 		cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems,
3787 					    cpus_updated, mems_updated);
3788 
3789 unlock:
3790 	mutex_unlock(&cpuset_mutex);
3791 }
3792 
3793 /**
3794  * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
3795  *
3796  * This function is called after either CPU or memory configuration has
3797  * changed and updates cpuset accordingly.  The top_cpuset is always
3798  * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
3799  * order to make cpusets transparent (of no effect) on systems that are
3800  * actively using CPU hotplug but making no active use of cpusets.
3801  *
3802  * Non-root cpusets are only affected by offlining.  If any CPUs or memory
3803  * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
3804  * all descendants.
3805  *
3806  * Note that CPU offlining during suspend is ignored.  We don't modify
3807  * cpusets across suspend/resume cycles at all.
3808  *
3809  * CPU / memory hotplug is handled synchronously.
3810  */
3811 static void cpuset_handle_hotplug(void)
3812 {
3813 	static cpumask_t new_cpus;
3814 	static nodemask_t new_mems;
3815 	bool cpus_updated, mems_updated;
3816 	bool on_dfl = is_in_v2_mode();
3817 	struct tmpmasks tmp, *ptmp = NULL;
3818 
3819 	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
3820 		ptmp = &tmp;
3821 
3822 	lockdep_assert_cpus_held();
3823 	mutex_lock(&cpuset_mutex);
3824 
3825 	/* fetch the available cpus/mems and find out which changed how */
3826 	cpumask_copy(&new_cpus, cpu_active_mask);
3827 	new_mems = node_states[N_MEMORY];
3828 
3829 	/*
3830 	 * If subpartitions_cpus is populated, it is likely that the check
3831 	 * below will produce a false positive on cpus_updated even when the
3832 	 * cpu list hasn't changed. It is extra work, but it is better to be safe.
3833 	 */
3834 	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
3835 		       !cpumask_empty(subpartitions_cpus);
3836 	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
3837 
3838 	/* For v1, synchronize cpus_allowed to cpu_active_mask */
3839 	if (cpus_updated) {
3840 		cpuset_force_rebuild();
3841 		spin_lock_irq(&callback_lock);
3842 		if (!on_dfl)
3843 			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
3844 		/*
3845 		 * Make sure that CPUs allocated to child partitions
3846 		 * do not show up in effective_cpus. If no CPU is left,
3847 		 * we clear the subpartitions_cpus and let the child partitions
3848 		 * fight for the CPUs again.
3849 		 */
3850 		if (!cpumask_empty(subpartitions_cpus)) {
3851 			if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
3852 				top_cpuset.nr_subparts = 0;
3853 				cpumask_clear(subpartitions_cpus);
3854 			} else {
3855 				cpumask_andnot(&new_cpus, &new_cpus,
3856 					       subpartitions_cpus);
3857 			}
3858 		}
3859 		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
3860 		spin_unlock_irq(&callback_lock);
3861 		/* we don't mess with cpumasks of tasks in top_cpuset */
3862 	}
3863 
3864 	/* synchronize mems_allowed to N_MEMORY */
3865 	if (mems_updated) {
3866 		spin_lock_irq(&callback_lock);
3867 		if (!on_dfl)
3868 			top_cpuset.mems_allowed = new_mems;
3869 		top_cpuset.effective_mems = new_mems;
3870 		spin_unlock_irq(&callback_lock);
3871 		cpuset_update_tasks_nodemask(&top_cpuset);
3872 	}
3873 
3874 	mutex_unlock(&cpuset_mutex);
3875 
3876 	/* if cpus or mems changed, we need to propagate to descendants */
3877 	if (cpus_updated || mems_updated) {
3878 		struct cpuset *cs;
3879 		struct cgroup_subsys_state *pos_css;
3880 
3881 		rcu_read_lock();
3882 		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
3883 			if (cs == &top_cpuset || !css_tryget_online(&cs->css))
3884 				continue;
3885 			rcu_read_unlock();
3886 
3887 			cpuset_hotplug_update_tasks(cs, ptmp);
3888 
3889 			rcu_read_lock();
3890 			css_put(&cs->css);
3891 		}
3892 		rcu_read_unlock();
3893 	}
3894 
3895 	/* rebuild sched domains if necessary */
3896 	if (force_sd_rebuild)
3897 		rebuild_sched_domains_cpuslocked();
3898 
3899 	free_cpumasks(NULL, ptmp);
3900 }
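
/*
 * One way to observe the top_cpuset synchronization described above is to
 * offline a CPU and watch the root cgroup's effective mask shrink.  A
 * minimal userspace sketch, not part of this file, assuming cgroup2 mounted
 * at /sys/fs/cgroup, root privileges, and that cpu3 exists and may be
 * offlined:
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	static void show_effective(void)
 *	{
 *		char buf[256];
 *		ssize_t n;
 *		int fd = open("/sys/fs/cgroup/cpuset.cpus.effective", O_RDONLY);
 *
 *		if (fd < 0)
 *			return;
 *		n = read(fd, buf, sizeof(buf) - 1);
 *		if (n > 0) {
 *			buf[n] = '\0';
 *			printf("effective cpus: %s", buf);
 *		}
 *		close(fd);
 *	}
 *
 *	int main(void)
 *	{
 *		int fd = open("/sys/devices/system/cpu/cpu3/online", O_WRONLY);
 *
 *		if (fd < 0) {
 *			perror("cpu3/online");
 *			return 1;
 *		}
 *		show_effective();
 *		// Offlining cpu3 goes through cpuset_update_active_cpus() ->
 *		// cpuset_handle_hotplug(), which updates effective_cpus.
 *		if (write(fd, "0", 1) < 0)
 *			perror("offline");
 *		show_effective();
 *		if (write(fd, "1", 1) < 0)	// bring the cpu back online
 *			perror("online");
 *		close(fd);
 *		return 0;
 *	}
 */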
3901 
3902 void cpuset_update_active_cpus(void)
3903 {
3904 	/*
3905 	 * We're inside the cpu hotplug critical region which usually nests
3906 	 * inside cgroup synchronization.  Hotplug processing is handled
3907 	 * synchronously here by cpuset_handle_hotplug().
3908 	 */
3909 	cpuset_handle_hotplug();
3910 }
3911 
3912 /*
3913  * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
3914  * Call this routine anytime after node_states[N_MEMORY] changes.
3915  * See cpuset_update_active_cpus() for CPU hotplug handling.
3916  */
3917 static int cpuset_track_online_nodes(struct notifier_block *self,
3918 				unsigned long action, void *arg)
3919 {
3920 	cpuset_handle_hotplug();
3921 	return NOTIFY_OK;
3922 }
3923 
3924 /**
3925  * cpuset_init_smp - initialize cpus_allowed
3926  *
3927  * Description: Finish top_cpuset setup after cpu and node maps are initialized
3928  */
3929 void __init cpuset_init_smp(void)
3930 {
3931 	/*
3932 	 * cpus_allowed/mems_allowed set to v2 values in the initial
3933 	 * cpuset_bind() call will be reset to v1 values in another
3934 	 * cpuset_bind() call when v1 cpuset is mounted.
3935 	 */
3936 	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
3937 
3938 	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
3939 	top_cpuset.effective_mems = node_states[N_MEMORY];
3940 
3941 	hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
3942 
3943 	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
3944 	BUG_ON(!cpuset_migrate_mm_wq);
3945 }
3946 
3947 /**
3948  * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
3949  * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
3950  * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
3951  *
3952  * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
3953  * attached to the specified @tsk.  Guaranteed to return some non-empty
3954  * subset of cpu_online_mask, even if this means going outside the
3955  * task's cpuset, except when the task is in the top cpuset.
3956  **/
3957 
3958 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
3959 {
3960 	unsigned long flags;
3961 	struct cpuset *cs;
3962 
3963 	spin_lock_irqsave(&callback_lock, flags);
3964 	rcu_read_lock();
3965 
3966 	cs = task_cs(tsk);
3967 	if (cs != &top_cpuset)
3968 		guarantee_online_cpus(tsk, pmask);
3969 	/*
3970 	 * Tasks in the top cpuset won't get their cpumasks updated
3971 	 * when a hotplug online/offline event happens. So we include all
3972 	 * offline cpus in the allowed cpu list.
3973 	 */
3974 	if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
3975 		const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
3976 
3977 		/*
3978 		 * We first exclude cpus allocated to partitions. If there is no
3979 		 * allowable online cpu left, we fall back to all possible cpus.
3980 		 */
3981 		cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
3982 		if (!cpumask_intersects(pmask, cpu_online_mask))
3983 			cpumask_copy(pmask, possible_mask);
3984 	}
3985 
3986 	rcu_read_unlock();
3987 	spin_unlock_irqrestore(&callback_lock, flags);
3988 }
3989 
3990 /**
3991  * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
3992  * @tsk: pointer to task_struct with which the scheduler is struggling
3993  *
3994  * Description: In the case that the scheduler cannot find an allowed cpu in
3995  * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
3996  * mode however, this value is the same as task_cs(tsk)->effective_cpus,
3997  * which will not contain a sane cpumask during cases such as cpu hotplugging.
3998  * This is the absolute last resort for the scheduler and it is only used if
3999  * _every_ other avenue has been traveled.
4000  *
4001  * Returns true if the affinity of @tsk was changed, false otherwise.
4002  **/
4003 
4004 bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
4005 {
4006 	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
4007 	const struct cpumask *cs_mask;
4008 	bool changed = false;
4009 
4010 	rcu_read_lock();
4011 	cs_mask = task_cs(tsk)->cpus_allowed;
4012 	if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
4013 		do_set_cpus_allowed(tsk, cs_mask);
4014 		changed = true;
4015 	}
4016 	rcu_read_unlock();
4017 
4018 	/*
4019 	 * We own tsk->cpus_allowed, nobody can change it under us.
4020 	 *
4021 	 * But we used task_cs() and its ->cpus_allowed locklessly and thus can
4022 	 * race with cgroup_attach_task() or update_cpumask() and get
4023 	 * the wrong tsk->cpus_allowed. However, both cases imply the
4024 	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
4025 	 * which takes task_rq_lock().
4026 	 *
4027 	 * If we are called after it dropped the lock we must see all
4028 	 * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
4029 	 * set any mask even if it is not right from task_cs()'s pov;
4030 	 * the pending set_cpus_allowed_ptr() will fix things.
4031 	 *
4032 	 * select_fallback_rq() will fix things up and fall back to cpu_possible_mask
4033 	 * if required.
4034 	 */
4035 	return changed;
4036 }
4037 
4038 void __init cpuset_init_current_mems_allowed(void)
4039 {
4040 	nodes_setall(current->mems_allowed);
4041 }
4042 
4043 /**
4044  * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
4045  * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
4046  *
4047  * Description: Returns the nodemask_t mems_allowed of the cpuset
4048  * attached to the specified @tsk.  Guaranteed to return some non-empty
4049  * subset of node_states[N_MEMORY], even if this means going outside the
4050  * task's cpuset.
4051  **/
4052 
4053 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
4054 {
4055 	nodemask_t mask;
4056 	unsigned long flags;
4057 
4058 	spin_lock_irqsave(&callback_lock, flags);
4059 	rcu_read_lock();
4060 	guarantee_online_mems(task_cs(tsk), &mask);
4061 	rcu_read_unlock();
4062 	spin_unlock_irqrestore(&callback_lock, flags);
4063 
4064 	return mask;
4065 }
4066 
4067 /**
4068  * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
4069  * @nodemask: the nodemask to be checked
4070  *
4071  * Are any of the nodes in the nodemask allowed in current->mems_allowed?
4072  */
4073 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
4074 {
4075 	return nodes_intersects(*nodemask, current->mems_allowed);
4076 }
4077 
4078 /*
4079  * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
4080  * mem_hardwall ancestor to the specified cpuset.  Call holding
4081  * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
4082  * (an unusual configuration), then returns the root cpuset.
4083  */
4084 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
4085 {
4086 	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
4087 		cs = parent_cs(cs);
4088 	return cs;
4089 }
4090 
4091 /*
4092  * cpuset_node_allowed - Can we allocate on a memory node?
4093  * @node: is this an allowed node?
4094  * @gfp_mask: memory allocation flags
4095  *
4096  * If we're in interrupt, yes, we can always allocate.  If @node is set in
4097  * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
4098  * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
4099  * yes.  If current has access to memory reserves as an oom victim, yes.
4100  * Otherwise, no.
4101  *
4102  * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
4103  * and do not allow allocations outside the current task's cpuset
4104  * unless the task has been OOM killed.
4105  * GFP_KERNEL allocations are not so marked, so can escape to the
4106  * nearest enclosing hardwalled ancestor cpuset.
4107  *
4108  * Scanning up parent cpusets requires callback_lock.  The
4109  * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
4110  * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
4111  * current task's mems_allowed came up empty on the first pass over
4112  * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
4113  * cpuset are short of memory, might require taking the callback_lock.
4114  *
4115  * The first call here from mm/page_alloc:get_page_from_freelist()
4116  * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
4117  * so no allocation on a node outside the cpuset is allowed (unless
4118  * in interrupt, of course).
4119  *
4120  * The second pass through get_page_from_freelist() doesn't even call
4121  * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
4122  * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
4123  * in alloc_flags.  That logic and the checks below have the combined
4124  * effect that:
4125  *	in_interrupt - any node ok (current task context irrelevant)
4126  *	GFP_ATOMIC   - any node ok
4127  *	tsk_is_oom_victim   - any node ok
4128  *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
4129  *	GFP_USER     - only nodes in the current task's mems_allowed ok.
4130  */
4131 bool cpuset_node_allowed(int node, gfp_t gfp_mask)
4132 {
4133 	struct cpuset *cs;		/* current cpuset ancestors */
4134 	bool allowed;			/* is allocation on the node allowed? */
4135 	unsigned long flags;
4136 
4137 	if (in_interrupt())
4138 		return true;
4139 	if (node_isset(node, current->mems_allowed))
4140 		return true;
4141 	/*
4142 	 * Allow tasks that have access to memory reserves because they have
4143 	 * been OOM killed to get memory anywhere.
4144 	 */
4145 	if (unlikely(tsk_is_oom_victim(current)))
4146 		return true;
4147 	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
4148 		return false;
4149 
4150 	if (current->flags & PF_EXITING) /* Let dying task have memory */
4151 		return true;
4152 
4153 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
4154 	spin_lock_irqsave(&callback_lock, flags);
4155 
4156 	rcu_read_lock();
4157 	cs = nearest_hardwall_ancestor(task_cs(current));
4158 	allowed = node_isset(node, cs->mems_allowed);
4159 	rcu_read_unlock();
4160 
4161 	spin_unlock_irqrestore(&callback_lock, flags);
4162 	return allowed;
4163 }
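
/*
 * The hardwalled ancestor that the scan above looks for corresponds to the
 * v1 cpuset.mem_hardwall (or cpuset.mem_exclusive) knob.  A minimal
 * userspace sketch, not part of this file, assuming a v1 cpuset hierarchy
 * mounted at /sys/fs/cgroup/cpuset with a made-up child named "jail":
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static void write_str(const char *path, const char *val)
 *	{
 *		int fd = open(path, O_WRONLY);
 *
 *		if (fd < 0) {
 *			perror(path);
 *			return;
 *		}
 *		if (write(fd, val, strlen(val)) < 0)
 *			perror(path);
 *		close(fd);
 *	}
 *
 *	int main(void)
 *	{
 *		// With mem_hardwall set, GFP_KERNEL allocations by tasks in
 *		// "jail" stop at this cpuset when scanning up for an allowed
 *		// node instead of escaping to the root cpuset.
 *		write_str("/sys/fs/cgroup/cpuset/jail/cpuset.mems", "0");
 *		write_str("/sys/fs/cgroup/cpuset/jail/cpuset.mem_hardwall", "1");
 *		return 0;
 *	}
 */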
4164 
4165 /**
4166  * cpuset_spread_node() - On which node to begin search for a page
4167  * @rotor: round robin rotor
4168  *
4169  * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
4170  * tasks in a cpuset with is_spread_page or is_spread_slab set),
4171  * and if the memory allocation used cpuset_mem_spread_node()
4172  * to determine on which node to start looking, as it will for
4173  * certain page cache or slab cache pages such as used for file
4174  * system buffers and inode caches, then instead of always starting
4175  * the search for a free page on the local node, spread the starting
4176  * node around the task's mems_allowed nodes.
4177  *
4178  * We don't have to worry about the returned node being offline
4179  * because "it can't happen", and even if it did, it would be ok.
4180  *
4181  * The routines calling guarantee_online_mems() are careful to
4182  * only set nodes in task->mems_allowed that are online.  So it
4183  * should not be possible for the following code to return an
4184  * offline node.  But if it did, that would be ok, as this routine
4185  * is not returning the node where the allocation must be, only
4186  * the node where the search should start.  The zonelist passed to
4187  * __alloc_pages() will include all nodes.  If the slab allocator
4188  * is passed an offline node, it will fall back to the local node.
4189  * See kmem_cache_alloc_node().
4190  */
4191 static int cpuset_spread_node(int *rotor)
4192 {
4193 	return *rotor = next_node_in(*rotor, current->mems_allowed);
4194 }
4195 
4196 /**
4197  * cpuset_mem_spread_node() - On which node to begin search for a file page
4198  */
4199 int cpuset_mem_spread_node(void)
4200 {
4201 	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
4202 		current->cpuset_mem_spread_rotor =
4203 			node_random(&current->mems_allowed);
4204 
4205 	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
4206 }
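
/*
 * PF_SPREAD_PAGE is set on tasks of a cpuset whose v1
 * cpuset.memory_spread_page knob is enabled, which is what routes page
 * cache allocations through the rotor above.  A minimal userspace sketch,
 * not part of this file, assuming a v1 cpuset hierarchy mounted at
 * /sys/fs/cgroup/cpuset with a made-up child named "spread":
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		const char *path =
 *			"/sys/fs/cgroup/cpuset/spread/cpuset.memory_spread_page";
 *		int fd = open(path, O_WRONLY);
 *
 *		if (fd < 0) {
 *			perror(path);
 *			return 1;
 *		}
 *		// Page cache pages for tasks in "spread" will now begin their
 *		// node search at cpuset_mem_spread_node() rather than on the
 *		// local node.
 *		if (write(fd, "1", 1) < 0)
 *			perror(path);
 *		close(fd);
 *		return 0;
 *	}
 */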
4207 
4208 /**
4209  * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
4210  * @tsk1: pointer to task_struct of some task.
4211  * @tsk2: pointer to task_struct of some other task.
4212  *
4213  * Description: Return true if @tsk1's mems_allowed intersects the
4214  * mems_allowed of @tsk2.  Used by the OOM killer to determine if
4215  * one of the task's memory usage might impact the memory available
4216  * to the other.
4217  **/
4218 
4219 int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
4220 				   const struct task_struct *tsk2)
4221 {
4222 	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
4223 }
4224 
4225 /**
4226  * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
4227  *
4228  * Description: Prints current's name, cpuset name, and cached copy of its
4229  * mems_allowed to the kernel log.
4230  */
4231 void cpuset_print_current_mems_allowed(void)
4232 {
4233 	struct cgroup *cgrp;
4234 
4235 	rcu_read_lock();
4236 
4237 	cgrp = task_cs(current)->css.cgroup;
4238 	pr_cont(",cpuset=");
4239 	pr_cont_cgroup_name(cgrp);
4240 	pr_cont(",mems_allowed=%*pbl",
4241 		nodemask_pr_args(&current->mems_allowed));
4242 
4243 	rcu_read_unlock();
4244 }
4245 
4246 /* Display task mems_allowed in /proc/<pid>/status file. */
4247 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
4248 {
4249 	seq_printf(m, "Mems_allowed:\t%*pb\n",
4250 		   nodemask_pr_args(&task->mems_allowed));
4251 	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
4252 		   nodemask_pr_args(&task->mems_allowed));
4253 }
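
/*
 * The two seq_printf() calls above are what appear as the "Mems_allowed:"
 * and "Mems_allowed_list:" fields of /proc/<pid>/status.  A minimal sketch,
 * not part of this file, that dumps them for the calling task:
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	int main(void)
 *	{
 *		FILE *f = fopen("/proc/self/status", "r");
 *		char line[512];
 *
 *		if (!f) {
 *			perror("/proc/self/status");
 *			return 1;
 *		}
 *		while (fgets(line, sizeof(line), f))
 *			if (!strncmp(line, "Mems_allowed", 12))
 *				fputs(line, stdout);
 *		fclose(f);
 *		return 0;
 *	}
 */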
4254