xref: /linux/kernel/cgroup/cpuset-v1.c (revision 272bd8183376a9e20fe08bacbaa44003d7c8acaa)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 
3 #include "cgroup-internal.h"
4 #include "cpuset-internal.h"
5 
6 /*
7  * On the legacy hierarchy, cgroup_transfer_tasks() is handled asynchronously
8  */
9 struct cpuset_remove_tasks_struct {
10 	struct work_struct work;
11 	struct cpuset *cs;
12 };
13 
14 /*
15  * Frequency meter - How fast is some event occurring?
16  *
17  * These routines manage a digitally filtered, constant-time-based
18  * event frequency meter.  There are four routines:
19  *   fmeter_init() - initialize a frequency meter.
20  *   fmeter_markevent() - called each time the event happens.
21  *   fmeter_getrate() - returns the recent rate of such events.
22  *   fmeter_update() - internal routine used to update fmeter.
23  *
24  * A common data structure is passed to each of these routines,
25  * which is used to keep track of the state required to manage the
26  * frequency meter and its digital filter.
27  *
28  * The filter works on the number of events marked per unit time.
29  * The filter is single-pole low-pass recursive (IIR).  The time unit
30  * is 1 second.  Arithmetic is done using 32-bit integers scaled to
31  * simulate 3 decimal digits of precision (multiplied by 1000).
32  *
33  * With an FM_COEF of 933, and a time base of 1 second, the filter
34  * has a half-life of 10 seconds, meaning that if the events quit
35  * happening, then the rate returned by fmeter_getrate()
36  * will be cut in half every 10 seconds, until it converges to zero.
37  *
38  * It is not worth doing a real infinitely recursive filter.  If more
39  * than FM_MAXTICKS ticks have elapsed since the last filter event,
40  * just compute FM_MAXTICKS ticks worth, by which point the level
41  * will be stable.
42  *
43  * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
44  * arithmetic overflow in the fmeter_update() routine.
45  *
46  * Given the simple 32 bit integer arithmetic used, this meter works
47  * best for reporting rates between one per millisecond (msec) and
48  * one per 32 (approx) seconds.  At constant rates faster than one
49  * per msec it maxes out at values just under 1,000,000.  At constant
50  * rates between one per msec and one per second, it will stabilize
51  * to a value N*1000, where N is the rate of events per second.
52  * At constant rates between one per second and one per 32 seconds,
53  * it will be choppy, moving up on the seconds that have an event,
54  * and then decaying until the next event.  At rates slower than
55  * about one in 32 seconds, it decays all the way back to zero between
56  * each event.
57  */
58 
59 #define FM_COEF 933		/* coefficient for half-life of 10 secs */
60 #define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
61 #define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
62 #define FM_SCALE 1000		/* faux fixed point scale */
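/*
 * A minimal user-space sketch of the same filter arithmetic (illustrative
 * only, not part of the kernel build; the "demo_*" names are invented here):
 * feeding one event per simulated second converges demo_val to ~1000, and
 * stopping the events roughly halves it every 10 ticks, matching the
 * FM_COEF/FM_SCALE/half-life description above.
 *
 *	static unsigned int demo_val, demo_cnt;
 *
 *	static void demo_event(void)		// fmeter_markevent() analogue
 *	{
 *		demo_cnt += 1000;		// one event, scaled by FM_SCALE
 *	}
 *
 *	static void demo_tick(void)		// one elapsed second
 *	{
 *		demo_val = (933 * demo_val) / 1000;		// decay
 *		demo_val += ((1000 - 933) * demo_cnt) / 1000;	// mix in new events
 *		demo_cnt = 0;
 *	}
 */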
63 
64 /* Initialize a frequency meter */
65 static void fmeter_init(struct fmeter *fmp)
66 {
67 	fmp->cnt = 0;
68 	fmp->val = 0;
69 	fmp->time = 0;
70 	spin_lock_init(&fmp->lock);
71 }
72 
73 /* Internal meter update - process cnt events and update value */
74 static void fmeter_update(struct fmeter *fmp)
75 {
76 	time64_t now;
77 	u32 ticks;
78 
79 	now = ktime_get_seconds();
80 	ticks = now - fmp->time;
81 
82 	if (ticks == 0)
83 		return;
84 
85 	ticks = min(FM_MAXTICKS, ticks);
86 	while (ticks-- > 0)
87 		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
88 	fmp->time = now;
89 
90 	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
91 	fmp->cnt = 0;
92 }
93 
94 /* Process any previous ticks, then bump cnt by one (times scale). */
95 static void fmeter_markevent(struct fmeter *fmp)
96 {
97 	spin_lock(&fmp->lock);
98 	fmeter_update(fmp);
99 	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
100 	spin_unlock(&fmp->lock);
101 }
102 
103 /* Process any previous ticks, then return current value. */
104 static int fmeter_getrate(struct fmeter *fmp)
105 {
106 	int val;
107 
108 	spin_lock(&fmp->lock);
109 	fmeter_update(fmp);
110 	val = fmp->val;
111 	spin_unlock(&fmp->lock);
112 	return val;
113 }
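/*
 * Typical call pattern, as used later in this file (a usage sketch only):
 *
 *	struct fmeter fm;
 *
 *	fmeter_init(&fm);		// cpuset1_init(), when the cpuset is set up
 *	fmeter_markevent(&fm);		// __cpuset_memory_pressure_bump(), on direct reclaim
 *	int rate = fmeter_getrate(&fm);	// cpuset_read_u64(), when "memory_pressure" is read
 */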
114 
115 /*
116  * Collection of memory_pressure is suppressed unless
117  * this flag is enabled by writing "1" to the special
118  * cpuset file 'memory_pressure_enabled' in the root cpuset.
119  */
120 
121 int cpuset_memory_pressure_enabled __read_mostly;
122 
123 /*
124  * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
125  *
126  * Keep a running average of the rate of synchronous (direct)
127  * page reclaim efforts initiated by tasks in each cpuset.
128  *
129  * This represents the rate at which some task in the cpuset
130  * ran low on memory on all nodes it was allowed to use, and
131  * had to enter the kernel's page reclaim code in an effort to
132  * create more free memory by tossing clean pages or swapping
133  * or writing dirty pages.
134  *
135  * Displayed to user space in the per-cpuset read-only file
136  * "memory_pressure".  The value displayed is an integer
137  * representing the recent rate of entry into the synchronous
138  * (direct) page reclaim by any task attached to the cpuset.
139  */
140 
141 void __cpuset_memory_pressure_bump(void)
142 {
143 	rcu_read_lock();
144 	fmeter_markevent(&task_cs(current)->fmeter);
145 	rcu_read_unlock();
146 }
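/*
 * Note: callers are not expected to invoke this directly.  The
 * cpuset_memory_pressure_bump() wrapper (defined in include/linux/cpuset.h)
 * is expected to gate on the enable flag first, roughly:
 *
 *	#define cpuset_memory_pressure_bump()				\
 *		do {							\
 *			if (cpuset_memory_pressure_enabled)		\
 *				__cpuset_memory_pressure_bump();	\
 *		} while (0)
 *
 * so the fmeter is only updated once the root cpuset has enabled collection.
 */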
147 
148 static int update_relax_domain_level(struct cpuset *cs, s64 val)
149 {
150 #ifdef CONFIG_SMP
151 	if (val < -1 || val > sched_domain_level_max + 1)
152 		return -EINVAL;
153 #endif
154 
155 	if (val != cs->relax_domain_level) {
156 		cs->relax_domain_level = val;
157 		if (!cpumask_empty(cs->cpus_allowed) &&
158 		    is_sched_load_balance(cs))
159 			rebuild_sched_domains_locked();
160 	}
161 
162 	return 0;
163 }
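/*
 * Hypothetical user-space sketch exercising this path through
 * cpuset_write_s64() below (the v1 mount point is an assumption; a value
 * of -1 requests the system default level):
 *
 *	#include <stdio.h>
 *
 *	static int set_relax_level(const char *cs, int level)
 *	{
 *		char path[256];
 *		FILE *f;
 *
 *		snprintf(path, sizeof(path),
 *			 "/sys/fs/cgroup/cpuset/%s/cpuset.sched_relax_domain_level",
 *			 cs);
 *		f = fopen(path, "w");
 *		if (!f)
 *			return -1;
 *		fprintf(f, "%d\n", level);
 *		return fclose(f);
 *	}
 */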
164 
165 static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
166 			    s64 val)
167 {
168 	struct cpuset *cs = css_cs(css);
169 	cpuset_filetype_t type = cft->private;
170 	int retval = -ENODEV;
171 
172 	cpuset_full_lock();
173 	if (!is_cpuset_online(cs))
174 		goto out_unlock;
175 
176 	switch (type) {
177 	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
178 		pr_info_once("cpuset.%s is deprecated\n", cft->name);
179 		retval = update_relax_domain_level(cs, val);
180 		break;
181 	default:
182 		retval = -EINVAL;
183 		break;
184 	}
185 out_unlock:
186 	cpuset_full_unlock();
187 	return retval;
188 }
189 
190 static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
191 {
192 	struct cpuset *cs = css_cs(css);
193 	cpuset_filetype_t type = cft->private;
194 
195 	switch (type) {
196 	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
197 		return cs->relax_domain_level;
198 	default:
199 		BUG();
200 	}
201 
202 	/* Unreachable but makes gcc happy */
203 	return 0;
204 }
205 
206 /*
207  * Update the task's page/slab spread flags to match the cpuset's
208  *
209  * Call with callback_lock or cpuset_mutex held. The check can be skipped
210  * if on default hierarchy.
211  */
212 void cpuset1_update_task_spread_flags(struct cpuset *cs,
213 					struct task_struct *tsk)
214 {
215 	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
216 		return;
217 
218 	if (is_spread_page(cs))
219 		task_set_spread_page(tsk);
220 	else
221 		task_clear_spread_page(tsk);
222 
223 	if (is_spread_slab(cs))
224 		task_set_spread_slab(tsk);
225 	else
226 		task_clear_spread_slab(tsk);
227 }
228 
229 /**
230  * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
231  * @cs: the cpuset in which each task's spread flags need to be changed
232  *
233  * Iterate through each task of @cs updating its spread flags.  As this
234  * function is called with cpuset_mutex held, cpuset membership stays
235  * stable.
236  */
237 void cpuset1_update_tasks_flags(struct cpuset *cs)
238 {
239 	struct css_task_iter it;
240 	struct task_struct *task;
241 
242 	css_task_iter_start(&cs->css, 0, &it);
243 	while ((task = css_task_iter_next(&it)))
244 		cpuset1_update_task_spread_flags(cs, task);
245 	css_task_iter_end(&it);
246 }
247 
248 /*
249  * If the CPU and/or memory hotplug handlers unplug any CPUs
250  * or memory nodes, we need to walk over the cpuset hierarchy,
251  * removing that CPU or node from all cpusets.  If this removes the
252  * last CPU or node from a cpuset, then move the tasks in the empty
253  * cpuset to its next-highest non-empty parent.
254  */
255 static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
256 {
257 	struct cpuset *parent;
258 
259 	/*
260 	 * Find its next-highest non-empty parent (the top cpuset
261 	 * has online cpus, so it can't be empty).
262 	 */
263 	parent = parent_cs(cs);
264 	while (cpumask_empty(parent->cpus_allowed) ||
265 			nodes_empty(parent->mems_allowed))
266 		parent = parent_cs(parent);
267 
268 	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
269 		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
270 		pr_cont_cgroup_name(cs->css.cgroup);
271 		pr_cont("\n");
272 	}
273 }
274 
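/*
 * Deferred work body for the asynchronous task migration scheduled from
 * cpuset1_hotplug_update_tasks(); the css reference taken there with
 * css_tryget_online() is dropped here once the transfer has been attempted.
 */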
275 static void cpuset_migrate_tasks_workfn(struct work_struct *work)
276 {
277 	struct cpuset_remove_tasks_struct *s;
278 
279 	s = container_of(work, struct cpuset_remove_tasks_struct, work);
280 	remove_tasks_in_empty_cpuset(s->cs);
281 	css_put(&s->cs->css);
282 	kfree(s);
283 }
284 
285 void cpuset1_hotplug_update_tasks(struct cpuset *cs,
286 			    struct cpumask *new_cpus, nodemask_t *new_mems,
287 			    bool cpus_updated, bool mems_updated)
288 {
289 	bool is_empty;
290 
291 	cpuset_callback_lock_irq();
292 	cpumask_copy(cs->cpus_allowed, new_cpus);
293 	cpumask_copy(cs->effective_cpus, new_cpus);
294 	cs->mems_allowed = *new_mems;
295 	cs->effective_mems = *new_mems;
296 	cpuset_callback_unlock_irq();
297 
298 	/*
299 	 * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
300 	 * as the tasks will be migrated to an ancestor.
301 	 */
302 	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
303 		cpuset_update_tasks_cpumask(cs, new_cpus);
304 	if (mems_updated && !nodes_empty(cs->mems_allowed))
305 		cpuset_update_tasks_nodemask(cs);
306 
307 	is_empty = cpumask_empty(cs->cpus_allowed) ||
308 		   nodes_empty(cs->mems_allowed);
309 
310 	/*
311 	 * Move tasks to the nearest ancestor with execution resources.
312 	 * This is a full cgroup operation which will also call back into
313 	 * cpuset. Execute it asynchronously via a workqueue.
314 	 */
315 	if (is_empty && cs->css.cgroup->nr_populated_csets &&
316 	    css_tryget_online(&cs->css)) {
317 		struct cpuset_remove_tasks_struct *s;
318 
319 		s = kzalloc(sizeof(*s), GFP_KERNEL);
320 		if (WARN_ON_ONCE(!s)) {
321 			css_put(&cs->css);
322 			return;
323 		}
324 
325 		s->cs = cs;
326 		INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
327 		schedule_work(&s->work);
328 	}
329 }
330 
331 /*
332  * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
333  *
334  * One cpuset is a subset of another if all its allowed CPUs and
335  * Memory Nodes are a subset of the other, and its exclusive flags
336  * are only set if the other's are set.  Call holding cpuset_mutex.
337  */
338 
339 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
340 {
341 	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
342 		nodes_subset(p->mems_allowed, q->mems_allowed) &&
343 		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
344 		is_mem_exclusive(p) <= is_mem_exclusive(q);
345 }
346 
347 /*
348  * cpuset1_validate_change() - Validate conditions specific to legacy (v1)
349  *                            behavior.
350  */
351 int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
352 {
353 	struct cgroup_subsys_state *css;
354 	struct cpuset *c, *par;
355 	int ret;
356 
357 	WARN_ON_ONCE(!rcu_read_lock_held());
358 
359 	/* Each of our child cpusets must be a subset of us */
360 	ret = -EBUSY;
361 	cpuset_for_each_child(c, css, cur)
362 		if (!is_cpuset_subset(c, trial))
363 			goto out;
364 
365 	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
366 	ret = -EACCES;
367 	par = parent_cs(cur);
368 	if (par && !is_cpuset_subset(trial, par))
369 		goto out;
370 
371 	/*
372 	 * Cpusets with tasks - existing or newly being attached - can't
373 	 * be changed to have empty cpus_allowed or mems_allowed.
374 	 */
375 	ret = -ENOSPC;
376 	if (cpuset_is_populated(cur)) {
377 		if (!cpumask_empty(cur->cpus_allowed) &&
378 		    cpumask_empty(trial->cpus_allowed))
379 			goto out;
380 		if (!nodes_empty(cur->mems_allowed) &&
381 		    nodes_empty(trial->mems_allowed))
382 			goto out;
383 	}
384 
385 	ret = 0;
386 out:
387 	return ret;
388 }
389 
390 /*
391  * cpuset1_cpus_excl_conflict() - Check if two cpusets have an exclusive-CPU
392  *                                conflict under legacy (v1) semantics
393  * @cs1: first cpuset to check
394  * @cs2: second cpuset to check
395  *
396  * Returns: true if CPU exclusivity conflict exists, false otherwise
397  *
398  * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect.
399  */
400 bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
401 {
402 	if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
403 		return cpumask_intersects(cs1->cpus_allowed,
404 					  cs2->cpus_allowed);
405 
406 	return false;
407 }
408 
409 #ifdef CONFIG_PROC_PID_CPUSET
410 /*
411  * proc_cpuset_show()
412  *  - Print the task's cpuset path into the seq_file.
413  *  - Used for /proc/<pid>/cpuset.
414  */
415 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
416 		     struct pid *pid, struct task_struct *tsk)
417 {
418 	char *buf;
419 	struct cgroup_subsys_state *css;
420 	int retval;
421 
422 	retval = -ENOMEM;
423 	buf = kmalloc(PATH_MAX, GFP_KERNEL);
424 	if (!buf)
425 		goto out;
426 
427 	rcu_read_lock();
428 	spin_lock_irq(&css_set_lock);
429 	css = task_css(tsk, cpuset_cgrp_id);
430 	retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
431 				       current->nsproxy->cgroup_ns);
432 	spin_unlock_irq(&css_set_lock);
433 	rcu_read_unlock();
434 
435 	if (retval == -E2BIG)
436 		retval = -ENAMETOOLONG;
437 	if (retval < 0)
438 		goto out_free;
439 	seq_puts(m, buf);
440 	seq_putc(m, '\n');
441 	retval = 0;
442 out_free:
443 	kfree(buf);
444 out:
445 	return retval;
446 }
447 #endif /* CONFIG_PROC_PID_CPUSET */
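/*
 * A minimal user-space sketch of consuming this interface (illustrative
 * only; any process can read its own entry via /proc/self/cpuset):
 *
 *	#include <limits.h>
 *	#include <stdio.h>
 *
 *	static int print_own_cpuset(void)
 *	{
 *		char path[PATH_MAX];
 *		FILE *f = fopen("/proc/self/cpuset", "r");
 *		int ok = f && fgets(path, sizeof(path), f);
 *
 *		if (f)
 *			fclose(f);
 *		if (!ok)
 *			return -1;
 *		printf("running in cpuset %s", path);	// already '\n' terminated
 *		return 0;
 *	}
 */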
448 
449 static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
450 {
451 	struct cpuset *cs = css_cs(css);
452 	cpuset_filetype_t type = cft->private;
453 
454 	switch (type) {
455 	case FILE_CPU_EXCLUSIVE:
456 		return is_cpu_exclusive(cs);
457 	case FILE_MEM_EXCLUSIVE:
458 		return is_mem_exclusive(cs);
459 	case FILE_MEM_HARDWALL:
460 		return is_mem_hardwall(cs);
461 	case FILE_SCHED_LOAD_BALANCE:
462 		return is_sched_load_balance(cs);
463 	case FILE_MEMORY_MIGRATE:
464 		return is_memory_migrate(cs);
465 	case FILE_MEMORY_PRESSURE_ENABLED:
466 		return cpuset_memory_pressure_enabled;
467 	case FILE_MEMORY_PRESSURE:
468 		return fmeter_getrate(&cs->fmeter);
469 	case FILE_SPREAD_PAGE:
470 		return is_spread_page(cs);
471 	case FILE_SPREAD_SLAB:
472 		return is_spread_slab(cs);
473 	default:
474 		BUG();
475 	}
476 
477 	/* Unreachable but makes gcc happy */
478 	return 0;
479 }
480 
481 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
482 			    u64 val)
483 {
484 	struct cpuset *cs = css_cs(css);
485 	cpuset_filetype_t type = cft->private;
486 	int retval = 0;
487 
488 	cpuset_full_lock();
489 	if (!is_cpuset_online(cs)) {
490 		retval = -ENODEV;
491 		goto out_unlock;
492 	}
493 
494 	switch (type) {
495 	case FILE_CPU_EXCLUSIVE:
496 		retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
497 		break;
498 	case FILE_MEM_EXCLUSIVE:
499 		pr_info_once("cpuset.%s is deprecated\n", cft->name);
500 		retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
501 		break;
502 	case FILE_MEM_HARDWALL:
503 		pr_info_once("cpuset.%s is deprecated\n", cft->name);
504 		retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
505 		break;
506 	case FILE_SCHED_LOAD_BALANCE:
507 		pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name);
508 		retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
509 		break;
510 	case FILE_MEMORY_MIGRATE:
511 		pr_info_once("cpuset.%s is deprecated\n", cft->name);
512 		retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
513 		break;
514 	case FILE_MEMORY_PRESSURE_ENABLED:
515 		pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name);
516 		cpuset_memory_pressure_enabled = !!val;
517 		break;
518 	case FILE_SPREAD_PAGE:
519 		pr_info_once("cpuset.%s is deprecated\n", cft->name);
520 		retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
521 		break;
522 	case FILE_SPREAD_SLAB:
523 		pr_warn_once("cpuset.%s is deprecated\n", cft->name);
524 		retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
525 		break;
526 	default:
527 		retval = -EINVAL;
528 		break;
529 	}
530 out_unlock:
531 	cpuset_full_unlock();
532 	return retval;
533 }
534 
535 void cpuset1_init(struct cpuset *cs)
536 {
537 	fmeter_init(&cs->fmeter);
538 	cs->relax_domain_level = -1;
539 }
540 
541 void cpuset1_online_css(struct cgroup_subsys_state *css)
542 {
543 	struct cpuset *tmp_cs;
544 	struct cgroup_subsys_state *pos_css;
545 	struct cpuset *cs = css_cs(css);
546 	struct cpuset *parent = parent_cs(cs);
547 
548 	lockdep_assert_cpus_held();
549 	lockdep_assert_cpuset_lock_held();
550 
551 	if (is_spread_page(parent))
552 		set_bit(CS_SPREAD_PAGE, &cs->flags);
553 	if (is_spread_slab(parent))
554 		set_bit(CS_SPREAD_SLAB, &cs->flags);
555 
556 	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
557 		return;
558 
559 	/*
560 	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
561 	 * set.  This flag handling is implemented in cgroup core for
562 	 * historical reasons - the flag may be specified during mount.
563 	 *
564 	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
565 	 * refuse to clone the configuration - thereby refusing the task to
566 	 * be entered, and as a result refusing the sys_unshare() or
567 	 * clone() which initiated it.  If this becomes a problem for some
568 	 * users who wish to allow that scenario, then this could be
569 	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
570 	 * (and likewise for mems) to the new cgroup.
571 	 */
572 	rcu_read_lock();
573 	cpuset_for_each_child(tmp_cs, pos_css, parent) {
574 		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
575 			rcu_read_unlock();
576 			return;
577 		}
578 	}
579 	rcu_read_unlock();
580 
581 	cpuset_callback_lock_irq();
582 	cs->mems_allowed = parent->mems_allowed;
583 	cs->effective_mems = parent->mems_allowed;
584 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
585 	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
586 	cpuset_callback_unlock_irq();
587 }
588 
589 static void
590 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
591 {
592 	if (dattr->relax_domain_level < c->relax_domain_level)
593 		dattr->relax_domain_level = c->relax_domain_level;
594 }
595 
596 static void update_domain_attr_tree(struct sched_domain_attr *dattr,
597 				    struct cpuset *root_cs)
598 {
599 	struct cpuset *cp;
600 	struct cgroup_subsys_state *pos_css;
601 
602 	rcu_read_lock();
603 	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
604 		/* skip the whole subtree if @cp doesn't have any CPU */
605 		if (cpumask_empty(cp->cpus_allowed)) {
606 			pos_css = css_rightmost_descendant(pos_css);
607 			continue;
608 		}
609 
610 		if (is_sched_load_balance(cp))
611 			update_domain_attr(dattr, cp);
612 	}
613 	rcu_read_unlock();
614 }
615 
616 /*
617  * cpuset1_generate_sched_domains()
618  *
619  * Finding the best partition (set of domains):
620  *	The double nested loops below over i, j scan over the load
621  *	balanced cpusets (using the array of cpuset pointers in csa[])
622  *	looking for pairs of cpusets that have overlapping cpus_allowed
623  *	and merging them using a union-find algorithm.
624  *
625  *	The union of the cpus_allowed masks from the set of all cpusets
626  *	having the same root then forms one element of the partition
627  *	(one sched domain) to be passed to partition_sched_domains().
628  */
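/*
 * Worked example (illustrative): three load-balanced sibling cpusets with
 * effective_cpus of {0-1}, {1-2} and {4-5}.  The first two overlap, so
 * uf_union() merges them under one root while the third remains its own
 * root; uf_find() therefore reports two roots, ndoms == 2, and the
 * resulting sched domains cover {0-2} and {4-5}.
 */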
629 int cpuset1_generate_sched_domains(cpumask_var_t **domains,
630 			struct sched_domain_attr **attributes)
631 {
632 	struct cpuset *cp;	/* top-down scan of cpusets */
633 	struct cpuset **csa;	/* array of all cpuset ptrs */
634 	int csn;		/* how many cpuset ptrs in csa so far */
635 	int i, j;		/* indices for partition finding loops */
636 	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
637 	struct sched_domain_attr *dattr;  /* attributes for custom domains */
638 	int ndoms = 0;		/* number of sched domains in result */
639 	int nslot;		/* next empty doms[] struct cpumask slot */
640 	struct cgroup_subsys_state *pos_css;
641 	int nslot_update;
642 
643 	lockdep_assert_cpuset_lock_held();
644 
645 	doms = NULL;
646 	dattr = NULL;
647 	csa = NULL;
648 
649 	/* Special case for the 99% of systems with one, full, sched domain */
650 	if (is_sched_load_balance(&top_cpuset)) {
651 		ndoms = 1;
652 		doms = alloc_sched_domains(ndoms);
653 		if (!doms)
654 			goto done;
655 
656 		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
657 		if (dattr) {
658 			*dattr = SD_ATTR_INIT;
659 			update_domain_attr_tree(dattr, &top_cpuset);
660 		}
661 		cpumask_and(doms[0], top_cpuset.effective_cpus,
662 			    housekeeping_cpumask(HK_TYPE_DOMAIN));
663 
664 		goto done;
665 	}
666 
667 	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
668 	if (!csa)
669 		goto done;
670 	csn = 0;
671 
672 	rcu_read_lock();
673 	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
674 		if (cp == &top_cpuset)
675 			continue;
676 
677 		/*
678 		 * Continue traversing beyond @cp iff @cp has some CPUs and
679 		 * isn't load balancing.  The former is obvious.  The
680 		 * latter: All child cpusets contain a subset of the
681 		 * parent's cpus, so just skip them, and then we call
682 		 * update_domain_attr_tree() to calc relax_domain_level of
683 		 * the corresponding sched domain.
684 		 */
685 		if (!cpumask_empty(cp->cpus_allowed) &&
686 		    !(is_sched_load_balance(cp) &&
687 		      cpumask_intersects(cp->cpus_allowed,
688 					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
689 			continue;
690 
691 		if (is_sched_load_balance(cp) &&
692 		    !cpumask_empty(cp->effective_cpus))
693 			csa[csn++] = cp;
694 
695 		/* skip @cp's subtree */
696 		pos_css = css_rightmost_descendant(pos_css);
697 		continue;
698 	}
699 	rcu_read_unlock();
700 
701 	for (i = 0; i < csn; i++)
702 		uf_node_init(&csa[i]->node);
703 
704 	/* Merge overlapping cpusets */
705 	for (i = 0; i < csn; i++) {
706 		for (j = i + 1; j < csn; j++) {
707 			if (cpusets_overlap(csa[i], csa[j]))
708 				uf_union(&csa[i]->node, &csa[j]->node);
709 		}
710 	}
711 
712 	/* Count the total number of domains */
713 	for (i = 0; i < csn; i++) {
714 		if (uf_find(&csa[i]->node) == &csa[i]->node)
715 			ndoms++;
716 	}
717 
718 	/*
719 	 * Now we know how many domains to create.
720 	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
721 	 */
722 	doms = alloc_sched_domains(ndoms);
723 	if (!doms)
724 		goto done;
725 
726 	/*
727 	 * The rest of the code, including the scheduler, can deal with
728 	 * dattr==NULL case. No need to abort if alloc fails.
729 	 */
730 	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
731 			      GFP_KERNEL);
732 
733 	for (nslot = 0, i = 0; i < csn; i++) {
734 		nslot_update = 0;
735 		for (j = i; j < csn; j++) {
736 			if (uf_find(&csa[j]->node) == &csa[i]->node) {
737 				struct cpumask *dp = doms[nslot];
738 
739 				if (i == j) {
740 					nslot_update = 1;
741 					cpumask_clear(dp);
742 					if (dattr)
743 						*(dattr + nslot) = SD_ATTR_INIT;
744 				}
745 				cpumask_or(dp, dp, csa[j]->effective_cpus);
746 				cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
747 				if (dattr)
748 					update_domain_attr_tree(dattr + nslot, csa[j]);
749 			}
750 		}
751 		if (nslot_update)
752 			nslot++;
753 	}
754 	BUG_ON(nslot != ndoms);
755 
756 done:
757 	kfree(csa);
758 
759 	/*
760 	 * Fall back to the default domain if kmalloc() failed.
761 	 * See comments in partition_sched_domains().
762 	 */
763 	if (doms == NULL)
764 		ndoms = 1;
765 
766 	*domains    = doms;
767 	*attributes = dattr;
768 	return ndoms;
769 }
770 
771 /*
772  * for the common functions, 'private' gives the type of file
773  */
774 
775 struct cftype cpuset1_files[] = {
776 	{
777 		.name = "cpus",
778 		.seq_show = cpuset_common_seq_show,
779 		.write = cpuset_write_resmask,
780 		.max_write_len = (100U + 6 * NR_CPUS),
781 		.private = FILE_CPULIST,
782 	},
783 
784 	{
785 		.name = "mems",
786 		.seq_show = cpuset_common_seq_show,
787 		.write = cpuset_write_resmask,
788 		.max_write_len = (100U + 6 * MAX_NUMNODES),
789 		.private = FILE_MEMLIST,
790 	},
791 
792 	{
793 		.name = "effective_cpus",
794 		.seq_show = cpuset_common_seq_show,
795 		.private = FILE_EFFECTIVE_CPULIST,
796 	},
797 
798 	{
799 		.name = "effective_mems",
800 		.seq_show = cpuset_common_seq_show,
801 		.private = FILE_EFFECTIVE_MEMLIST,
802 	},
803 
804 	{
805 		.name = "cpu_exclusive",
806 		.read_u64 = cpuset_read_u64,
807 		.write_u64 = cpuset_write_u64,
808 		.private = FILE_CPU_EXCLUSIVE,
809 	},
810 
811 	{
812 		.name = "mem_exclusive",
813 		.read_u64 = cpuset_read_u64,
814 		.write_u64 = cpuset_write_u64,
815 		.private = FILE_MEM_EXCLUSIVE,
816 	},
817 
818 	{
819 		.name = "mem_hardwall",
820 		.read_u64 = cpuset_read_u64,
821 		.write_u64 = cpuset_write_u64,
822 		.private = FILE_MEM_HARDWALL,
823 	},
824 
825 	{
826 		.name = "sched_load_balance",
827 		.read_u64 = cpuset_read_u64,
828 		.write_u64 = cpuset_write_u64,
829 		.private = FILE_SCHED_LOAD_BALANCE,
830 	},
831 
832 	{
833 		.name = "sched_relax_domain_level",
834 		.read_s64 = cpuset_read_s64,
835 		.write_s64 = cpuset_write_s64,
836 		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
837 	},
838 
839 	{
840 		.name = "memory_migrate",
841 		.read_u64 = cpuset_read_u64,
842 		.write_u64 = cpuset_write_u64,
843 		.private = FILE_MEMORY_MIGRATE,
844 	},
845 
846 	{
847 		.name = "memory_pressure",
848 		.read_u64 = cpuset_read_u64,
849 		.private = FILE_MEMORY_PRESSURE,
850 	},
851 
852 	{
853 		.name = "memory_spread_page",
854 		.read_u64 = cpuset_read_u64,
855 		.write_u64 = cpuset_write_u64,
856 		.private = FILE_SPREAD_PAGE,
857 	},
858 
859 	{
860 		/* obsolete, may be removed in the future */
861 		.name = "memory_spread_slab",
862 		.read_u64 = cpuset_read_u64,
863 		.write_u64 = cpuset_write_u64,
864 		.private = FILE_SPREAD_SLAB,
865 	},
866 
867 	{
868 		.name = "memory_pressure_enabled",
869 		.flags = CFTYPE_ONLY_ON_ROOT,
870 		.read_u64 = cpuset_read_u64,
871 		.write_u64 = cpuset_write_u64,
872 		.private = FILE_MEMORY_PRESSURE_ENABLED,
873 	},
874 
875 	{ }	/* terminate */
876 };
877