1 // SPDX-License-Identifier: GPL-2.0-or-later
2
3 #include "cgroup-internal.h"
4 #include "cpuset-internal.h"
5
/*
 * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
 */
struct cpuset_remove_tasks_struct {
	struct work_struct work;	/* runs cpuset_migrate_tasks_workfn() */
	struct cpuset *cs;		/* emptied cpuset; holds a css reference
					 * taken before the work was queued */
};
13
14 /*
15 * Frequency meter - How fast is some event occurring?
16 *
17 * These routines manage a digitally filtered, constant time based,
18 * event frequency meter. There are four routines:
19 * fmeter_init() - initialize a frequency meter.
20 * fmeter_markevent() - called each time the event happens.
21 * fmeter_getrate() - returns the recent rate of such events.
22 * fmeter_update() - internal routine used to update fmeter.
23 *
24 * A common data structure is passed to each of these routines,
25 * which is used to keep track of the state required to manage the
26 * frequency meter and its digital filter.
27 *
28 * The filter works on the number of events marked per unit time.
29 * The filter is single-pole low-pass recursive (IIR). The time unit
30 * is 1 second. Arithmetic is done using 32-bit integers scaled to
31 * simulate 3 decimal digits of precision (multiplied by 1000).
32 *
33 * With an FM_COEF of 933, and a time base of 1 second, the filter
34 * has a half-life of 10 seconds, meaning that if the events quit
35 * happening, then the rate returned from the fmeter_getrate()
36 * will be cut in half each 10 seconds, until it converges to zero.
37 *
38 * It is not worth doing a real infinitely recursive filter. If more
39 * than FM_MAXTICKS ticks have elapsed since the last filter event,
40 * just compute FM_MAXTICKS ticks worth, by which point the level
41 * will be stable.
42 *
43 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
44 * arithmetic overflow in the fmeter_update() routine.
45 *
46 * Given the simple 32 bit integer arithmetic used, this meter works
47 * best for reporting rates between one per millisecond (msec) and
48 * one per 32 (approx) seconds. At constant rates faster than one
49 * per msec it maxes out at values just under 1,000,000. At constant
50 * rates between one per msec, and one per second it will stabilize
51 * to a value N*1000, where N is the rate of events per second.
52 * At constant rates between one per second and one per 32 seconds,
53 * it will be choppy, moving up on the seconds that have an event,
54 * and then decaying until the next event. At rates slower than
55 * about one in 32 seconds, it decays all the way back to zero between
56 * each event.
57 */
58
59 #define FM_COEF 933 /* coefficient for half-life of 10 secs */
60 #define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
61 #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
62 #define FM_SCALE 1000 /* faux fixed point scale */
63
64 /* Initialize a frequency meter */
fmeter_init(struct fmeter * fmp)65 static void fmeter_init(struct fmeter *fmp)
66 {
67 fmp->cnt = 0;
68 fmp->val = 0;
69 fmp->time = 0;
70 spin_lock_init(&fmp->lock);
71 }
72
73 /* Internal meter update - process cnt events and update value */
fmeter_update(struct fmeter * fmp)74 static void fmeter_update(struct fmeter *fmp)
75 {
76 time64_t now;
77 u32 ticks;
78
79 now = ktime_get_seconds();
80 ticks = now - fmp->time;
81
82 if (ticks == 0)
83 return;
84
85 ticks = min(FM_MAXTICKS, ticks);
86 while (ticks-- > 0)
87 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
88 fmp->time = now;
89
90 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
91 fmp->cnt = 0;
92 }
93
94 /* Process any previous ticks, then bump cnt by one (times scale). */
fmeter_markevent(struct fmeter * fmp)95 static void fmeter_markevent(struct fmeter *fmp)
96 {
97 spin_lock(&fmp->lock);
98 fmeter_update(fmp);
99 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
100 spin_unlock(&fmp->lock);
101 }
102
103 /* Process any previous ticks, then return current value. */
fmeter_getrate(struct fmeter * fmp)104 static int fmeter_getrate(struct fmeter *fmp)
105 {
106 int val;
107
108 spin_lock(&fmp->lock);
109 fmeter_update(fmp);
110 val = fmp->val;
111 spin_unlock(&fmp->lock);
112 return val;
113 }
114
115 /*
116 * Collection of memory_pressure is suppressed unless
117 * this flag is enabled by writing "1" to the special
118 * cpuset file 'memory_pressure_enabled' in the root cpuset.
119 */
120
121 int cpuset_memory_pressure_enabled __read_mostly;
122
123 /*
124 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
125 *
126 * Keep a running average of the rate of synchronous (direct)
127 * page reclaim efforts initiated by tasks in each cpuset.
128 *
129 * This represents the rate at which some task in the cpuset
130 * ran low on memory on all nodes it was allowed to use, and
131 * had to enter the kernels page reclaim code in an effort to
132 * create more free memory by tossing clean pages or swapping
133 * or writing dirty pages.
134 *
135 * Display to user space in the per-cpuset read-only file
136 * "memory_pressure". Value displayed is an integer
137 * representing the recent rate of entry into the synchronous
138 * (direct) page reclaim by any task attached to the cpuset.
139 */
140
__cpuset_memory_pressure_bump(void)141 void __cpuset_memory_pressure_bump(void)
142 {
143 rcu_read_lock();
144 fmeter_markevent(&task_cs(current)->fmeter);
145 rcu_read_unlock();
146 }
147
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	/* -1 means "use the default"; anything above max+1 is invalid. */
	if (val < -1 || val > sched_domain_level_max + 1)
		return -EINVAL;
#endif

	if (val == cs->relax_domain_level)
		return 0;

	cs->relax_domain_level = val;
	/* Only a load-balanced, non-empty cpuset affects sched domains. */
	if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
		rebuild_sched_domains_locked();

	return 0;
}
164
/* Write handler for the signed-integer v1 control files ("write_s64"). */
static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
			    s64 val)
{
	struct cpuset *cs = css_cs(css);
	int ret = -ENODEV;

	cpuset_full_lock();
	if (is_cpuset_online(cs)) {
		switch ((cpuset_filetype_t)cft->private) {
		case FILE_SCHED_RELAX_DOMAIN_LEVEL:
			pr_info_once("cpuset.%s is deprecated\n", cft->name);
			ret = update_relax_domain_level(cs, val);
			break;
		default:
			ret = -EINVAL;
			break;
		}
	}
	cpuset_full_unlock();
	return ret;
}
189
cpuset_read_s64(struct cgroup_subsys_state * css,struct cftype * cft)190 static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
191 {
192 struct cpuset *cs = css_cs(css);
193 cpuset_filetype_t type = cft->private;
194
195 switch (type) {
196 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
197 return cs->relax_domain_level;
198 default:
199 BUG();
200 }
201
202 /* Unreachable but makes gcc happy */
203 return 0;
204 }
205
206 /*
207 * update task's spread flag if cpuset's page/slab spread flag is set
208 *
209 * Call with callback_lock or cpuset_mutex held. The check can be skipped
210 * if on default hierarchy.
211 */
cpuset1_update_task_spread_flags(struct cpuset * cs,struct task_struct * tsk)212 void cpuset1_update_task_spread_flags(struct cpuset *cs,
213 struct task_struct *tsk)
214 {
215 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
216 return;
217
218 if (is_spread_page(cs))
219 task_set_spread_page(tsk);
220 else
221 task_clear_spread_page(tsk);
222
223 if (is_spread_slab(cs))
224 task_set_spread_slab(tsk);
225 else
226 task_clear_spread_slab(tsk);
227 }
228
229 /**
230 * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
231 * @cs: the cpuset in which each task's spread flags needs to be changed
232 *
233 * Iterate through each task of @cs updating its spread flags. As this
234 * function is called with cpuset_mutex held, cpuset membership stays
235 * stable.
236 */
cpuset1_update_tasks_flags(struct cpuset * cs)237 void cpuset1_update_tasks_flags(struct cpuset *cs)
238 {
239 struct css_task_iter it;
240 struct task_struct *task;
241
242 css_task_iter_start(&cs->css, 0, &it);
243 while ((task = css_task_iter_next(&it)))
244 cpuset1_update_task_spread_flags(cs, task);
245 css_task_iter_end(&it);
246 }
247
248 /*
249 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
250 * or memory nodes, we need to walk over the cpuset hierarchy,
251 * removing that CPU or node from all cpusets. If this removes the
252 * last CPU or node from a cpuset, then move the tasks in the empty
253 * cpuset to its next-highest non-empty parent.
254 */
remove_tasks_in_empty_cpuset(struct cpuset * cs)255 static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
256 {
257 struct cpuset *parent;
258
259 /*
260 * Find its next-highest non-empty parent, (top cpuset
261 * has online cpus, so can't be empty).
262 */
263 parent = parent_cs(cs);
264 while (cpumask_empty(parent->cpus_allowed) ||
265 nodes_empty(parent->mems_allowed))
266 parent = parent_cs(parent);
267
268 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
269 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
270 pr_cont_cgroup_name(cs->css.cgroup);
271 pr_cont("\n");
272 }
273 }
274
cpuset_migrate_tasks_workfn(struct work_struct * work)275 static void cpuset_migrate_tasks_workfn(struct work_struct *work)
276 {
277 struct cpuset_remove_tasks_struct *s;
278
279 s = container_of(work, struct cpuset_remove_tasks_struct, work);
280 remove_tasks_in_empty_cpuset(s->cs);
281 css_put(&s->cs->css);
282 kfree(s);
283 }
284
/*
 * cpuset1_hotplug_update_tasks - sync a v1 cpuset's masks/tasks after hotplug
 * @cs: cpuset being updated
 * @new_cpus: CPUs that remain available to @cs
 * @new_mems: memory nodes that remain available to @cs
 * @cpus_updated: true if the CPU mask actually changed
 * @mems_updated: true if the memory nodemask actually changed
 */
void cpuset1_hotplug_update_tasks(struct cpuset *cs,
			struct cpumask *new_cpus, nodemask_t *new_mems,
			bool cpus_updated, bool mems_updated)
{
	bool is_empty;

	/* Publish the new masks; allowed and effective are set identically. */
	cpuset_callback_lock_irq();
	cpumask_copy(cs->cpus_allowed, new_cpus);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->mems_allowed = *new_mems;
	cs->effective_mems = *new_mems;
	cpuset_callback_unlock_irq();

	/*
	 * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
	 * as the tasks will be migrated to an ancestor.
	 */
	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
		cpuset_update_tasks_cpumask(cs, new_cpus);
	if (mems_updated && !nodes_empty(cs->mems_allowed))
		cpuset_update_tasks_nodemask(cs);

	is_empty = cpumask_empty(cs->cpus_allowed) ||
		   nodes_empty(cs->mems_allowed);

	/*
	 * Move tasks to the nearest ancestor with execution resources,
	 * This is full cgroup operation which will also call back into
	 * cpuset. Execute it asynchronously using workqueue.
	 */
	if (is_empty && cs->css.cgroup->nr_populated_csets &&
	    css_tryget_online(&cs->css)) {
		struct cpuset_remove_tasks_struct *s;

		s = kzalloc_obj(*s);
		if (WARN_ON_ONCE(!s)) {
			/* drop the reference from css_tryget_online() */
			css_put(&cs->css);
			return;
		}

		/* The queued work owns the css ref; released in the workfn. */
		s->cs = cs;
		INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
		schedule_work(&s->work);
	}
}
330
331 /*
332 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
333 *
334 * One cpuset is a subset of another if all its allowed CPUs and
335 * Memory Nodes are a subset of the other, and its exclusive flags
336 * are only set if the other's are set. Call holding cpuset_mutex.
337 */
338
is_cpuset_subset(const struct cpuset * p,const struct cpuset * q)339 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
340 {
341 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
342 nodes_subset(p->mems_allowed, q->mems_allowed) &&
343 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
344 is_mem_exclusive(p) <= is_mem_exclusive(q);
345 }
346
347 /*
348 * cpuset1_validate_change() - Validate conditions specific to legacy (v1)
349 * behavior.
350 */
cpuset1_validate_change(struct cpuset * cur,struct cpuset * trial)351 int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
352 {
353 struct cgroup_subsys_state *css;
354 struct cpuset *c, *par;
355 int ret;
356
357 WARN_ON_ONCE(!rcu_read_lock_held());
358
359 /* Each of our child cpusets must be a subset of us */
360 ret = -EBUSY;
361 cpuset_for_each_child(c, css, cur)
362 if (!is_cpuset_subset(c, trial))
363 goto out;
364
365 /* On legacy hierarchy, we must be a subset of our parent cpuset. */
366 ret = -EACCES;
367 par = parent_cs(cur);
368 if (par && !is_cpuset_subset(trial, par))
369 goto out;
370
371 /*
372 * Cpusets with tasks - existing or newly being attached - can't
373 * be changed to have empty cpus_allowed or mems_allowed.
374 */
375 ret = -ENOSPC;
376 if (cpuset_is_populated(cur)) {
377 if (!cpumask_empty(cur->cpus_allowed) &&
378 cpumask_empty(trial->cpus_allowed))
379 goto out;
380 if (!nodes_empty(cur->mems_allowed) &&
381 nodes_empty(trial->mems_allowed))
382 goto out;
383 }
384
385 ret = 0;
386 out:
387 return ret;
388 }
389
390 /*
391 * cpuset1_cpus_excl_conflict() - Check if two cpusets have exclusive CPU conflicts
392 * to legacy (v1)
393 * @cs1: first cpuset to check
394 * @cs2: second cpuset to check
395 *
396 * Returns: true if CPU exclusivity conflict exists, false otherwise
397 *
398 * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect.
399 */
cpuset1_cpus_excl_conflict(struct cpuset * cs1,struct cpuset * cs2)400 bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
401 {
402 if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
403 return cpumask_intersects(cs1->cpus_allowed,
404 cs2->cpus_allowed);
405
406 return false;
407 }
408
409 #ifdef CONFIG_PROC_PID_CPUSET
410 /*
411 * proc_cpuset_show()
412 * - Print tasks cpuset path into seq_file.
413 * - Used for /proc/<pid>/cpuset.
414 */
proc_cpuset_show(struct seq_file * m,struct pid_namespace * ns,struct pid * pid,struct task_struct * tsk)415 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
416 struct pid *pid, struct task_struct *tsk)
417 {
418 char *buf;
419 struct cgroup_subsys_state *css;
420 int retval;
421
422 retval = -ENOMEM;
423 buf = kmalloc(PATH_MAX, GFP_KERNEL);
424 if (!buf)
425 goto out;
426
427 rcu_read_lock();
428 spin_lock_irq(&css_set_lock);
429 css = task_css(tsk, cpuset_cgrp_id);
430 retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
431 current->nsproxy->cgroup_ns);
432 spin_unlock_irq(&css_set_lock);
433 rcu_read_unlock();
434
435 if (retval == -E2BIG)
436 retval = -ENAMETOOLONG;
437 if (retval < 0)
438 goto out_free;
439 seq_puts(m, buf);
440 seq_putc(m, '\n');
441 retval = 0;
442 out_free:
443 kfree(buf);
444 out:
445 return retval;
446 }
447 #endif /* CONFIG_PROC_PID_CPUSET */
448
/* Read handler for the boolean/unsigned v1 control files ("read_u64"). */
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;	/* which file is being read */

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		/* filtered rate of direct-reclaim entries, see fmeter above */
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}
480
/* Write handler for the boolean/unsigned v1 control files ("write_u64"). */
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			    u64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;	/* which file is being written */
	int retval = 0;

	cpuset_full_lock();
	/* Reject writes racing with cpuset offlining. */
	if (!is_cpuset_online(cs)) {
		retval = -ENODEV;
		goto out_unlock;
	}

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		pr_info_once("cpuset.%s is deprecated\n", cft->name);
		retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		pr_info_once("cpuset.%s is deprecated\n", cft->name);
		retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name);
		retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		pr_info_once("cpuset.%s is deprecated\n", cft->name);
		retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name);
		/* global gate for fmeter collection, see cpuset_read_u64() */
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_SPREAD_PAGE:
		pr_info_once("cpuset.%s is deprecated\n", cft->name);
		retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		pr_warn_once("cpuset.%s is deprecated\n", cft->name);
		retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	cpuset_full_unlock();
	return retval;
}
534
cpuset1_init(struct cpuset * cs)535 void cpuset1_init(struct cpuset *cs)
536 {
537 fmeter_init(&cs->fmeter);
538 cs->relax_domain_level = -1;
539 }
540
/*
 * cpuset1_online_css - legacy-hierarchy part of onlining a cpuset css
 * @css: the css being brought online
 *
 * Inherits the parent's spread flags and, iff CGRP_CPUSET_CLONE_CHILDREN
 * is set on the cgroup and no sibling is exclusive, clones the parent's
 * cpus/mems configuration into the new cpuset.
 */
void cpuset1_online_css(struct cgroup_subsys_state *css)
{
	struct cpuset *tmp_cs;
	struct cgroup_subsys_state *pos_css;
	struct cpuset *cs = css_cs(css);
	struct cpuset *parent = parent_cs(cs);

	lockdep_assert_cpus_held();
	lockdep_assert_cpuset_lock_held();

	/* Inherit the parent's spread flags. */
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
	if (is_spread_slab(parent))
		set_bit(CS_SPREAD_SLAB, &cs->flags);

	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
		return;

	/*
	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
	 * set. This flag handling is implemented in cgroup core for
	 * historical reasons - the flag may be specified during mount.
	 *
	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
	 * refuse to clone the configuration - thereby refusing the task to
	 * be entered, and as a result refusing the sys_unshare() or
	 * clone() which initiated it. If this becomes a problem for some
	 * users who wish to allow that scenario, then this could be
	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
	 * (and likewise for mems) to the new cgroup.
	 */
	rcu_read_lock();
	cpuset_for_each_child(tmp_cs, pos_css, parent) {
		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
			rcu_read_unlock();
			return;
		}
	}
	rcu_read_unlock();

	/* No exclusive siblings - safe to copy the parent's configuration. */
	cpuset_callback_lock_irq();
	cs->mems_allowed = parent->mems_allowed;
	cs->effective_mems = parent->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
	cpuset_callback_unlock_irq();
}
588
589 static void
update_domain_attr(struct sched_domain_attr * dattr,struct cpuset * c)590 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
591 {
592 if (dattr->relax_domain_level < c->relax_domain_level)
593 dattr->relax_domain_level = c->relax_domain_level;
594 }
595
/*
 * Fold the relax_domain_level of every load-balanced cpuset in
 * @root_cs's subtree into @dattr via update_domain_attr().
 */
static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}
615
616 /*
617 * cpuset1_generate_sched_domains()
618 *
619 * Finding the best partition (set of domains):
620 * The double nested loops below over i, j scan over the load
621 * balanced cpusets (using the array of cpuset pointers in csa[])
622 * looking for pairs of cpusets that have overlapping cpus_allowed
623 * and merging them using a union-find algorithm.
624 *
625 * The union of the cpus_allowed masks from the set of all cpusets
626 * having the same root then form the one element of the partition
627 * (one sched domain) to be passed to partition_sched_domains().
628 */
int cpuset1_generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	struct cpuset *cp;	/* top-down scan of cpusets */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;
	int nslot_update;	/* did this pass claim a new doms[] slot? */

	lockdep_assert_cpuset_lock_held();

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (is_sched_load_balance(&top_cpuset)) {
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc_obj(struct sched_domain_attr);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		/* restrict the single domain to housekeeping CPUs */
		cpumask_and(doms[0], top_cpuset.effective_cpus,
			    housekeeping_cpumask(HK_TYPE_DOMAIN));

		goto done;
	}

	csa = kmalloc_objs(cp, nr_cpusets());
	if (!csa)
		goto done;
	csn = 0;

	/* Collect every load-balanced cpuset with effective CPUs into csa[]. */
	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		if (cp == &top_cpuset)
			continue;

		/*
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing. The former is obvious. The
		 * latter: All child cpusets contain a subset of the
		 * parent's cpus, so just skip them, and then we call
		 * update_domain_attr_tree() to calc relax_domain_level of
		 * the corresponding sched domain.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
		      cpumask_intersects(cp->cpus_allowed,
					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
			continue;

		if (is_sched_load_balance(cp) &&
		    !cpumask_empty(cp->effective_cpus))
			csa[csn++] = cp;

		/* skip @cp's subtree */
		pos_css = css_rightmost_descendant(pos_css);
		continue;
	}
	rcu_read_unlock();

	/* Each collected cpuset starts as its own union-find singleton. */
	for (i = 0; i < csn; i++)
		uf_node_init(&csa[i]->node);

	/* Merge overlapping cpusets */
	for (i = 0; i < csn; i++) {
		for (j = i + 1; j < csn; j++) {
			if (cpusets_overlap(csa[i], csa[j]))
				uf_union(&csa[i]->node, &csa[j]->node);
		}
	}

	/* Count the total number of domains - one per union-find root. */
	for (i = 0; i < csn; i++) {
		if (uf_find(&csa[i]->node) == &csa[i]->node)
			ndoms++;
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc_objs(struct sched_domain_attr, ndoms);

	for (nslot = 0, i = 0; i < csn; i++) {
		nslot_update = 0;
		for (j = i; j < csn; j++) {
			/* is csa[j] in the merged set rooted at csa[i]? */
			if (uf_find(&csa[j]->node) == &csa[i]->node) {
				struct cpumask *dp = doms[nslot];

				if (i == j) {
					/* first member claims a fresh slot */
					nslot_update = 1;
					cpumask_clear(dp);
					if (dattr)
						*(dattr + nslot) = SD_ATTR_INIT;
				}
				cpumask_or(dp, dp, csa[j]->effective_cpus);
				cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
				if (dattr)
					update_domain_attr_tree(dattr + nslot, csa[j]);
			}
		}
		if (nslot_update)
			nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains = doms;
	*attributes = dattr;
	return ndoms;
}
769
770 /*
771 * for the common functions, 'private' gives the type of file
772 */
773
/* Control files exposed on the legacy (v1) cpuset hierarchy. */
struct cftype cpuset1_files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	/* read-only effective masks */
	{
		.name = "effective_cpus",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "effective_mems",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	/* read-only; reports the fmeter-filtered direct-reclaim rate */
	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.private = FILE_MEMORY_PRESSURE,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		/* obsolete, may be removed in the future */
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},

	/* root-only gate for memory_pressure collection */
	{
		.name = "memory_pressure_enabled",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE_ENABLED,
	},

	{ }	/* terminate */
};
876