1 // SPDX-License-Identifier: GPL-2.0-or-later 2 3 #include "cgroup-internal.h" 4 #include "cpuset-internal.h" 5 6 /* 7 * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously 8 */ 9 struct cpuset_remove_tasks_struct { 10 struct work_struct work; 11 struct cpuset *cs; 12 }; 13 14 /* 15 * Frequency meter - How fast is some event occurring? 16 * 17 * These routines manage a digitally filtered, constant time based, 18 * event frequency meter. There are four routines: 19 * fmeter_init() - initialize a frequency meter. 20 * fmeter_markevent() - called each time the event happens. 21 * fmeter_getrate() - returns the recent rate of such events. 22 * fmeter_update() - internal routine used to update fmeter. 23 * 24 * A common data structure is passed to each of these routines, 25 * which is used to keep track of the state required to manage the 26 * frequency meter and its digital filter. 27 * 28 * The filter works on the number of events marked per unit time. 29 * The filter is single-pole low-pass recursive (IIR). The time unit 30 * is 1 second. Arithmetic is done using 32-bit integers scaled to 31 * simulate 3 decimal digits of precision (multiplied by 1000). 32 * 33 * With an FM_COEF of 933, and a time base of 1 second, the filter 34 * has a half-life of 10 seconds, meaning that if the events quit 35 * happening, then the rate returned from the fmeter_getrate() 36 * will be cut in half each 10 seconds, until it converges to zero. 37 * 38 * It is not worth doing a real infinitely recursive filter. If more 39 * than FM_MAXTICKS ticks have elapsed since the last filter event, 40 * just compute FM_MAXTICKS ticks worth, by which point the level 41 * will be stable. 42 * 43 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid 44 * arithmetic overflow in the fmeter_update() routine. 45 * 46 * Given the simple 32 bit integer arithmetic used, this meter works 47 * best for reporting rates between one per millisecond (msec) and 48 * one per 32 (approx) seconds. At constant rates faster than one 49 * per msec it maxes out at values just under 1,000,000. At constant 50 * rates between one per msec, and one per second it will stabilize 51 * to a value N*1000, where N is the rate of events per second. 52 * At constant rates between one per second and one per 32 seconds, 53 * it will be choppy, moving up on the seconds that have an event, 54 * and then decaying until the next event. At rates slower than 55 * about one in 32 seconds, it decays all the way back to zero between 56 * each event. 57 */ 58 59 #define FM_COEF 933 /* coefficient for half-life of 10 secs */ 60 #define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ 61 #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ 62 #define FM_SCALE 1000 /* faux fixed point scale */ 63 64 /* Initialize a frequency meter */ 65 static void fmeter_init(struct fmeter *fmp) 66 { 67 fmp->cnt = 0; 68 fmp->val = 0; 69 fmp->time = 0; 70 spin_lock_init(&fmp->lock); 71 } 72 73 /* Internal meter update - process cnt events and update value */ 74 static void fmeter_update(struct fmeter *fmp) 75 { 76 time64_t now; 77 u32 ticks; 78 79 now = ktime_get_seconds(); 80 ticks = now - fmp->time; 81 82 if (ticks == 0) 83 return; 84 85 ticks = min(FM_MAXTICKS, ticks); 86 while (ticks-- > 0) 87 fmp->val = (FM_COEF * fmp->val) / FM_SCALE; 88 fmp->time = now; 89 90 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; 91 fmp->cnt = 0; 92 } 93 94 /* Process any previous ticks, then bump cnt by one (times scale). */ 95 static void fmeter_markevent(struct fmeter *fmp) 96 { 97 spin_lock(&fmp->lock); 98 fmeter_update(fmp); 99 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); 100 spin_unlock(&fmp->lock); 101 } 102 103 /* Process any previous ticks, then return current value. */ 104 static int fmeter_getrate(struct fmeter *fmp) 105 { 106 int val; 107 108 spin_lock(&fmp->lock); 109 fmeter_update(fmp); 110 val = fmp->val; 111 spin_unlock(&fmp->lock); 112 return val; 113 } 114 115 /* 116 * Collection of memory_pressure is suppressed unless 117 * this flag is enabled by writing "1" to the special 118 * cpuset file 'memory_pressure_enabled' in the root cpuset. 119 */ 120 121 int cpuset_memory_pressure_enabled __read_mostly; 122 123 /* 124 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. 125 * 126 * Keep a running average of the rate of synchronous (direct) 127 * page reclaim efforts initiated by tasks in each cpuset. 128 * 129 * This represents the rate at which some task in the cpuset 130 * ran low on memory on all nodes it was allowed to use, and 131 * had to enter the kernels page reclaim code in an effort to 132 * create more free memory by tossing clean pages or swapping 133 * or writing dirty pages. 134 * 135 * Display to user space in the per-cpuset read-only file 136 * "memory_pressure". Value displayed is an integer 137 * representing the recent rate of entry into the synchronous 138 * (direct) page reclaim by any task attached to the cpuset. 139 */ 140 141 void __cpuset_memory_pressure_bump(void) 142 { 143 rcu_read_lock(); 144 fmeter_markevent(&task_cs(current)->fmeter); 145 rcu_read_unlock(); 146 } 147 148 static int update_relax_domain_level(struct cpuset *cs, s64 val) 149 { 150 #ifdef CONFIG_SMP 151 if (val < -1 || val > sched_domain_level_max + 1) 152 return -EINVAL; 153 #endif 154 155 if (val != cs->relax_domain_level) { 156 cs->relax_domain_level = val; 157 if (!cpumask_empty(cs->cpus_allowed) && 158 is_sched_load_balance(cs)) 159 rebuild_sched_domains_locked(); 160 } 161 162 return 0; 163 } 164 165 static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, 166 s64 val) 167 { 168 struct cpuset *cs = css_cs(css); 169 cpuset_filetype_t type = cft->private; 170 int retval = -ENODEV; 171 172 cpuset_full_lock(); 173 if (!is_cpuset_online(cs)) 174 goto out_unlock; 175 176 switch (type) { 177 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 178 pr_info_once("cpuset.%s is deprecated\n", cft->name); 179 retval = update_relax_domain_level(cs, val); 180 break; 181 default: 182 retval = -EINVAL; 183 break; 184 } 185 out_unlock: 186 cpuset_full_unlock(); 187 return retval; 188 } 189 190 static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) 191 { 192 struct cpuset *cs = css_cs(css); 193 cpuset_filetype_t type = cft->private; 194 195 switch (type) { 196 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 197 return cs->relax_domain_level; 198 default: 199 BUG(); 200 } 201 202 /* Unreachable but makes gcc happy */ 203 return 0; 204 } 205 206 /* 207 * update task's spread flag if cpuset's page/slab spread flag is set 208 * 209 * Call with callback_lock or cpuset_mutex held. The check can be skipped 210 * if on default hierarchy. 211 */ 212 void cpuset1_update_task_spread_flags(struct cpuset *cs, 213 struct task_struct *tsk) 214 { 215 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) 216 return; 217 218 if (is_spread_page(cs)) 219 task_set_spread_page(tsk); 220 else 221 task_clear_spread_page(tsk); 222 223 if (is_spread_slab(cs)) 224 task_set_spread_slab(tsk); 225 else 226 task_clear_spread_slab(tsk); 227 } 228 229 /** 230 * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset. 231 * @cs: the cpuset in which each task's spread flags needs to be changed 232 * 233 * Iterate through each task of @cs updating its spread flags. As this 234 * function is called with cpuset_mutex held, cpuset membership stays 235 * stable. 236 */ 237 void cpuset1_update_tasks_flags(struct cpuset *cs) 238 { 239 struct css_task_iter it; 240 struct task_struct *task; 241 242 css_task_iter_start(&cs->css, 0, &it); 243 while ((task = css_task_iter_next(&it))) 244 cpuset1_update_task_spread_flags(cs, task); 245 css_task_iter_end(&it); 246 } 247 248 /* 249 * If CPU and/or memory hotplug handlers, below, unplug any CPUs 250 * or memory nodes, we need to walk over the cpuset hierarchy, 251 * removing that CPU or node from all cpusets. If this removes the 252 * last CPU or node from a cpuset, then move the tasks in the empty 253 * cpuset to its next-highest non-empty parent. 254 */ 255 static void remove_tasks_in_empty_cpuset(struct cpuset *cs) 256 { 257 struct cpuset *parent; 258 259 /* 260 * Find its next-highest non-empty parent, (top cpuset 261 * has online cpus, so can't be empty). 262 */ 263 parent = parent_cs(cs); 264 while (cpumask_empty(parent->cpus_allowed) || 265 nodes_empty(parent->mems_allowed)) 266 parent = parent_cs(parent); 267 268 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 269 pr_err("cpuset: failed to transfer tasks out of empty cpuset "); 270 pr_cont_cgroup_name(cs->css.cgroup); 271 pr_cont("\n"); 272 } 273 } 274 275 static void cpuset_migrate_tasks_workfn(struct work_struct *work) 276 { 277 struct cpuset_remove_tasks_struct *s; 278 279 s = container_of(work, struct cpuset_remove_tasks_struct, work); 280 remove_tasks_in_empty_cpuset(s->cs); 281 css_put(&s->cs->css); 282 kfree(s); 283 } 284 285 void cpuset1_hotplug_update_tasks(struct cpuset *cs, 286 struct cpumask *new_cpus, nodemask_t *new_mems, 287 bool cpus_updated, bool mems_updated) 288 { 289 bool is_empty; 290 291 cpuset_callback_lock_irq(); 292 cpumask_copy(cs->cpus_allowed, new_cpus); 293 cpumask_copy(cs->effective_cpus, new_cpus); 294 cs->mems_allowed = *new_mems; 295 cs->effective_mems = *new_mems; 296 cpuset_callback_unlock_irq(); 297 298 /* 299 * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty, 300 * as the tasks will be migrated to an ancestor. 301 */ 302 if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) 303 cpuset_update_tasks_cpumask(cs, new_cpus); 304 if (mems_updated && !nodes_empty(cs->mems_allowed)) 305 cpuset_update_tasks_nodemask(cs); 306 307 is_empty = cpumask_empty(cs->cpus_allowed) || 308 nodes_empty(cs->mems_allowed); 309 310 /* 311 * Move tasks to the nearest ancestor with execution resources, 312 * This is full cgroup operation which will also call back into 313 * cpuset. Execute it asynchronously using workqueue. 314 */ 315 if (is_empty && cs->css.cgroup->nr_populated_csets && 316 css_tryget_online(&cs->css)) { 317 struct cpuset_remove_tasks_struct *s; 318 319 s = kzalloc(sizeof(*s), GFP_KERNEL); 320 if (WARN_ON_ONCE(!s)) { 321 css_put(&cs->css); 322 return; 323 } 324 325 s->cs = cs; 326 INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); 327 schedule_work(&s->work); 328 } 329 } 330 331 /* 332 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? 333 * 334 * One cpuset is a subset of another if all its allowed CPUs and 335 * Memory Nodes are a subset of the other, and its exclusive flags 336 * are only set if the other's are set. Call holding cpuset_mutex. 337 */ 338 339 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 340 { 341 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && 342 nodes_subset(p->mems_allowed, q->mems_allowed) && 343 is_cpu_exclusive(p) <= is_cpu_exclusive(q) && 344 is_mem_exclusive(p) <= is_mem_exclusive(q); 345 } 346 347 /* 348 * cpuset1_validate_change() - Validate conditions specific to legacy (v1) 349 * behavior. 350 */ 351 int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) 352 { 353 struct cgroup_subsys_state *css; 354 struct cpuset *c, *par; 355 int ret; 356 357 WARN_ON_ONCE(!rcu_read_lock_held()); 358 359 /* Each of our child cpusets must be a subset of us */ 360 ret = -EBUSY; 361 cpuset_for_each_child(c, css, cur) 362 if (!is_cpuset_subset(c, trial)) 363 goto out; 364 365 /* On legacy hierarchy, we must be a subset of our parent cpuset. */ 366 ret = -EACCES; 367 par = parent_cs(cur); 368 if (par && !is_cpuset_subset(trial, par)) 369 goto out; 370 371 /* 372 * Cpusets with tasks - existing or newly being attached - can't 373 * be changed to have empty cpus_allowed or mems_allowed. 374 */ 375 ret = -ENOSPC; 376 if (cpuset_is_populated(cur)) { 377 if (!cpumask_empty(cur->cpus_allowed) && 378 cpumask_empty(trial->cpus_allowed)) 379 goto out; 380 if (!nodes_empty(cur->mems_allowed) && 381 nodes_empty(trial->mems_allowed)) 382 goto out; 383 } 384 385 ret = 0; 386 out: 387 return ret; 388 } 389 390 /* 391 * cpuset1_cpus_excl_conflict() - Check if two cpusets have exclusive CPU conflicts 392 * to legacy (v1) 393 * @cs1: first cpuset to check 394 * @cs2: second cpuset to check 395 * 396 * Returns: true if CPU exclusivity conflict exists, false otherwise 397 * 398 * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect. 399 */ 400 bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) 401 { 402 if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) 403 return cpumask_intersects(cs1->cpus_allowed, 404 cs2->cpus_allowed); 405 406 return false; 407 } 408 409 #ifdef CONFIG_PROC_PID_CPUSET 410 /* 411 * proc_cpuset_show() 412 * - Print tasks cpuset path into seq_file. 413 * - Used for /proc/<pid>/cpuset. 414 */ 415 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, 416 struct pid *pid, struct task_struct *tsk) 417 { 418 char *buf; 419 struct cgroup_subsys_state *css; 420 int retval; 421 422 retval = -ENOMEM; 423 buf = kmalloc(PATH_MAX, GFP_KERNEL); 424 if (!buf) 425 goto out; 426 427 rcu_read_lock(); 428 spin_lock_irq(&css_set_lock); 429 css = task_css(tsk, cpuset_cgrp_id); 430 retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, 431 current->nsproxy->cgroup_ns); 432 spin_unlock_irq(&css_set_lock); 433 rcu_read_unlock(); 434 435 if (retval == -E2BIG) 436 retval = -ENAMETOOLONG; 437 if (retval < 0) 438 goto out_free; 439 seq_puts(m, buf); 440 seq_putc(m, '\n'); 441 retval = 0; 442 out_free: 443 kfree(buf); 444 out: 445 return retval; 446 } 447 #endif /* CONFIG_PROC_PID_CPUSET */ 448 449 static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) 450 { 451 struct cpuset *cs = css_cs(css); 452 cpuset_filetype_t type = cft->private; 453 454 switch (type) { 455 case FILE_CPU_EXCLUSIVE: 456 return is_cpu_exclusive(cs); 457 case FILE_MEM_EXCLUSIVE: 458 return is_mem_exclusive(cs); 459 case FILE_MEM_HARDWALL: 460 return is_mem_hardwall(cs); 461 case FILE_SCHED_LOAD_BALANCE: 462 return is_sched_load_balance(cs); 463 case FILE_MEMORY_MIGRATE: 464 return is_memory_migrate(cs); 465 case FILE_MEMORY_PRESSURE_ENABLED: 466 return cpuset_memory_pressure_enabled; 467 case FILE_MEMORY_PRESSURE: 468 return fmeter_getrate(&cs->fmeter); 469 case FILE_SPREAD_PAGE: 470 return is_spread_page(cs); 471 case FILE_SPREAD_SLAB: 472 return is_spread_slab(cs); 473 default: 474 BUG(); 475 } 476 477 /* Unreachable but makes gcc happy */ 478 return 0; 479 } 480 481 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, 482 u64 val) 483 { 484 struct cpuset *cs = css_cs(css); 485 cpuset_filetype_t type = cft->private; 486 int retval = 0; 487 488 cpuset_full_lock(); 489 if (!is_cpuset_online(cs)) { 490 retval = -ENODEV; 491 goto out_unlock; 492 } 493 494 switch (type) { 495 case FILE_CPU_EXCLUSIVE: 496 retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val); 497 break; 498 case FILE_MEM_EXCLUSIVE: 499 pr_info_once("cpuset.%s is deprecated\n", cft->name); 500 retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val); 501 break; 502 case FILE_MEM_HARDWALL: 503 pr_info_once("cpuset.%s is deprecated\n", cft->name); 504 retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); 505 break; 506 case FILE_SCHED_LOAD_BALANCE: 507 pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name); 508 retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); 509 break; 510 case FILE_MEMORY_MIGRATE: 511 pr_info_once("cpuset.%s is deprecated\n", cft->name); 512 retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); 513 break; 514 case FILE_MEMORY_PRESSURE_ENABLED: 515 pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name); 516 cpuset_memory_pressure_enabled = !!val; 517 break; 518 case FILE_SPREAD_PAGE: 519 pr_info_once("cpuset.%s is deprecated\n", cft->name); 520 retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val); 521 break; 522 case FILE_SPREAD_SLAB: 523 pr_warn_once("cpuset.%s is deprecated\n", cft->name); 524 retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val); 525 break; 526 default: 527 retval = -EINVAL; 528 break; 529 } 530 out_unlock: 531 cpuset_full_unlock(); 532 return retval; 533 } 534 535 void cpuset1_init(struct cpuset *cs) 536 { 537 fmeter_init(&cs->fmeter); 538 cs->relax_domain_level = -1; 539 } 540 541 void cpuset1_online_css(struct cgroup_subsys_state *css) 542 { 543 struct cpuset *tmp_cs; 544 struct cgroup_subsys_state *pos_css; 545 struct cpuset *cs = css_cs(css); 546 struct cpuset *parent = parent_cs(cs); 547 548 lockdep_assert_cpus_held(); 549 lockdep_assert_cpuset_lock_held(); 550 551 if (is_spread_page(parent)) 552 set_bit(CS_SPREAD_PAGE, &cs->flags); 553 if (is_spread_slab(parent)) 554 set_bit(CS_SPREAD_SLAB, &cs->flags); 555 556 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 557 return; 558 559 /* 560 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is 561 * set. This flag handling is implemented in cgroup core for 562 * historical reasons - the flag may be specified during mount. 563 * 564 * Currently, if any sibling cpusets have exclusive cpus or mem, we 565 * refuse to clone the configuration - thereby refusing the task to 566 * be entered, and as a result refusing the sys_unshare() or 567 * clone() which initiated it. If this becomes a problem for some 568 * users who wish to allow that scenario, then this could be 569 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive 570 * (and likewise for mems) to the new cgroup. 571 */ 572 rcu_read_lock(); 573 cpuset_for_each_child(tmp_cs, pos_css, parent) { 574 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { 575 rcu_read_unlock(); 576 return; 577 } 578 } 579 rcu_read_unlock(); 580 581 cpuset_callback_lock_irq(); 582 cs->mems_allowed = parent->mems_allowed; 583 cs->effective_mems = parent->mems_allowed; 584 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); 585 cpumask_copy(cs->effective_cpus, parent->cpus_allowed); 586 cpuset_callback_unlock_irq(); 587 } 588 589 static void 590 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) 591 { 592 if (dattr->relax_domain_level < c->relax_domain_level) 593 dattr->relax_domain_level = c->relax_domain_level; 594 } 595 596 static void update_domain_attr_tree(struct sched_domain_attr *dattr, 597 struct cpuset *root_cs) 598 { 599 struct cpuset *cp; 600 struct cgroup_subsys_state *pos_css; 601 602 rcu_read_lock(); 603 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { 604 /* skip the whole subtree if @cp doesn't have any CPU */ 605 if (cpumask_empty(cp->cpus_allowed)) { 606 pos_css = css_rightmost_descendant(pos_css); 607 continue; 608 } 609 610 if (is_sched_load_balance(cp)) 611 update_domain_attr(dattr, cp); 612 } 613 rcu_read_unlock(); 614 } 615 616 /* 617 * cpuset1_generate_sched_domains() 618 * 619 * Finding the best partition (set of domains): 620 * The double nested loops below over i, j scan over the load 621 * balanced cpusets (using the array of cpuset pointers in csa[]) 622 * looking for pairs of cpusets that have overlapping cpus_allowed 623 * and merging them using a union-find algorithm. 624 * 625 * The union of the cpus_allowed masks from the set of all cpusets 626 * having the same root then form the one element of the partition 627 * (one sched domain) to be passed to partition_sched_domains(). 628 */ 629 int cpuset1_generate_sched_domains(cpumask_var_t **domains, 630 struct sched_domain_attr **attributes) 631 { 632 struct cpuset *cp; /* top-down scan of cpusets */ 633 struct cpuset **csa; /* array of all cpuset ptrs */ 634 int csn; /* how many cpuset ptrs in csa so far */ 635 int i, j; /* indices for partition finding loops */ 636 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ 637 struct sched_domain_attr *dattr; /* attributes for custom domains */ 638 int ndoms = 0; /* number of sched domains in result */ 639 int nslot; /* next empty doms[] struct cpumask slot */ 640 struct cgroup_subsys_state *pos_css; 641 int nslot_update; 642 643 lockdep_assert_cpuset_lock_held(); 644 645 doms = NULL; 646 dattr = NULL; 647 csa = NULL; 648 649 /* Special case for the 99% of systems with one, full, sched domain */ 650 if (is_sched_load_balance(&top_cpuset)) { 651 ndoms = 1; 652 doms = alloc_sched_domains(ndoms); 653 if (!doms) 654 goto done; 655 656 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); 657 if (dattr) { 658 *dattr = SD_ATTR_INIT; 659 update_domain_attr_tree(dattr, &top_cpuset); 660 } 661 cpumask_and(doms[0], top_cpuset.effective_cpus, 662 housekeeping_cpumask(HK_TYPE_DOMAIN)); 663 664 goto done; 665 } 666 667 csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); 668 if (!csa) 669 goto done; 670 csn = 0; 671 672 rcu_read_lock(); 673 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { 674 if (cp == &top_cpuset) 675 continue; 676 677 /* 678 * Continue traversing beyond @cp iff @cp has some CPUs and 679 * isn't load balancing. The former is obvious. The 680 * latter: All child cpusets contain a subset of the 681 * parent's cpus, so just skip them, and then we call 682 * update_domain_attr_tree() to calc relax_domain_level of 683 * the corresponding sched domain. 684 */ 685 if (!cpumask_empty(cp->cpus_allowed) && 686 !(is_sched_load_balance(cp) && 687 cpumask_intersects(cp->cpus_allowed, 688 housekeeping_cpumask(HK_TYPE_DOMAIN)))) 689 continue; 690 691 if (is_sched_load_balance(cp) && 692 !cpumask_empty(cp->effective_cpus)) 693 csa[csn++] = cp; 694 695 /* skip @cp's subtree */ 696 pos_css = css_rightmost_descendant(pos_css); 697 continue; 698 } 699 rcu_read_unlock(); 700 701 for (i = 0; i < csn; i++) 702 uf_node_init(&csa[i]->node); 703 704 /* Merge overlapping cpusets */ 705 for (i = 0; i < csn; i++) { 706 for (j = i + 1; j < csn; j++) { 707 if (cpusets_overlap(csa[i], csa[j])) 708 uf_union(&csa[i]->node, &csa[j]->node); 709 } 710 } 711 712 /* Count the total number of domains */ 713 for (i = 0; i < csn; i++) { 714 if (uf_find(&csa[i]->node) == &csa[i]->node) 715 ndoms++; 716 } 717 718 /* 719 * Now we know how many domains to create. 720 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 721 */ 722 doms = alloc_sched_domains(ndoms); 723 if (!doms) 724 goto done; 725 726 /* 727 * The rest of the code, including the scheduler, can deal with 728 * dattr==NULL case. No need to abort if alloc fails. 729 */ 730 dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), 731 GFP_KERNEL); 732 733 for (nslot = 0, i = 0; i < csn; i++) { 734 nslot_update = 0; 735 for (j = i; j < csn; j++) { 736 if (uf_find(&csa[j]->node) == &csa[i]->node) { 737 struct cpumask *dp = doms[nslot]; 738 739 if (i == j) { 740 nslot_update = 1; 741 cpumask_clear(dp); 742 if (dattr) 743 *(dattr + nslot) = SD_ATTR_INIT; 744 } 745 cpumask_or(dp, dp, csa[j]->effective_cpus); 746 cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); 747 if (dattr) 748 update_domain_attr_tree(dattr + nslot, csa[j]); 749 } 750 } 751 if (nslot_update) 752 nslot++; 753 } 754 BUG_ON(nslot != ndoms); 755 756 done: 757 kfree(csa); 758 759 /* 760 * Fallback to the default domain if kmalloc() failed. 761 * See comments in partition_sched_domains(). 762 */ 763 if (doms == NULL) 764 ndoms = 1; 765 766 *domains = doms; 767 *attributes = dattr; 768 return ndoms; 769 } 770 771 /* 772 * for the common functions, 'private' gives the type of file 773 */ 774 775 struct cftype cpuset1_files[] = { 776 { 777 .name = "cpus", 778 .seq_show = cpuset_common_seq_show, 779 .write = cpuset_write_resmask, 780 .max_write_len = (100U + 6 * NR_CPUS), 781 .private = FILE_CPULIST, 782 }, 783 784 { 785 .name = "mems", 786 .seq_show = cpuset_common_seq_show, 787 .write = cpuset_write_resmask, 788 .max_write_len = (100U + 6 * MAX_NUMNODES), 789 .private = FILE_MEMLIST, 790 }, 791 792 { 793 .name = "effective_cpus", 794 .seq_show = cpuset_common_seq_show, 795 .private = FILE_EFFECTIVE_CPULIST, 796 }, 797 798 { 799 .name = "effective_mems", 800 .seq_show = cpuset_common_seq_show, 801 .private = FILE_EFFECTIVE_MEMLIST, 802 }, 803 804 { 805 .name = "cpu_exclusive", 806 .read_u64 = cpuset_read_u64, 807 .write_u64 = cpuset_write_u64, 808 .private = FILE_CPU_EXCLUSIVE, 809 }, 810 811 { 812 .name = "mem_exclusive", 813 .read_u64 = cpuset_read_u64, 814 .write_u64 = cpuset_write_u64, 815 .private = FILE_MEM_EXCLUSIVE, 816 }, 817 818 { 819 .name = "mem_hardwall", 820 .read_u64 = cpuset_read_u64, 821 .write_u64 = cpuset_write_u64, 822 .private = FILE_MEM_HARDWALL, 823 }, 824 825 { 826 .name = "sched_load_balance", 827 .read_u64 = cpuset_read_u64, 828 .write_u64 = cpuset_write_u64, 829 .private = FILE_SCHED_LOAD_BALANCE, 830 }, 831 832 { 833 .name = "sched_relax_domain_level", 834 .read_s64 = cpuset_read_s64, 835 .write_s64 = cpuset_write_s64, 836 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, 837 }, 838 839 { 840 .name = "memory_migrate", 841 .read_u64 = cpuset_read_u64, 842 .write_u64 = cpuset_write_u64, 843 .private = FILE_MEMORY_MIGRATE, 844 }, 845 846 { 847 .name = "memory_pressure", 848 .read_u64 = cpuset_read_u64, 849 .private = FILE_MEMORY_PRESSURE, 850 }, 851 852 { 853 .name = "memory_spread_page", 854 .read_u64 = cpuset_read_u64, 855 .write_u64 = cpuset_write_u64, 856 .private = FILE_SPREAD_PAGE, 857 }, 858 859 { 860 /* obsolete, may be removed in the future */ 861 .name = "memory_spread_slab", 862 .read_u64 = cpuset_read_u64, 863 .write_u64 = cpuset_write_u64, 864 .private = FILE_SPREAD_SLAB, 865 }, 866 867 { 868 .name = "memory_pressure_enabled", 869 .flags = CFTYPE_ONLY_ON_ROOT, 870 .read_u64 = cpuset_read_u64, 871 .write_u64 = cpuset_write_u64, 872 .private = FILE_MEMORY_PRESSURE_ENABLED, 873 }, 874 875 { } /* terminate */ 876 }; 877