/*
 * kernel/cpuset.c
 *
 * Processor and Memory placement constraints for sets of tasks.
 *
 * Copyright (C) 2003 BULL SA.
 * Copyright (C) 2004-2007 Silicon Graphics, Inc.
 * Copyright (C) 2006 Google, Inc
 *
 * Portions derived from Patrick Mochel's sysfs code.
 * sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 * 2003-10-10 Written by Simon Derr.
 * 2003-10-22 Updates by Stephen Hemminger.
 * 2004 May-July Rework by Paul Jackson.
 * 2006 Rework by Paul Menage to use generic cgroups
 * 2008 Rework of the scheduler domains and CPU hotplug handling
 *      by Max Krasnyansky
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file COPYING in the main directory of the Linux
 * distribution for more details.
 */
#include "cgroup-internal.h"
#include "cpuset-internal.h"

#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/security.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/*
 * There could be abnormal cpuset configurations for cpu or memory
 * node binding, add this key to provide a quick low-cost judgment
 * of the situation.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);

static const char * const perr_strings[] = {
	[PERR_INVCPUS]   = "Invalid cpu list in cpuset.cpus.exclusive",
	[PERR_INVPARENT] = "Parent is an invalid partition root",
	[PERR_NOTPART]   = "Parent is not a partition root",
	[PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
	[PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
	[PERR_HOTPLUG]   = "No cpu available due to hotplug",
	[PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
	[PERR_HKEEPING]  = "partition config conflicts with housekeeping setup",
	[PERR_ACCESS]    = "Enable partition not permitted",
};

/*
 * Exclusive CPUs distributed out to sub-partitions of top_cpuset
 */
static cpumask_var_t	subpartitions_cpus;

/*
 * Exclusive CPUs in isolated partitions
 */
static cpumask_var_t	isolated_cpus;

/*
 * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot
 */
static cpumask_var_t	boot_hk_cpus;
static bool		have_boot_isolcpus;

/* List of remote partition root children */
static struct list_head remote_children;

/*
 * A flag to force sched domain rebuild at the end of an operation.
 * It can be set in
 *  - update_partition_sd_lb()
 *  - remote_partition_check()
 *  - update_cpumasks_hier()
 *  - cpuset_update_flag()
 *  - cpuset_hotplug_update_tasks()
 *  - cpuset_handle_hotplug()
 *
 * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
 *
 * Note that update_relax_domain_level() in cpuset-v1.c can still call
 * rebuild_sched_domains_locked() directly without using this flag.
 */
static bool force_sd_rebuild;

/*
 * Partition root states:
 *
 *   0 - member (not a partition root)
 *   1 - partition root
 *   2 - partition root without load balancing (isolated)
 *  -1 - invalid partition root
 *  -2 - invalid isolated partition root
 *
 * There are 2 types of partitions - local or remote. Local partitions are
 * those whose parents are partition root themselves. Setting of
 * cpuset.cpus.exclusive is optional in setting up local partitions.
 * Remote partitions are those whose parents are not partition roots. Passing
 * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor
 * nodes is mandatory in creating a remote partition.
 *
 * For simplicity, a local partition can be created under a local or remote
 * partition but a remote partition cannot have any partition root in its
 * ancestor chain except the cgroup root.
 */
#define PRS_MEMBER		0
#define PRS_ROOT		1
#define PRS_ISOLATED		2
#define PRS_INVALID_ROOT	-1
#define PRS_INVALID_ISOLATED	-2

static inline bool is_prs_invalid(int prs_state)
{
	return prs_state < 0;
}
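
/*
 * Note, for example, that the PRS_* encoding is chosen so that a valid state
 * and its invalid counterpart differ only in sign: PRS_ROOT (1) <->
 * PRS_INVALID_ROOT (-1) and PRS_ISOLATED (2) <-> PRS_INVALID_ISOLATED (-2).
 * This is why make_partition_invalid() and the error paths further below can
 * simply negate partition_root_state.
 */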

/*
 * Temporary cpumasks for working with partitions that are passed among
 * functions to avoid memory allocation in inner functions.
 */
struct tmpmasks {
	cpumask_var_t addmask, delmask;	/* For partition root */
	cpumask_var_t new_cpus;		/* For update_cpumasks_hier() */
};

void inc_dl_tasks_cs(struct task_struct *p)
{
	struct cpuset *cs = task_cs(p);

	cs->nr_deadline_tasks++;
}

void dec_dl_tasks_cs(struct task_struct *p)
{
	struct cpuset *cs = task_cs(p);

	cs->nr_deadline_tasks--;
}

static inline int is_partition_valid(const struct cpuset *cs)
{
	return cs->partition_root_state > 0;
}

static inline int is_partition_invalid(const struct cpuset *cs)
{
	return cs->partition_root_state < 0;
}

/*
 * Callers should hold callback_lock to modify partition_root_state.
 */
static inline void make_partition_invalid(struct cpuset *cs)
{
	if (cs->partition_root_state > 0)
		cs->partition_root_state = -cs->partition_root_state;
}

/*
 * Send notification event whenever partition_root_state changes.
 */
static inline void notify_partition_change(struct cpuset *cs, int old_prs)
{
	if (old_prs == cs->partition_root_state)
		return;
	cgroup_file_notify(&cs->partition_file);

	/* Reset prs_err if not invalid */
	if (is_partition_valid(cs))
		WRITE_ONCE(cs->prs_err, PERR_NONE);
}

static struct cpuset top_cpuset = {
	.flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
		 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
	.partition_root_state = PRS_ROOT,
	.relax_domain_level = -1,
	.remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
};

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
 * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
 * structures. Note that cpuset_mutex needs to be a mutex as it is used in
 * paths that rely on priority inheritance (e.g. scheduler - on RT) for
 * correctness.
 *
 * A task must hold both locks to modify cpusets. If a task holds
 * cpuset_mutex, it blocks others, ensuring that it is the only task able to
 * also acquire callback_lock and be able to modify cpusets. It can perform
 * various checks on the cpuset structure first, knowing nothing will change.
 * It can also allocate memory while just holding cpuset_mutex. While it is
 * performing these checks, various callback routines can briefly acquire
 * callback_lock to query cpusets. Once it is ready to make the changes, it
 * takes callback_lock, blocking everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by another task, so we use alloc_lock in the task_struct to protect
 * them.
 *
 * The cpuset_common_seq_show() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 */

static DEFINE_MUTEX(cpuset_mutex);

void cpuset_lock(void)
{
	mutex_lock(&cpuset_mutex);
}

void cpuset_unlock(void)
{
	mutex_unlock(&cpuset_mutex);
}

static DEFINE_SPINLOCK(callback_lock);

void cpuset_callback_lock_irq(void)
{
	spin_lock_irq(&callback_lock);
}

void cpuset_callback_unlock_irq(void)
{
	spin_unlock_irq(&callback_lock);
}

static struct workqueue_struct *cpuset_migrate_mm_wq;

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

static inline void check_insane_mems_config(nodemask_t *nodes)
{
	if (!cpusets_insane_config() &&
	    movable_only_nodes(nodes)) {
		static_branch_enable(&cpusets_insane_config_key);
		pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
			"Cpuset allocations might fail even with a lot of memory available.\n",
			nodemask_pr_args(nodes));
	}
}

/*
 * decrease cs->attach_in_progress.
 * wake_up cpuset_attach_wq if cs->attach_in_progress==0.
 */
static inline void dec_attach_in_progress_locked(struct cpuset *cs)
{
	lockdep_assert_held(&cpuset_mutex);

	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);
}

static inline void dec_attach_in_progress(struct cpuset *cs)
{
	mutex_lock(&cpuset_mutex);
	dec_attach_in_progress_locked(cs);
	mutex_unlock(&cpuset_mutex);
}

static inline bool cpuset_v2(void)
{
	return !IS_ENABLED(CONFIG_CPUSETS_V1) ||
		cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
}

/*
 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
 * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
 * With v2 behavior, "cpus" and "mems" are always what the users have
 * requested and won't be changed by hotplug events. Only the effective
 * cpus or mems will be affected.
 */
static inline bool is_in_v2_mode(void)
{
	return cpuset_v2() ||
	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}

/**
 * partition_is_populated - check if partition has tasks
 * @cs: partition root to be checked
 * @excluded_child: a child cpuset to be excluded in task checking
 * Return: true if there are tasks, false otherwise
 *
 * It is assumed that @cs is a valid partition root. @excluded_child should
 * be non-NULL when this cpuset is going to become a partition itself.
 */
static inline bool partition_is_populated(struct cpuset *cs,
					  struct cpuset *excluded_child)
{
	struct cgroup_subsys_state *css;
	struct cpuset *child;

	if (cs->css.cgroup->nr_populated_csets)
		return true;
	if (!excluded_child && !cs->nr_subparts)
		return cgroup_is_populated(cs->css.cgroup);

	rcu_read_lock();
	cpuset_for_each_child(child, css, cs) {
		if (child == excluded_child)
			continue;
		if (is_partition_valid(child))
			continue;
		if (cgroup_is_populated(child->css.cgroup)) {
			rcu_read_unlock();
			return true;
		}
	}
	rcu_read_unlock();
	return false;
}

/*
 * Return in pmask the portion of a task's cpuset's cpus_allowed that
 * are online and are capable of running the task. If none are found,
 * walk up the cpuset hierarchy until we find one that does have some
 * appropriate cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_cpus(struct task_struct *tsk,
				  struct cpumask *pmask)
{
	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
	struct cpuset *cs;

	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
		cpumask_copy(pmask, cpu_online_mask);

	rcu_read_lock();
	cs = task_cs(tsk);

	while (!cpumask_intersects(cs->effective_cpus, pmask))
		cs = parent_cs(cs);

	cpumask_and(pmask, pmask, cs->effective_cpus);
	rcu_read_unlock();
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory. If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems. The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}
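
/*
 * For example, if a cpuset's effective_cpus contains only CPUs that have
 * since gone offline, guarantee_online_cpus() keeps walking up through
 * parent_cs() until it reaches an ancestor (ultimately top_cpuset) whose
 * effective_cpus still intersects the online mask, so the returned mask is
 * never empty. guarantee_online_mems() follows the same pattern for memory
 * nodes.
 */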

/**
 * alloc_cpumasks - allocate cpumasks for a cpuset or a tmpmasks structure
 * @cs: the cpuset that has cpumasks to be allocated.
 * @tmp: the tmpmasks structure pointer
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Only one of the two input arguments should be non-NULL.
 */
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4;

	if (cs) {
		pmask1 = &cs->cpus_allowed;
		pmask2 = &cs->effective_cpus;
		pmask3 = &cs->effective_xcpus;
		pmask4 = &cs->exclusive_cpus;
	} else {
		pmask1 = &tmp->new_cpus;
		pmask2 = &tmp->addmask;
		pmask3 = &tmp->delmask;
		pmask4 = NULL;
	}

	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
		goto free_one;

	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
		goto free_two;

	if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
		goto free_three;

	return 0;

free_three:
	free_cpumask_var(*pmask3);
free_two:
	free_cpumask_var(*pmask2);
free_one:
	free_cpumask_var(*pmask1);
	return -ENOMEM;
}

/**
 * free_cpumasks - free cpumasks in a tmpmasks structure
 * @cs: the cpuset whose cpumasks are to be freed.
 * @tmp: the tmpmasks structure pointer
 */
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	if (cs) {
		free_cpumask_var(cs->cpus_allowed);
		free_cpumask_var(cs->effective_cpus);
		free_cpumask_var(cs->effective_xcpus);
		free_cpumask_var(cs->exclusive_cpus);
	}
	if (tmp) {
		free_cpumask_var(tmp->new_cpus);
		free_cpumask_var(tmp->addmask);
		free_cpumask_var(tmp->delmask);
	}
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (alloc_cpumasks(trial, NULL)) {
		kfree(trial);
		return NULL;
	}

	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
	cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
	cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
	return trial;
}

/**
 * free_cpuset - free the cpuset
 * @cs: the cpuset to be freed
 */
static inline void free_cpuset(struct cpuset *cs)
{
	free_cpumasks(cs, NULL);
	kfree(cs);
}

/* Return user specified exclusive CPUs */
static inline struct cpumask *user_xcpus(struct cpuset *cs)
{
	return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
						 : cs->exclusive_cpus;
}

static inline bool xcpus_empty(struct cpuset *cs)
{
	return cpumask_empty(cs->cpus_allowed) &&
	       cpumask_empty(cs->exclusive_cpus);
}

/*
 * cpusets_are_exclusive() - check if two cpusets are exclusive
 *
 * Return true if exclusive, false if not
 */
static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
{
	struct cpumask *xcpus1 = user_xcpus(cs1);
	struct cpumask *xcpus2 = user_xcpus(cs2);

	if (cpumask_intersects(xcpus1, xcpus2))
		return false;
	return true;
}
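
/*
 * For example, with cs1->exclusive_cpus = 0-3 and cs2->exclusive_cpus unset
 * but cs2->cpus_allowed = 4-7, user_xcpus() yields 0-3 and 4-7 respectively;
 * the two masks do not intersect, so cpusets_are_exclusive() returns true.
 */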

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */

static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret = 0;

	rcu_read_lock();

	if (!is_in_v2_mode())
		ret = cpuset1_validate_change(cur, trial);
	if (ret)
		goto out;

	/* Remaining checks don't apply to root cpuset */
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks. This check is not done when scheduling is disabled as the
	 * users should know what they are doing.
	 *
	 * For v1, effective_cpus == cpus_allowed & user_xcpus() returns
	 * cpus_allowed.
	 *
	 * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only
	 * for non-isolated partition root. At this point, the target
	 * effective_cpus isn't computed yet. user_xcpus() is the best
	 * approximation.
	 *
	 * TBD: May need to precompute the real effective_cpus here in case
	 * incorrect scheduling of SCHED_DEADLINE tasks in a partition
	 * becomes an issue.
	 */
	ret = -EBUSY;
	if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&
	    !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap. exclusive_cpus cannot overlap with each other if set.
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		bool txset, cxset;	/* Are exclusive_cpus set? */

		if (c == cur)
			continue;

		txset = !cpumask_empty(trial->exclusive_cpus);
		cxset = !cpumask_empty(c->exclusive_cpus);
		if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) ||
		    (txset && cxset)) {
			if (!cpusets_are_exclusive(trial, c))
				goto out;
		} else if (txset || cxset) {
			struct cpumask *xcpus, *acpus;

			/*
			 * When just one of the exclusive_cpus's is set,
			 * cpus_allowed of the other cpuset, if set, cannot be
			 * a subset of it or none of those CPUs will be
			 * available if these exclusive CPUs are activated.
			 */
			if (txset) {
				xcpus = trial->exclusive_cpus;
				acpus = c->cpus_allowed;
			} else {
				xcpus = c->exclusive_cpus;
				acpus = trial->cpus_allowed;
			}
			if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus))
				goto out;
		}
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}
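
/*
 * As a concrete case of the one-sided check above: if only the trial cpuset
 * sets exclusive_cpus = 0-3 while a sibling's cpus_allowed is 2-3, the
 * sibling's mask is a subset of the proposed exclusive CPUs and would be
 * left with nothing once those CPUs are activated, so the change is
 * rejected with -EINVAL.
 */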

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}

/* Must be called with cpuset_mutex held. */
static inline int nr_cpusets(void)
{
	/* jump label reference count + the top-level cpuset */
	return static_key_count(&cpusets_enabled_key.key) + 1;
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched/core.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cpuset_mutex held.
 *
 * The three key local variables below are:
 *    cp - cpuset pointer, used (together with pos_css) to perform a
 *	   top-down scan of all cpusets. For our purposes, rebuilding
 *	   the scheduler's sched domains, we can ignore !is_sched_load_
 *	   balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *	   that need to be load balanced, for convenient iterative
 *	   access by the subsequent code that finds the best partition,
 *	   i.e. the set of domains (subsets) of CPUs such that the
 *	   cpus_allowed of every cpuset marked is_sched_load_balance
 *	   is a subset of one of these domains, while there are as
 *	   many such domains as possible, each as small as possible.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *	   the kernel/sched/core.c routine partition_sched_domains() in a
 *	   convenient format, that can be easily compared to the prior
 *	   value to determine what partition elements (sched domains)
 *	   were changed (added or removed.)
 *
 * Finding the best partition (set of domains):
 *	The double nested loops below over i, j scan over the load
 *	balanced cpusets (using the array of cpuset pointers in csa[])
 *	looking for pairs of cpusets that have overlapping cpus_allowed
 *	and merging them using a union-find algorithm.
 *
 *	The union of the cpus_allowed masks from the set of all cpusets
 *	having the same root then form the one element of the partition
 *	(one sched domain) to be passed to partition_sched_domains().
 *
 */
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	struct cpuset *cp;	/* top-down scan of cpusets */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;
	bool root_load_balance = is_sched_load_balance(&top_cpuset);
	bool cgrpv2 = cpuset_v2();
	int nslot_update;

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
single_root_domain:
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_and(doms[0], top_cpuset.effective_cpus,
			    housekeeping_cpumask(HK_TYPE_DOMAIN));

		goto done;
	}

	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	rcu_read_lock();
	if (root_load_balance)
		csa[csn++] = &top_cpuset;
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		if (cp == &top_cpuset)
			continue;

		if (cgrpv2)
			goto v2;

		/*
		 * v1:
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing. The former is obvious. The
		 * latter: All child cpusets contain a subset of the
		 * parent's cpus, so just skip them, and then we call
		 * update_domain_attr_tree() to calc relax_domain_level of
		 * the corresponding sched domain.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
		      cpumask_intersects(cp->cpus_allowed,
					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
			continue;

		if (is_sched_load_balance(cp) &&
		    !cpumask_empty(cp->effective_cpus))
			csa[csn++] = cp;

		/* skip @cp's subtree */
		pos_css = css_rightmost_descendant(pos_css);
		continue;

v2:
		/*
		 * Only valid partition roots that are not isolated and with
		 * non-empty effective_cpus will be saved into csa[].
		 */
		if ((cp->partition_root_state == PRS_ROOT) &&
		    !cpumask_empty(cp->effective_cpus))
			csa[csn++] = cp;

		/*
		 * Skip @cp's subtree if not a partition root and has no
		 * exclusive CPUs to be granted to child cpusets.
		 */
		if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
			pos_css = css_rightmost_descendant(pos_css);
	}
	rcu_read_unlock();

	/*
	 * If there are only isolated partitions underneath the cgroup root,
	 * we can optimize out unneeded sched domains scanning.
	 */
	if (root_load_balance && (csn == 1))
		goto single_root_domain;

	for (i = 0; i < csn; i++)
		uf_node_init(&csa[i]->node);

	/* Merge overlapping cpusets */
	for (i = 0; i < csn; i++) {
		for (j = i + 1; j < csn; j++) {
			if (cpusets_overlap(csa[i], csa[j])) {
				/*
				 * Cgroup v2 shouldn't pass down overlapping
				 * partition root cpusets.
				 */
				WARN_ON_ONCE(cgrpv2);
				uf_union(&csa[i]->node, &csa[j]->node);
			}
		}
	}

	/* Count the total number of domains */
	for (i = 0; i < csn; i++) {
		if (uf_find(&csa[i]->node) == &csa[i]->node)
			ndoms++;
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
			      GFP_KERNEL);

	/*
	 * Cgroup v2 doesn't support domain attributes, just set all of them
	 * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
	 * subset of HK_TYPE_DOMAIN housekeeping CPUs.
	 */
	if (cgrpv2) {
		for (i = 0; i < ndoms; i++) {
			/*
			 * The top cpuset may contain some boot time isolated
			 * CPUs that need to be excluded from the sched domain.
			 */
			if (csa[i] == &top_cpuset)
				cpumask_and(doms[i], csa[i]->effective_cpus,
					    housekeeping_cpumask(HK_TYPE_DOMAIN));
			else
				cpumask_copy(doms[i], csa[i]->effective_cpus);
			if (dattr)
				dattr[i] = SD_ATTR_INIT;
		}
		goto done;
	}

	for (nslot = 0, i = 0; i < csn; i++) {
		nslot_update = 0;
		for (j = i; j < csn; j++) {
			if (uf_find(&csa[j]->node) == &csa[i]->node) {
				struct cpumask *dp = doms[nslot];

				if (i == j) {
					nslot_update = 1;
					cpumask_clear(dp);
					if (dattr)
						*(dattr + nslot) = SD_ATTR_INIT;
				}
				cpumask_or(dp, dp, csa[j]->effective_cpus);
				cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
				if (dattr)
					update_domain_attr_tree(dattr + nslot, csa[j]);
			}
		}
		if (nslot_update)
			nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}

static void dl_update_tasks_root_domain(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	if (cs->nr_deadline_tasks == 0)
		return;

	css_task_iter_start(&cs->css, 0, &it);

	while ((task = css_task_iter_next(&it)))
		dl_add_task_root_domain(task);

	css_task_iter_end(&it);
}

static void dl_rebuild_rd_accounting(void)
{
	struct cpuset *cs = NULL;
	struct cgroup_subsys_state *pos_css;

	lockdep_assert_held(&cpuset_mutex);
	lockdep_assert_cpus_held();
	lockdep_assert_held(&sched_domains_mutex);

	rcu_read_lock();

	/*
	 * Clear default root domain DL accounting, it will be computed again
	 * if a task belongs to it.
	 */
	dl_clear_root_domain(&def_root_domain);

	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {

		if (cpumask_empty(cs->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		css_get(&cs->css);

		rcu_read_unlock();

		dl_update_tasks_root_domain(cs);

		rcu_read_lock();
		css_put(&cs->css);
	}
	rcu_read_unlock();
}

static void
partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
				    struct sched_domain_attr *dattr_new)
{
	mutex_lock(&sched_domains_mutex);
	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
	dl_rebuild_rd_accounting();
	mutex_unlock(&sched_domains_mutex);
}

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held.  Takes cpus_read_lock().
 */
void rebuild_sched_domains_locked(void)
{
	struct cgroup_subsys_state *pos_css;
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	struct cpuset *cs;
	int ndoms;

	lockdep_assert_cpus_held();
	lockdep_assert_held(&cpuset_mutex);
	force_sd_rebuild = false;

	/*
	 * If we have raced with CPU hotplug, return early to avoid
	 * passing doms with offlined cpu to partition_sched_domains().
	 * Anyway, cpuset_handle_hotplug() will rebuild sched domains.
	 *
	 * With no CPUs in any subpartitions, top_cpuset's effective CPUs
	 * should be the same as the active CPUs, so checking only top_cpuset
	 * is enough to detect racing CPU offlines.
	 */
	if (cpumask_empty(subpartitions_cpus) &&
	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
		return;

	/*
	 * With subpartition CPUs, however, the effective CPUs of a partition
	 * root should be only a subset of the active CPUs. Since a CPU in any
	 * partition root could be offlined, all must be checked.
	 */
	if (!cpumask_empty(subpartitions_cpus)) {
		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			if (!is_partition_valid(cs)) {
				pos_css = css_rightmost_descendant(pos_css);
				continue;
			}
			if (!cpumask_subset(cs->effective_cpus,
					    cpu_active_mask)) {
				rcu_read_unlock();
				return;
			}
		}
		rcu_read_unlock();
	}

	/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);

	/* Have scheduler rebuild the domains */
	partition_and_rebuild_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

static void rebuild_sched_domains_cpuslocked(void)
{
	mutex_lock(&cpuset_mutex);
	rebuild_sched_domains_locked();
	mutex_unlock(&cpuset_mutex);
}

void rebuild_sched_domains(void)
{
	cpus_read_lock();
	rebuild_sched_domains_cpuslocked();
	cpus_read_unlock();
}
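
/*
 * Note that the usual calling sequence is cpus_read_lock() ->
 * rebuild_sched_domains_cpuslocked() -> cpuset_mutex ->
 * rebuild_sched_domains_locked(), matching the "cpus_read_lock held"
 * assumption documented above for force_sd_rebuild and the lockdep
 * assertions in rebuild_sched_domains_locked().
 */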

/**
 * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 * @new_cpus: the temp variable for the new effective_cpus mask
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
 * is used instead of effective_cpus to make sure all offline CPUs are also
 * included as hotplug code won't update cpumasks for tasks in top_cpuset.
 */
void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
	struct css_task_iter it;
	struct task_struct *task;
	bool top_cs = cs == &top_cpuset;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it))) {
		const struct cpumask *possible_mask = task_cpu_possible_mask(task);

		if (top_cs) {
			/*
			 * Percpu kthreads in top_cpuset are ignored
			 */
			if (kthread_is_per_cpu(task))
				continue;
			cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
		} else {
			cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
		}
		set_cpus_allowed_ptr(task, new_cpus);
	}
	css_task_iter_end(&it);
}

/**
 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
 * @new_cpus: the temp variable for the new effective_cpus mask
 * @cs: the cpuset that needs to recompute the new effective_cpus mask
 * @parent: the parent cpuset
 *
 * The result is valid only if the given cpuset isn't a partition root.
 */
static void compute_effective_cpumask(struct cpumask *new_cpus,
				      struct cpuset *cs, struct cpuset *parent)
{
	cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
}

/*
 * Commands for update_parent_effective_cpumask
 */
enum partition_cmd {
	partcmd_enable,		/* Enable partition root	  */
	partcmd_enablei,	/* Enable isolated partition root */
	partcmd_disable,	/* Disable partition root	  */
	partcmd_update,		/* Update parent's effective_cpus */
	partcmd_invalidate,	/* Make partition invalid	  */
};

static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
				    struct tmpmasks *tmp);

/*
 * Update partition exclusive flag
 *
 * Return: 0 if successful, an error code otherwise
 */
static int update_partition_exclusive(struct cpuset *cs, int new_prs)
{
	bool exclusive = (new_prs > PRS_MEMBER);

	if (exclusive && !is_cpu_exclusive(cs)) {
		if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1))
			return PERR_NOTEXCL;
	} else if (!exclusive && is_cpu_exclusive(cs)) {
		/* Turning off CS_CPU_EXCLUSIVE will not return error */
		cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0);
	}
	return 0;
}
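
/*
 * For instance, turning a member cpuset into a partition root (new_prs >
 * PRS_MEMBER) implicitly sets CS_CPU_EXCLUSIVE here; if that flag change
 * fails the exclusivity checks, PERR_NOTEXCL ("Cpu list in cpuset.cpus not
 * exclusive") is the reason reported for the partition.
 */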

/*
 * Update partition load balance flag and/or rebuild sched domain
 *
 * Changing load balance flag will automatically call
 * rebuild_sched_domains_locked().
 * This function is for cgroup v2 only.
 */
static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
{
	int new_prs = cs->partition_root_state;
	bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
	bool new_lb;

	/*
	 * If cs is not a valid partition root, the load balance state
	 * will follow its parent.
	 */
	if (new_prs > 0) {
		new_lb = (new_prs != PRS_ISOLATED);
	} else {
		new_lb = is_sched_load_balance(parent_cs(cs));
	}
	if (new_lb != !!is_sched_load_balance(cs)) {
		rebuild_domains = true;
		if (new_lb)
			set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
		else
			clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
	}

	if (rebuild_domains)
		cpuset_force_rebuild();
}

/*
 * tasks_nocpu_error - Return true if tasks will have no effective_cpus
 */
static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
			      struct cpumask *xcpus)
{
	/*
	 * A populated partition (cs or parent) can't have empty effective_cpus
	 */
	return (cpumask_subset(parent->effective_cpus, xcpus) &&
		partition_is_populated(parent, cs)) ||
	       (!cpumask_intersects(xcpus, cpu_active_mask) &&
		partition_is_populated(cs, NULL));
}

static void reset_partition_data(struct cpuset *cs)
{
	struct cpuset *parent = parent_cs(cs);

	if (!cpuset_v2())
		return;

	lockdep_assert_held(&callback_lock);

	cs->nr_subparts = 0;
	if (cpumask_empty(cs->exclusive_cpus)) {
		cpumask_clear(cs->effective_xcpus);
		if (is_cpu_exclusive(cs))
			clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
	}
	if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed))
		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
}

/*
 * partition_xcpus_newstate - Exclusive CPUs state change
 * @old_prs: old partition_root_state
 * @new_prs: new partition_root_state
 * @xcpus: exclusive CPUs with state change
 */
static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
{
	WARN_ON_ONCE(old_prs == new_prs);
	if (new_prs == PRS_ISOLATED)
		cpumask_or(isolated_cpus, isolated_cpus, xcpus);
	else
		cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
}

/*
 * partition_xcpus_add - Add new exclusive CPUs to partition
 * @new_prs: new partition_root_state
 * @parent: parent cpuset
 * @xcpus: exclusive CPUs to be added
 * Return: true if isolated_cpus modified, false otherwise
 *
 * Remote partition if parent == NULL
 */
static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
				struct cpumask *xcpus)
{
	bool isolcpus_updated;

	WARN_ON_ONCE(new_prs < 0);
	lockdep_assert_held(&callback_lock);
	if (!parent)
		parent = &top_cpuset;

	if (parent == &top_cpuset)
		cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);

	isolcpus_updated = (new_prs != parent->partition_root_state);
	if (isolcpus_updated)
		partition_xcpus_newstate(parent->partition_root_state, new_prs,
					 xcpus);

	cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
	return isolcpus_updated;
}
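
/*
 * For example, if parent->effective_cpus is 0-7 and a child isolated
 * partition claims xcpus 2-3, partition_xcpus_add() removes 2-3 from the
 * parent's effective_cpus (leaving 0-1,4-7), adds them to
 * subpartitions_cpus when the parent is top_cpuset, and, since the child's
 * state differs from the parent's, ORs them into isolated_cpus.
 */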

/*
 * partition_xcpus_del - Remove exclusive CPUs from partition
 * @old_prs: old partition_root_state
 * @parent: parent cpuset
 * @xcpus: exclusive CPUs to be removed
 * Return: true if isolated_cpus modified, false otherwise
 *
 * Remote partition if parent == NULL
 */
static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
				struct cpumask *xcpus)
{
	bool isolcpus_updated;

	WARN_ON_ONCE(old_prs < 0);
	lockdep_assert_held(&callback_lock);
	if (!parent)
		parent = &top_cpuset;

	if (parent == &top_cpuset)
		cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);

	isolcpus_updated = (old_prs != parent->partition_root_state);
	if (isolcpus_updated)
		partition_xcpus_newstate(old_prs, parent->partition_root_state,
					 xcpus);

	cpumask_and(xcpus, xcpus, cpu_active_mask);
	cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
	return isolcpus_updated;
}

static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
{
	int ret;

	lockdep_assert_cpus_held();

	if (!isolcpus_updated)
		return;

	ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
	WARN_ON_ONCE(ret < 0);
}

/**
 * cpuset_cpu_is_isolated - Check if the given CPU is isolated
 * @cpu: the CPU number to be checked
 * Return: true if CPU is used in an isolated partition, false otherwise
 */
bool cpuset_cpu_is_isolated(int cpu)
{
	return cpumask_test_cpu(cpu, isolated_cpus);
}
EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);

/*
 * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
 * @cs: cpuset
 * @xcpus: effective exclusive CPUs value to be set
 * Return: true if xcpus is not empty, false otherwise.
 *
 * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
 * it must be a subset of parent's effective_xcpus.
 */
static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
						struct cpumask *xcpus)
{
	struct cpuset *parent = parent_cs(cs);

	if (!xcpus)
		xcpus = cs->effective_xcpus;

	return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
}

static inline bool is_remote_partition(struct cpuset *cs)
{
	return !list_empty(&cs->remote_sibling);
}

static inline bool is_local_partition(struct cpuset *cs)
{
	return is_partition_valid(cs) && !is_remote_partition(cs);
}
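
/*
 * In other words, a cpuset is tracked as a remote partition purely by its
 * membership on the remote_children list: e.g. a partition root created
 * several levels below the cgroup root, with cpuset.cpus.exclusive set all
 * the way down, sits on that list, while a partition whose parent is itself
 * a partition root is a local partition and stays off the list.
 */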

/*
 * remote_partition_enable - Enable current cpuset as a remote partition root
 * @cs: the cpuset to update
 * @new_prs: new partition_root_state
 * @tmp: temporary masks
 * Return: 0 if successful, errcode if error
 *
 * Enable the current cpuset to become a remote partition root taking CPUs
 * directly from the top cpuset. cpuset_mutex must be held by the caller.
 */
static int remote_partition_enable(struct cpuset *cs, int new_prs,
				   struct tmpmasks *tmp)
{
	bool isolcpus_updated;

	/*
	 * The user must have sysadmin privilege.
	 */
	if (!capable(CAP_SYS_ADMIN))
		return PERR_ACCESS;

	/*
	 * The requested exclusive_cpus must not be allocated to other
	 * partitions and it can't use up all the root's effective_cpus.
	 *
	 * Note that if there is any local partition root above it or
	 * remote partition root underneath it, its exclusive_cpus must
	 * have overlapped with subpartitions_cpus.
	 */
	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
	if (cpumask_empty(tmp->new_cpus) ||
	    cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
	    cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
		return PERR_INVCPUS;

	spin_lock_irq(&callback_lock);
	isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
	list_add(&cs->remote_sibling, &remote_children);
	spin_unlock_irq(&callback_lock);
	update_unbound_workqueue_cpumask(isolcpus_updated);

	/*
	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
	 */
	cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
	return 0;
}

/*
 * remote_partition_disable - Remove current cpuset from remote partition list
 * @cs: the cpuset to update
 * @tmp: temporary masks
 *
 * The effective_cpus mask is also updated.
 *
 * cpuset_mutex must be held by the caller.
 */
static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
{
	bool isolcpus_updated;

	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
	WARN_ON_ONCE(!is_remote_partition(cs));
	WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));

	spin_lock_irq(&callback_lock);
	list_del_init(&cs->remote_sibling);
	isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
					       NULL, tmp->new_cpus);
	cs->partition_root_state = -cs->partition_root_state;
	if (!cs->prs_err)
		cs->prs_err = PERR_INVCPUS;
	reset_partition_data(cs);
	spin_unlock_irq(&callback_lock);
	update_unbound_workqueue_cpumask(isolcpus_updated);

	/*
	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
	 */
	cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
}

/*
 * remote_cpus_update - cpus_exclusive change of remote partition
 * @cs: the cpuset to be updated
 * @newmask: the new effective_xcpus mask
 * @tmp: temporary masks
 *
 * top_cpuset and subpartitions_cpus will be updated or the partition can be
 * invalidated.
 */
static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
			       struct tmpmasks *tmp)
{
	bool adding, deleting;
	int prs = cs->partition_root_state;
	int isolcpus_updated = 0;

	if (WARN_ON_ONCE(!is_remote_partition(cs)))
		return;

	WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));

	if (cpumask_empty(newmask))
		goto invalidate;

	adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
	deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);

	/*
	 * Additions of remote CPUs are only allowed if those CPUs are
	 * not allocated to other partitions and there are effective_cpus
	 * left in the top cpuset.
	 */
	if (adding && (!capable(CAP_SYS_ADMIN) ||
		       cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
		       cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)))
		goto invalidate;

	spin_lock_irq(&callback_lock);
	if (adding)
		isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
	if (deleting)
		isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
	spin_unlock_irq(&callback_lock);
	update_unbound_workqueue_cpumask(isolcpus_updated);

	/*
	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
	 */
	cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
	return;

invalidate:
	remote_partition_disable(cs, tmp);
}

/*
 * remote_partition_check - check if a child remote partition needs update
 * @cs: the cpuset to be updated
 * @newmask: the new effective_xcpus mask
 * @delmask: temporary mask for deletion (not in tmp)
 * @tmp: temporary masks
 *
 * This should be called before the given cs has updated its cpus_allowed
 * and/or effective_xcpus.
 */
static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
				   struct cpumask *delmask, struct tmpmasks *tmp)
{
	struct cpuset *child, *next;
	int disable_cnt = 0;

	/*
	 * Compute the effective exclusive CPUs that will be deleted.
	 */
	if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) ||
	    !cpumask_intersects(delmask, subpartitions_cpus))
		return;	/* No deletion of exclusive CPUs in partitions */

	/*
	 * Search the remote children list for those that will be impacted
	 * by the deletion of exclusive CPUs.
	 *
	 * Since a cpuset must be removed from the remote children list
	 * before it can go offline, holding cpuset_mutex will prevent
	 * any change in cpuset status. RCU read lock isn't needed.
	 */
	lockdep_assert_held(&cpuset_mutex);
	list_for_each_entry_safe(child, next, &remote_children, remote_sibling)
		if (cpumask_intersects(child->effective_cpus, delmask)) {
			remote_partition_disable(child, tmp);
			disable_cnt++;
		}
	if (disable_cnt)
		cpuset_force_rebuild();
}

/*
 * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
 * @prstate: partition root state to be checked
 * @new_cpus: cpu mask
 * Return: true if there is conflict, false otherwise
 *
 * CPUs outside of boot_hk_cpus, if defined, can only be used in an
 * isolated partition.
 */
static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
{
	if (!have_boot_isolcpus)
		return false;

	if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus))
		return true;

	return false;
}

/**
 * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
 * @cs:      The cpuset that requests change in partition root state
 * @cmd:     Partition root state change command
 * @newmask: Optional new cpumask for partcmd_update
 * @tmp:     Temporary addmask and delmask
 * Return:   0 or a partition root state error code
 *
 * For partcmd_enable*, the cpuset is being transformed from a non-partition
 * root to a partition root. The effective_xcpus (cpus_allowed if
 * effective_xcpus not set) mask of the given cpuset will be taken away from
 * parent's effective_cpus. The function will return 0 if all the CPUs listed
 * in effective_xcpus can be granted or an error code will be returned.
 *
 * For partcmd_disable, the cpuset is being transformed from a partition
 * root back to a non-partition root. Any CPUs in effective_xcpus will be
 * given back to parent's effective_cpus. 0 will always be returned.
 *
 * For partcmd_update, if the optional newmask is specified, the cpu list is
 * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
 * assumed to remain the same. The cpuset should either be a valid or invalid
 * partition root. The partition root state may change from valid to invalid
 * or vice versa. An error code will be returned if transitioning from
 * invalid to valid violates the exclusivity rule.
 *
 * For partcmd_invalidate, the current partition will be made invalid.
 *
 * The partcmd_enable* and partcmd_disable commands are used by
 * update_prstate(). An error code may be returned and the caller will check
 * for error.
 *
 * The partcmd_update command is used by update_cpumasks_hier() with newmask
 * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
 * by update_cpumask() with NULL newmask. In both cases, the callers won't
 * check for error and so partition_root_state and prs_err will be updated
 * directly.
 */
static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
					   struct cpumask *newmask,
					   struct tmpmasks *tmp)
{
	struct cpuset *parent = parent_cs(cs);
	int adding;	/* Adding cpus to parent's effective_cpus	*/
	int deleting;	/* Deleting cpus from parent's effective_cpus	*/
	int old_prs, new_prs;
	int part_error = PERR_NONE;	/* Partition error? */
	int subparts_delta = 0;
	struct cpumask *xcpus;		/* cs effective_xcpus */
	int isolcpus_updated = 0;
	bool nocpu;

	lockdep_assert_held(&cpuset_mutex);

	/*
	 * new_prs will only be changed for the partcmd_update and
	 * partcmd_invalidate commands.
	 */
	adding = deleting = false;
	old_prs = new_prs = cs->partition_root_state;
	xcpus = user_xcpus(cs);

	if (cmd == partcmd_invalidate) {
		if (is_prs_invalid(old_prs))
			return 0;

		/*
		 * Make the current partition invalid.
		 */
		if (is_partition_valid(parent))
			adding = cpumask_and(tmp->addmask,
					     xcpus, parent->effective_xcpus);
		if (old_prs > 0) {
			new_prs = -old_prs;
			subparts_delta--;
		}
		goto write_error;
	}

	/*
	 * The parent must be a partition root.
	 * The new cpumask, if present, or the current cpus_allowed must
	 * not be empty.
	 */
	if (!is_partition_valid(parent)) {
		return is_partition_invalid(parent)
		       ? PERR_INVPARENT : PERR_NOTPART;
	}
	if (!newmask && xcpus_empty(cs))
		return PERR_CPUSEMPTY;

	nocpu = tasks_nocpu_error(parent, cs, xcpus);
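
	/*
	 * From here on, the individual partition commands are handled.
	 * For example, for partcmd_enable on a child asking for CPUs 2-3 of
	 * a parent whose effective_xcpus is 0-7, CPUs 2-3 end up in
	 * tmp->delmask below so they can be deleted from the parent's
	 * effective_cpus.
	 */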

	if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
		/*
		 * Enabling partition root is not allowed if its
		 * effective_xcpus is empty or doesn't overlap with
		 * parent's effective_xcpus.
		 */
		if (cpumask_empty(xcpus) ||
		    !cpumask_intersects(xcpus, parent->effective_xcpus))
			return PERR_INVCPUS;

		if (prstate_housekeeping_conflict(new_prs, xcpus))
			return PERR_HKEEPING;

		/*
		 * A parent can be left with no CPU as long as there is no
		 * task directly associated with the parent partition.
		 */
		if (nocpu)
			return PERR_NOCPUS;

		cpumask_copy(tmp->delmask, xcpus);
		deleting = true;
		subparts_delta++;
		new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
	} else if (cmd == partcmd_disable) {
		/*
		 * May need to add cpus to parent's effective_cpus for
		 * valid partition root.
		 */
		adding = !is_prs_invalid(old_prs) &&
			  cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
		if (adding)
			subparts_delta--;
		new_prs = PRS_MEMBER;
	} else if (newmask) {
		/*
		 * Empty cpumask is not allowed
		 */
		if (cpumask_empty(newmask)) {
			part_error = PERR_CPUSEMPTY;
			goto write_error;
		}
		/* Check newmask again, whether cpus are available for parent/cs */
		nocpu |= tasks_nocpu_error(parent, cs, newmask);

		/*
		 * partcmd_update with newmask:
		 *
		 * Compute add/delete mask to/from effective_cpus
		 *
		 * For valid partition:
		 *   addmask = exclusive_cpus & ~newmask
		 *			      & parent->effective_xcpus
		 *   delmask = newmask & ~exclusive_cpus
		 *			& parent->effective_xcpus
		 *
		 * For invalid partition:
		 *   delmask = newmask & parent->effective_xcpus
		 */
		if (is_prs_invalid(old_prs)) {
			adding = false;
			deleting = cpumask_and(tmp->delmask,
					       newmask, parent->effective_xcpus);
		} else {
			cpumask_andnot(tmp->addmask, xcpus, newmask);
			adding = cpumask_and(tmp->addmask, tmp->addmask,
					     parent->effective_xcpus);

			cpumask_andnot(tmp->delmask, newmask, xcpus);
			deleting = cpumask_and(tmp->delmask, tmp->delmask,
					       parent->effective_xcpus);
		}
		/*
		 * Make partition invalid if parent's effective_cpus could
		 * become empty and there are tasks in the parent.
		 */
		if (nocpu && (!adding ||
			      !cpumask_intersects(tmp->addmask, cpu_active_mask))) {
			part_error = PERR_NOCPUS;
			deleting = false;
			adding = cpumask_and(tmp->addmask,
					     xcpus, parent->effective_xcpus);
		}
	} else {
		/*
		 * partcmd_update w/o newmask
		 *
		 * delmask = effective_xcpus & parent->effective_cpus
		 *
		 * This can be called from:
		 * 1) update_cpumasks_hier()
		 * 2) cpuset_hotplug_update_tasks()
		 *
		 * Check to see if it can be transitioned from valid to
		 * invalid partition or vice versa.
		 *
		 * A partition error happens when parent has tasks and all
		 * its effective CPUs will have to be distributed out.
		 */
		WARN_ON_ONCE(!is_partition_valid(parent));
		if (nocpu) {
			part_error = PERR_NOCPUS;
			if (is_partition_valid(cs))
				adding = cpumask_and(tmp->addmask,
						     xcpus, parent->effective_xcpus);
		} else if (is_partition_invalid(cs) &&
			   cpumask_subset(xcpus, parent->effective_xcpus)) {
			struct cgroup_subsys_state *css;
			struct cpuset *child;
			bool exclusive = true;

			/*
			 * Converting an invalid partition to a valid one has
			 * to pass the cpu exclusivity test.
1777 */ 1778 rcu_read_lock(); 1779 cpuset_for_each_child(child, css, parent) { 1780 if (child == cs) 1781 continue; 1782 if (!cpusets_are_exclusive(cs, child)) { 1783 exclusive = false; 1784 break; 1785 } 1786 } 1787 rcu_read_unlock(); 1788 if (exclusive) 1789 deleting = cpumask_and(tmp->delmask, 1790 xcpus, parent->effective_cpus); 1791 else 1792 part_error = PERR_NOTEXCL; 1793 } 1794 } 1795 1796 write_error: 1797 if (part_error) 1798 WRITE_ONCE(cs->prs_err, part_error); 1799 1800 if (cmd == partcmd_update) { 1801 /* 1802 * Check for possible transition between valid and invalid 1803 * partition root. 1804 */ 1805 switch (cs->partition_root_state) { 1806 case PRS_ROOT: 1807 case PRS_ISOLATED: 1808 if (part_error) { 1809 new_prs = -old_prs; 1810 subparts_delta--; 1811 } 1812 break; 1813 case PRS_INVALID_ROOT: 1814 case PRS_INVALID_ISOLATED: 1815 if (!part_error) { 1816 new_prs = -old_prs; 1817 subparts_delta++; 1818 } 1819 break; 1820 } 1821 } 1822 1823 if (!adding && !deleting && (new_prs == old_prs)) 1824 return 0; 1825 1826 /* 1827 * Transitioning between invalid to valid or vice versa may require 1828 * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update, 1829 * validate_change() has already been successfully called and 1830 * CPU lists in cs haven't been updated yet. So defer it to later. 1831 */ 1832 if ((old_prs != new_prs) && (cmd != partcmd_update)) { 1833 int err = update_partition_exclusive(cs, new_prs); 1834 1835 if (err) 1836 return err; 1837 } 1838 1839 /* 1840 * Change the parent's effective_cpus & effective_xcpus (top cpuset 1841 * only). 1842 * 1843 * Newly added CPUs will be removed from effective_cpus and 1844 * newly deleted ones will be added back to effective_cpus. 1845 */ 1846 spin_lock_irq(&callback_lock); 1847 if (old_prs != new_prs) { 1848 cs->partition_root_state = new_prs; 1849 if (new_prs <= 0) 1850 cs->nr_subparts = 0; 1851 } 1852 /* 1853 * Adding to parent's effective_cpus means deletion CPUs from cs 1854 * and vice versa. 1855 */ 1856 if (adding) 1857 isolcpus_updated += partition_xcpus_del(old_prs, parent, 1858 tmp->addmask); 1859 if (deleting) 1860 isolcpus_updated += partition_xcpus_add(new_prs, parent, 1861 tmp->delmask); 1862 1863 if (is_partition_valid(parent)) { 1864 parent->nr_subparts += subparts_delta; 1865 WARN_ON_ONCE(parent->nr_subparts < 0); 1866 } 1867 spin_unlock_irq(&callback_lock); 1868 update_unbound_workqueue_cpumask(isolcpus_updated); 1869 1870 if ((old_prs != new_prs) && (cmd == partcmd_update)) 1871 update_partition_exclusive(cs, new_prs); 1872 1873 if (adding || deleting) { 1874 cpuset_update_tasks_cpumask(parent, tmp->addmask); 1875 update_sibling_cpumasks(parent, cs, tmp); 1876 } 1877 1878 /* 1879 * For partcmd_update without newmask, it is being called from 1880 * cpuset_handle_hotplug(). Update the load balance flag and 1881 * scheduling domain accordingly. 1882 */ 1883 if ((cmd == partcmd_update) && !newmask) 1884 update_partition_sd_lb(cs, old_prs); 1885 1886 notify_partition_change(cs, old_prs); 1887 return 0; 1888 } 1889 1890 /** 1891 * compute_partition_effective_cpumask - compute effective_cpus for partition 1892 * @cs: partition root cpuset 1893 * @new_ecpus: previously computed effective_cpus to be updated 1894 * 1895 * Compute the effective_cpus of a partition root by scanning effective_xcpus 1896 * of child partition roots and excluding their effective_xcpus. 1897 * 1898 * This has the side effect of invalidating valid child partition roots, 1899 * if necessary. 
Since it is called from either cpuset_hotplug_update_tasks() 1900 * or update_cpumasks_hier() where parent and children are modified 1901 * successively, we don't need to call update_parent_effective_cpumask() 1902 * and the child's effective_cpus will be updated in later iterations. 1903 * 1904 * Note that rcu_read_lock() is assumed to be held. 1905 */ 1906 static void compute_partition_effective_cpumask(struct cpuset *cs, 1907 struct cpumask *new_ecpus) 1908 { 1909 struct cgroup_subsys_state *css; 1910 struct cpuset *child; 1911 bool populated = partition_is_populated(cs, NULL); 1912 1913 /* 1914 * Check child partition roots to see if they should be 1915 * invalidated when 1916 * 1) child effective_xcpus not a subset of new 1917 * exclusive_cpus 1918 * 2) All the effective_cpus will be used up and cp 1919 * has tasks 1920 */ 1921 compute_effective_exclusive_cpumask(cs, new_ecpus); 1922 cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); 1923 1924 rcu_read_lock(); 1925 cpuset_for_each_child(child, css, cs) { 1926 if (!is_partition_valid(child)) 1927 continue; 1928 1929 child->prs_err = 0; 1930 if (!cpumask_subset(child->effective_xcpus, 1931 cs->effective_xcpus)) 1932 child->prs_err = PERR_INVCPUS; 1933 else if (populated && 1934 cpumask_subset(new_ecpus, child->effective_xcpus)) 1935 child->prs_err = PERR_NOCPUS; 1936 1937 if (child->prs_err) { 1938 int old_prs = child->partition_root_state; 1939 1940 /* 1941 * Invalidate child partition 1942 */ 1943 spin_lock_irq(&callback_lock); 1944 make_partition_invalid(child); 1945 cs->nr_subparts--; 1946 child->nr_subparts = 0; 1947 spin_unlock_irq(&callback_lock); 1948 notify_partition_change(child, old_prs); 1949 continue; 1950 } 1951 cpumask_andnot(new_ecpus, new_ecpus, 1952 child->effective_xcpus); 1953 } 1954 rcu_read_unlock(); 1955 } 1956 1957 /* 1958 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree 1959 * @cs: the cpuset to consider 1960 * @tmp: temp variables for calculating effective_cpus & partition setup 1961 * @force: don't skip any descendant cpusets if set 1962 * 1963 * When configured cpumask is changed, the effective cpumasks of this cpuset 1964 * and all its descendants need to be updated. 1965 * 1966 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed. 1967 * 1968 * Called with cpuset_mutex held 1969 */ 1970 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, 1971 bool force) 1972 { 1973 struct cpuset *cp; 1974 struct cgroup_subsys_state *pos_css; 1975 bool need_rebuild_sched_domains = false; 1976 int old_prs, new_prs; 1977 1978 rcu_read_lock(); 1979 cpuset_for_each_descendant_pre(cp, pos_css, cs) { 1980 struct cpuset *parent = parent_cs(cp); 1981 bool remote = is_remote_partition(cp); 1982 bool update_parent = false; 1983 1984 /* 1985 * Skip descendant remote partition that acquires CPUs 1986 * directly from top cpuset unless it is cs. 1987 */ 1988 if (remote && (cp != cs)) { 1989 pos_css = css_rightmost_descendant(pos_css); 1990 continue; 1991 } 1992 1993 /* 1994 * Update effective_xcpus if exclusive_cpus set. 1995 * The case when exclusive_cpus isn't set is handled later.
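 *
 * (Editorial aside, not in the original source: the pre-order walk in this
 * function uses the usual cgroup idiom for pruning a whole subtree,
 *
 *	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
 *		if (subtree_can_be_skipped(cp)) {
 *			pos_css = css_rightmost_descendant(pos_css);
 *			continue;
 *		}
 *		...
 *	}
 *
 * where advancing pos_css to the rightmost descendant makes the iterator
 * resume after cp's entire subtree. subtree_can_be_skipped() is only a
 * placeholder for the remote-partition and no-change checks used here.)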
1996 */ 1997 if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) { 1998 spin_lock_irq(&callback_lock); 1999 compute_effective_exclusive_cpumask(cp, NULL); 2000 spin_unlock_irq(&callback_lock); 2001 } 2002 2003 old_prs = new_prs = cp->partition_root_state; 2004 if (remote || (is_partition_valid(parent) && 2005 is_partition_valid(cp))) 2006 compute_partition_effective_cpumask(cp, tmp->new_cpus); 2007 else 2008 compute_effective_cpumask(tmp->new_cpus, cp, parent); 2009 2010 /* 2011 * A partition with no effective_cpus is allowed as long as 2012 * there is no task associated with it. Call 2013 * update_parent_effective_cpumask() to check it. 2014 */ 2015 if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) { 2016 update_parent = true; 2017 goto update_parent_effective; 2018 } 2019 2020 /* 2021 * If it becomes empty, inherit the effective mask of the 2022 * parent, which is guaranteed to have some CPUs unless 2023 * it is a partition root that has explicitly distributed 2024 * out all its CPUs. 2025 */ 2026 if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) 2027 cpumask_copy(tmp->new_cpus, parent->effective_cpus); 2028 2029 if (remote) 2030 goto get_css; 2031 2032 /* 2033 * Skip the whole subtree if 2034 * 1) the cpumask remains the same, 2035 * 2) has no partition root state, 2036 * 3) force flag not set, and 2037 * 4) for v2 load balance state same as its parent. 2038 */ 2039 if (!cp->partition_root_state && !force && 2040 cpumask_equal(tmp->new_cpus, cp->effective_cpus) && 2041 (!cpuset_v2() || 2042 (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { 2043 pos_css = css_rightmost_descendant(pos_css); 2044 continue; 2045 } 2046 2047 update_parent_effective: 2048 /* 2049 * update_parent_effective_cpumask() should have been called 2050 * for cs already in update_cpumask(). We should also call 2051 * cpuset_update_tasks_cpumask() again for tasks in the parent 2052 * cpuset if the parent's effective_cpus changes. 2053 */ 2054 if ((cp != cs) && old_prs) { 2055 switch (parent->partition_root_state) { 2056 case PRS_ROOT: 2057 case PRS_ISOLATED: 2058 update_parent = true; 2059 break; 2060 2061 default: 2062 /* 2063 * When parent is not a partition root or is 2064 * invalid, child partition roots become 2065 * invalid too. 2066 */ 2067 if (is_partition_valid(cp)) 2068 new_prs = -cp->partition_root_state; 2069 WRITE_ONCE(cp->prs_err, 2070 is_partition_invalid(parent) 2071 ? PERR_INVPARENT : PERR_NOTPART); 2072 break; 2073 } 2074 } 2075 get_css: 2076 if (!css_tryget_online(&cp->css)) 2077 continue; 2078 rcu_read_unlock(); 2079 2080 if (update_parent) { 2081 update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp); 2082 /* 2083 * The cpuset partition_root_state may become 2084 * invalid. Capture it. 2085 */ 2086 new_prs = cp->partition_root_state; 2087 } 2088 2089 spin_lock_irq(&callback_lock); 2090 cpumask_copy(cp->effective_cpus, tmp->new_cpus); 2091 cp->partition_root_state = new_prs; 2092 /* 2093 * Make sure effective_xcpus is properly set for a valid 2094 * partition root. 
2095 */ 2096 if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) 2097 cpumask_and(cp->effective_xcpus, 2098 cp->cpus_allowed, parent->effective_xcpus); 2099 else if (new_prs < 0) 2100 reset_partition_data(cp); 2101 spin_unlock_irq(&callback_lock); 2102 2103 notify_partition_change(cp, old_prs); 2104 2105 WARN_ON(!is_in_v2_mode() && 2106 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); 2107 2108 cpuset_update_tasks_cpumask(cp, cp->effective_cpus); 2109 2110 /* 2111 * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE 2112 * from parent if current cpuset isn't a valid partition root 2113 * and their load balance states differ. 2114 */ 2115 if (cpuset_v2() && !is_partition_valid(cp) && 2116 (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { 2117 if (is_sched_load_balance(parent)) 2118 set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); 2119 else 2120 clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); 2121 } 2122 2123 /* 2124 * On legacy hierarchy, if the effective cpumask of any non- 2125 * empty cpuset is changed, we need to rebuild sched domains. 2126 * On default hierarchy, the cpuset needs to be a partition 2127 * root as well. 2128 */ 2129 if (!cpumask_empty(cp->cpus_allowed) && 2130 is_sched_load_balance(cp) && 2131 (!cpuset_v2() || is_partition_valid(cp))) 2132 need_rebuild_sched_domains = true; 2133 2134 rcu_read_lock(); 2135 css_put(&cp->css); 2136 } 2137 rcu_read_unlock(); 2138 2139 if (need_rebuild_sched_domains) 2140 cpuset_force_rebuild(); 2141 } 2142 2143 /** 2144 * update_sibling_cpumasks - Update siblings cpumasks 2145 * @parent: Parent cpuset 2146 * @cs: Current cpuset 2147 * @tmp: Temp variables 2148 */ 2149 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, 2150 struct tmpmasks *tmp) 2151 { 2152 struct cpuset *sibling; 2153 struct cgroup_subsys_state *pos_css; 2154 2155 lockdep_assert_held(&cpuset_mutex); 2156 2157 /* 2158 * Check all its siblings and call update_cpumasks_hier() 2159 * if their effective_cpus will need to be changed. 2160 * 2161 * It is possible a change in parent's effective_cpus 2162 * due to a change in a child partition's effective_xcpus will impact 2163 * its siblings even if they do not inherit parent's effective_cpus 2164 * directly. 2165 * 2166 * The update_cpumasks_hier() function may sleep. So we have to 2167 * release the RCU read lock before calling it. 
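 *
 * A minimal sketch of that idiom (illustrative; the loop below adds a few
 * more checks):
 *
 *	rcu_read_lock();
 *	cpuset_for_each_child(sibling, pos_css, parent) {
 *		if (!css_tryget_online(&sibling->css))
 *			continue;
 *		rcu_read_unlock();
 *		update_cpumasks_hier(sibling, tmp, false);	(may sleep)
 *		rcu_read_lock();
 *		css_put(&sibling->css);
 *	}
 *	rcu_read_unlock();
 *
 * The temporary css reference is what keeps the sibling cpuset alive while
 * the RCU read lock is dropped.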
2168 */ 2169 rcu_read_lock(); 2170 cpuset_for_each_child(sibling, pos_css, parent) { 2171 if (sibling == cs) 2172 continue; 2173 if (!is_partition_valid(sibling)) { 2174 compute_effective_cpumask(tmp->new_cpus, sibling, 2175 parent); 2176 if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) 2177 continue; 2178 } 2179 if (!css_tryget_online(&sibling->css)) 2180 continue; 2181 2182 rcu_read_unlock(); 2183 update_cpumasks_hier(sibling, tmp, false); 2184 rcu_read_lock(); 2185 css_put(&sibling->css); 2186 } 2187 rcu_read_unlock(); 2188 } 2189 2190 /** 2191 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 2192 * @cs: the cpuset to consider 2193 * @trialcs: trial cpuset 2194 * @buf: buffer of cpu numbers written to this cpuset 2195 */ 2196 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 2197 const char *buf) 2198 { 2199 int retval; 2200 struct tmpmasks tmp; 2201 struct cpuset *parent = parent_cs(cs); 2202 bool invalidate = false; 2203 bool force = false; 2204 int old_prs = cs->partition_root_state; 2205 2206 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ 2207 if (cs == &top_cpuset) 2208 return -EACCES; 2209 2210 /* 2211 * An empty cpus_allowed is ok only if the cpuset has no tasks. 2212 * Since cpulist_parse() fails on an empty mask, we special case 2213 * that parsing. The validate_change() call ensures that cpusets 2214 * with tasks have cpus. 2215 */ 2216 if (!*buf) { 2217 cpumask_clear(trialcs->cpus_allowed); 2218 if (cpumask_empty(trialcs->exclusive_cpus)) 2219 cpumask_clear(trialcs->effective_xcpus); 2220 } else { 2221 retval = cpulist_parse(buf, trialcs->cpus_allowed); 2222 if (retval < 0) 2223 return retval; 2224 2225 if (!cpumask_subset(trialcs->cpus_allowed, 2226 top_cpuset.cpus_allowed)) 2227 return -EINVAL; 2228 2229 /* 2230 * When exclusive_cpus isn't explicitly set, it is constrained 2231 * by cpus_allowed and parent's effective_xcpus. Otherwise, 2232 * trialcs->effective_xcpus is used as a temporary cpumask 2233 * for checking validity of the partition root. 2234 */ 2235 if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) 2236 compute_effective_exclusive_cpumask(trialcs, NULL); 2237 } 2238 2239 /* Nothing to do if the cpus didn't change */ 2240 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) 2241 return 0; 2242 2243 if (alloc_cpumasks(NULL, &tmp)) 2244 return -ENOMEM; 2245 2246 if (old_prs) { 2247 if (is_partition_valid(cs) && 2248 cpumask_empty(trialcs->effective_xcpus)) { 2249 invalidate = true; 2250 cs->prs_err = PERR_INVCPUS; 2251 } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { 2252 invalidate = true; 2253 cs->prs_err = PERR_HKEEPING; 2254 } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { 2255 invalidate = true; 2256 cs->prs_err = PERR_NOCPUS; 2257 } 2258 } 2259 2260 /* 2261 * Check all the descendants in update_cpumasks_hier() if 2262 * effective_xcpus is to be changed. 2263 */ 2264 force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); 2265 2266 retval = validate_change(cs, trialcs); 2267 2268 if ((retval == -EINVAL) && cpuset_v2()) { 2269 struct cgroup_subsys_state *css; 2270 struct cpuset *cp; 2271 2272 /* 2273 * The -EINVAL error code indicates that partition sibling 2274 * CPU exclusivity rule has been violated. We still allow 2275 * the cpumask change to proceed while invalidating the 2276 * partition. However, any conflicting sibling partitions 2277 * have to be marked as invalid too. 
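 *
 * (Illustrative sketch of the front half of this function, not a verbatim
 * quote: a write of "0-3,8" to cpuset.cpus roughly boils down to
 *
 *	cpulist_parse("0-3,8", trialcs->cpus_allowed);
 *	if (!cpumask_subset(trialcs->cpus_allowed, top_cpuset.cpus_allowed))
 *		return -EINVAL;
 *	retval = validate_change(cs, trialcs);
 *
 * parse into a trial cpuset, bound it by the root's mask, then let
 * validate_change() enforce the hierarchy and exclusivity rules.)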
2278 */ 2279 invalidate = true; 2280 rcu_read_lock(); 2281 cpuset_for_each_child(cp, css, parent) { 2282 struct cpumask *xcpus = user_xcpus(trialcs); 2283 2284 if (is_partition_valid(cp) && 2285 cpumask_intersects(xcpus, cp->effective_xcpus)) { 2286 rcu_read_unlock(); 2287 update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp); 2288 rcu_read_lock(); 2289 } 2290 } 2291 rcu_read_unlock(); 2292 retval = 0; 2293 } 2294 2295 if (retval < 0) 2296 goto out_free; 2297 2298 if (is_partition_valid(cs) || 2299 (is_partition_invalid(cs) && !invalidate)) { 2300 struct cpumask *xcpus = trialcs->effective_xcpus; 2301 2302 if (cpumask_empty(xcpus) && is_partition_invalid(cs)) 2303 xcpus = trialcs->cpus_allowed; 2304 2305 /* 2306 * Call remote_cpus_update() to handle valid remote partition 2307 */ 2308 if (is_remote_partition(cs)) 2309 remote_cpus_update(cs, xcpus, &tmp); 2310 else if (invalidate) 2311 update_parent_effective_cpumask(cs, partcmd_invalidate, 2312 NULL, &tmp); 2313 else 2314 update_parent_effective_cpumask(cs, partcmd_update, 2315 xcpus, &tmp); 2316 } else if (!cpumask_empty(cs->exclusive_cpus)) { 2317 /* 2318 * Use trialcs->effective_cpus as a temp cpumask 2319 */ 2320 remote_partition_check(cs, trialcs->effective_xcpus, 2321 trialcs->effective_cpus, &tmp); 2322 } 2323 2324 spin_lock_irq(&callback_lock); 2325 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 2326 cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); 2327 if ((old_prs > 0) && !is_partition_valid(cs)) 2328 reset_partition_data(cs); 2329 spin_unlock_irq(&callback_lock); 2330 2331 /* effective_cpus/effective_xcpus will be updated here */ 2332 update_cpumasks_hier(cs, &tmp, force); 2333 2334 /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ 2335 if (cs->partition_root_state) 2336 update_partition_sd_lb(cs, old_prs); 2337 out_free: 2338 free_cpumasks(NULL, &tmp); 2339 return retval; 2340 } 2341 2342 /** 2343 * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset 2344 * @cs: the cpuset to consider 2345 * @trialcs: trial cpuset 2346 * @buf: buffer of cpu numbers written to this cpuset 2347 * 2348 * The tasks' cpumask will be updated if cs is a valid partition root. 2349 */ 2350 static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, 2351 const char *buf) 2352 { 2353 int retval; 2354 struct tmpmasks tmp; 2355 struct cpuset *parent = parent_cs(cs); 2356 bool invalidate = false; 2357 bool force = false; 2358 int old_prs = cs->partition_root_state; 2359 2360 if (!*buf) { 2361 cpumask_clear(trialcs->exclusive_cpus); 2362 cpumask_clear(trialcs->effective_xcpus); 2363 } else { 2364 retval = cpulist_parse(buf, trialcs->exclusive_cpus); 2365 if (retval < 0) 2366 return retval; 2367 } 2368 2369 /* Nothing to do if the CPUs didn't change */ 2370 if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) 2371 return 0; 2372 2373 if (*buf) 2374 compute_effective_exclusive_cpumask(trialcs, NULL); 2375 2376 /* 2377 * Check all the descendants in update_cpumasks_hier() if 2378 * effective_xcpus is to be changed. 
2379 */ 2380 force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); 2381 2382 retval = validate_change(cs, trialcs); 2383 if (retval) 2384 return retval; 2385 2386 if (alloc_cpumasks(NULL, &tmp)) 2387 return -ENOMEM; 2388 2389 if (old_prs) { 2390 if (cpumask_empty(trialcs->effective_xcpus)) { 2391 invalidate = true; 2392 cs->prs_err = PERR_INVCPUS; 2393 } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { 2394 invalidate = true; 2395 cs->prs_err = PERR_HKEEPING; 2396 } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { 2397 invalidate = true; 2398 cs->prs_err = PERR_NOCPUS; 2399 } 2400 2401 if (is_remote_partition(cs)) { 2402 if (invalidate) 2403 remote_partition_disable(cs, &tmp); 2404 else 2405 remote_cpus_update(cs, trialcs->effective_xcpus, 2406 &tmp); 2407 } else if (invalidate) { 2408 update_parent_effective_cpumask(cs, partcmd_invalidate, 2409 NULL, &tmp); 2410 } else { 2411 update_parent_effective_cpumask(cs, partcmd_update, 2412 trialcs->effective_xcpus, &tmp); 2413 } 2414 } else if (!cpumask_empty(trialcs->exclusive_cpus)) { 2415 /* 2416 * Use trialcs->effective_cpus as a temp cpumask 2417 */ 2418 remote_partition_check(cs, trialcs->effective_xcpus, 2419 trialcs->effective_cpus, &tmp); 2420 } 2421 spin_lock_irq(&callback_lock); 2422 cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); 2423 cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); 2424 if ((old_prs > 0) && !is_partition_valid(cs)) 2425 reset_partition_data(cs); 2426 spin_unlock_irq(&callback_lock); 2427 2428 /* 2429 * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus 2430 * of the subtree when it is a valid partition root or effective_xcpus 2431 * is updated. 2432 */ 2433 if (is_partition_valid(cs) || force) 2434 update_cpumasks_hier(cs, &tmp, force); 2435 2436 /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ 2437 if (cs->partition_root_state) 2438 update_partition_sd_lb(cs, old_prs); 2439 2440 free_cpumasks(NULL, &tmp); 2441 return 0; 2442 } 2443 2444 /* 2445 * Migrate memory region from one set of nodes to another. This is 2446 * performed asynchronously as it can be called from process migration path 2447 * holding locks involved in process management. All mm migrations are 2448 * performed in the queued order and can be waited for by flushing 2449 * cpuset_migrate_mm_wq. 
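 *
 * (Illustrative note, not part of the original comment: callers pass in a
 * reference obtained via get_task_mm(); ownership of that reference moves to
 * the queued work item, which drops it with mmput() after do_migrate_pages(),
 * or immediately when the nodemasks are equal or the allocation of the work
 * item fails. A typical call site, with hypothetical nodemask names, is
 *
 *	mm = get_task_mm(task);
 *	if (mm)
 *		cpuset_migrate_mm(mm, &from_nodes, &to_nodes);
 *
 * with no mmput() needed in the caller on this path.)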
2450 */ 2451 2452 struct cpuset_migrate_mm_work { 2453 struct work_struct work; 2454 struct mm_struct *mm; 2455 nodemask_t from; 2456 nodemask_t to; 2457 }; 2458 2459 static void cpuset_migrate_mm_workfn(struct work_struct *work) 2460 { 2461 struct cpuset_migrate_mm_work *mwork = 2462 container_of(work, struct cpuset_migrate_mm_work, work); 2463 2464 /* on a wq worker, no need to worry about %current's mems_allowed */ 2465 do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); 2466 mmput(mwork->mm); 2467 kfree(mwork); 2468 } 2469 2470 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, 2471 const nodemask_t *to) 2472 { 2473 struct cpuset_migrate_mm_work *mwork; 2474 2475 if (nodes_equal(*from, *to)) { 2476 mmput(mm); 2477 return; 2478 } 2479 2480 mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); 2481 if (mwork) { 2482 mwork->mm = mm; 2483 mwork->from = *from; 2484 mwork->to = *to; 2485 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); 2486 queue_work(cpuset_migrate_mm_wq, &mwork->work); 2487 } else { 2488 mmput(mm); 2489 } 2490 } 2491 2492 static void cpuset_post_attach(void) 2493 { 2494 flush_workqueue(cpuset_migrate_mm_wq); 2495 } 2496 2497 /* 2498 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy 2499 * @tsk: the task to change 2500 * @newmems: new nodes that the task will be set 2501 * 2502 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed 2503 * and rebind an eventual tasks' mempolicy. If the task is allocating in 2504 * parallel, it might temporarily see an empty intersection, which results in 2505 * a seqlock check and retry before OOM or allocation failure. 2506 */ 2507 static void cpuset_change_task_nodemask(struct task_struct *tsk, 2508 nodemask_t *newmems) 2509 { 2510 task_lock(tsk); 2511 2512 local_irq_disable(); 2513 write_seqcount_begin(&tsk->mems_allowed_seq); 2514 2515 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 2516 mpol_rebind_task(tsk, newmems); 2517 tsk->mems_allowed = *newmems; 2518 2519 write_seqcount_end(&tsk->mems_allowed_seq); 2520 local_irq_enable(); 2521 2522 task_unlock(tsk); 2523 } 2524 2525 static void *cpuset_being_rebound; 2526 2527 /** 2528 * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 2529 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 2530 * 2531 * Iterate through each task of @cs updating its mems_allowed to the 2532 * effective cpuset's. As this function is called with cpuset_mutex held, 2533 * cpuset membership stays stable. 2534 */ 2535 void cpuset_update_tasks_nodemask(struct cpuset *cs) 2536 { 2537 static nodemask_t newmems; /* protected by cpuset_mutex */ 2538 struct css_task_iter it; 2539 struct task_struct *task; 2540 2541 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 2542 2543 guarantee_online_mems(cs, &newmems); 2544 2545 /* 2546 * The mpol_rebind_mm() call takes mmap_lock, which we couldn't 2547 * take while holding tasklist_lock. Forks can happen - the 2548 * mpol_dup() cpuset_being_rebound check will catch such forks, 2549 * and rebind their vma mempolicies too. Because we still hold 2550 * the global cpuset_mutex, we know that no other rebind effort 2551 * will be contending for the global variable cpuset_being_rebound. 2552 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 2553 * is idempotent. Also migrate pages in each mm to new nodes. 
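 *
 * (Editorial aside, not in the original comment: the writer-side seqcount
 * update in cpuset_change_task_nodemask() above pairs with allocator-side
 * readers that retry on change, roughly
 *
 *	unsigned int seq;
 *	do {
 *		seq = read_mems_allowed_begin();
 *		...look at current->mems_allowed or the task mempolicy...
 *	} while (read_mems_allowed_retry(seq));
 *
 * and the nodes_or() before the final assignment keeps the mask a superset
 * of both old and new values during the update, so a racing reader retries
 * instead of failing its allocation outright.)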
2554 */ 2555 css_task_iter_start(&cs->css, 0, &it); 2556 while ((task = css_task_iter_next(&it))) { 2557 struct mm_struct *mm; 2558 bool migrate; 2559 2560 cpuset_change_task_nodemask(task, &newmems); 2561 2562 mm = get_task_mm(task); 2563 if (!mm) 2564 continue; 2565 2566 migrate = is_memory_migrate(cs); 2567 2568 mpol_rebind_mm(mm, &cs->mems_allowed); 2569 if (migrate) 2570 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); 2571 else 2572 mmput(mm); 2573 } 2574 css_task_iter_end(&it); 2575 2576 /* 2577 * All the tasks' nodemasks have been updated, update 2578 * cs->old_mems_allowed. 2579 */ 2580 cs->old_mems_allowed = newmems; 2581 2582 /* We're done rebinding vmas to this cpuset's new mems_allowed. */ 2583 cpuset_being_rebound = NULL; 2584 } 2585 2586 /* 2587 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree 2588 * @cs: the cpuset to consider 2589 * @new_mems: a temp variable for calculating new effective_mems 2590 * 2591 * When configured nodemask is changed, the effective nodemasks of this cpuset 2592 * and all its descendants need to be updated. 2593 * 2594 * On legacy hierarchy, effective_mems will be the same with mems_allowed. 2595 * 2596 * Called with cpuset_mutex held 2597 */ 2598 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) 2599 { 2600 struct cpuset *cp; 2601 struct cgroup_subsys_state *pos_css; 2602 2603 rcu_read_lock(); 2604 cpuset_for_each_descendant_pre(cp, pos_css, cs) { 2605 struct cpuset *parent = parent_cs(cp); 2606 2607 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); 2608 2609 /* 2610 * If it becomes empty, inherit the effective mask of the 2611 * parent, which is guaranteed to have some MEMs. 2612 */ 2613 if (is_in_v2_mode() && nodes_empty(*new_mems)) 2614 *new_mems = parent->effective_mems; 2615 2616 /* Skip the whole subtree if the nodemask remains the same. */ 2617 if (nodes_equal(*new_mems, cp->effective_mems)) { 2618 pos_css = css_rightmost_descendant(pos_css); 2619 continue; 2620 } 2621 2622 if (!css_tryget_online(&cp->css)) 2623 continue; 2624 rcu_read_unlock(); 2625 2626 spin_lock_irq(&callback_lock); 2627 cp->effective_mems = *new_mems; 2628 spin_unlock_irq(&callback_lock); 2629 2630 WARN_ON(!is_in_v2_mode() && 2631 !nodes_equal(cp->mems_allowed, cp->effective_mems)); 2632 2633 cpuset_update_tasks_nodemask(cp); 2634 2635 rcu_read_lock(); 2636 css_put(&cp->css); 2637 } 2638 rcu_read_unlock(); 2639 } 2640 2641 /* 2642 * Handle user request to change the 'mems' memory placement 2643 * of a cpuset. Needs to validate the request, update the 2644 * cpusets mems_allowed, and for each task in the cpuset, 2645 * update mems_allowed and rebind task's mempolicy and any vma 2646 * mempolicies and if the cpuset is marked 'memory_migrate', 2647 * migrate the tasks pages to the new memory. 2648 * 2649 * Call with cpuset_mutex held. May take callback_lock during call. 2650 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 2651 * lock each such tasks mm->mmap_lock, scan its vma's and rebind 2652 * their mempolicies to the cpusets new mems_allowed. 2653 */ 2654 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 2655 const char *buf) 2656 { 2657 int retval; 2658 2659 /* 2660 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 2661 * it's read-only 2662 */ 2663 if (cs == &top_cpuset) { 2664 retval = -EACCES; 2665 goto done; 2666 } 2667 2668 /* 2669 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 
2670 * Since nodelist_parse() fails on an empty mask, we special case 2671 * that parsing. The validate_change() call ensures that cpusets 2672 * with tasks have memory. 2673 */ 2674 if (!*buf) { 2675 nodes_clear(trialcs->mems_allowed); 2676 } else { 2677 retval = nodelist_parse(buf, trialcs->mems_allowed); 2678 if (retval < 0) 2679 goto done; 2680 2681 if (!nodes_subset(trialcs->mems_allowed, 2682 top_cpuset.mems_allowed)) { 2683 retval = -EINVAL; 2684 goto done; 2685 } 2686 } 2687 2688 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { 2689 retval = 0; /* Too easy - nothing to do */ 2690 goto done; 2691 } 2692 retval = validate_change(cs, trialcs); 2693 if (retval < 0) 2694 goto done; 2695 2696 check_insane_mems_config(&trialcs->mems_allowed); 2697 2698 spin_lock_irq(&callback_lock); 2699 cs->mems_allowed = trialcs->mems_allowed; 2700 spin_unlock_irq(&callback_lock); 2701 2702 /* use trialcs->mems_allowed as a temp variable */ 2703 update_nodemasks_hier(cs, &trialcs->mems_allowed); 2704 done: 2705 return retval; 2706 } 2707 2708 bool current_cpuset_is_being_rebound(void) 2709 { 2710 bool ret; 2711 2712 rcu_read_lock(); 2713 ret = task_cs(current) == cpuset_being_rebound; 2714 rcu_read_unlock(); 2715 2716 return ret; 2717 } 2718 2719 /* 2720 * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag 2721 * bit: the bit to update (see cpuset_flagbits_t) 2722 * cs: the cpuset to update 2723 * turning_on: whether the flag is being set or cleared 2724 * 2725 * Call with cpuset_mutex held. 2726 */ 2727 2728 int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 2729 int turning_on) 2730 { 2731 struct cpuset *trialcs; 2732 int balance_flag_changed; 2733 int spread_flag_changed; 2734 int err; 2735 2736 trialcs = alloc_trial_cpuset(cs); 2737 if (!trialcs) 2738 return -ENOMEM; 2739 2740 if (turning_on) 2741 set_bit(bit, &trialcs->flags); 2742 else 2743 clear_bit(bit, &trialcs->flags); 2744 2745 err = validate_change(cs, trialcs); 2746 if (err < 0) 2747 goto out; 2748 2749 balance_flag_changed = (is_sched_load_balance(cs) != 2750 is_sched_load_balance(trialcs)); 2751 2752 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) 2753 || (is_spread_page(cs) != is_spread_page(trialcs))); 2754 2755 spin_lock_irq(&callback_lock); 2756 cs->flags = trialcs->flags; 2757 spin_unlock_irq(&callback_lock); 2758 2759 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) { 2760 if (cpuset_v2()) 2761 cpuset_force_rebuild(); 2762 else 2763 rebuild_sched_domains_locked(); 2764 } 2765 2766 if (spread_flag_changed) 2767 cpuset1_update_tasks_flags(cs); 2768 out: 2769 free_cpuset(trialcs); 2770 return err; 2771 } 2772 2773 /** 2774 * update_prstate - update partition_root_state 2775 * @cs: the cpuset to update 2776 * @new_prs: new partition root state 2777 * Return: 0 if successful, != 0 if error 2778 * 2779 * Call with cpuset_mutex held. 2780 */ 2781 static int update_prstate(struct cpuset *cs, int new_prs) 2782 { 2783 int err = PERR_NONE, old_prs = cs->partition_root_state; 2784 struct cpuset *parent = parent_cs(cs); 2785 struct tmpmasks tmpmask; 2786 bool new_xcpus_state = false; 2787 2788 if (old_prs == new_prs) 2789 return 0; 2790 2791 /* 2792 * Treat a previously invalid partition root as if it is a "member". 
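 *
 * (Editorial aside, not part of the original comment: cpuset_update_flag()
 * above shows the "trial cpuset" pattern that the cpus/mems/flags update
 * paths in this file share -
 *
 *	trialcs = alloc_trial_cpuset(cs);	(scratch copy of cs)
 *	...apply the requested change to trialcs...
 *	err = validate_change(cs, trialcs);	(check hierarchy rules)
 *	if (!err) {
 *		spin_lock_irq(&callback_lock);
 *		...copy the validated fields back into cs...
 *		spin_unlock_irq(&callback_lock);
 *	}
 *	free_cpuset(trialcs);
 *
 * so that readers under callback_lock only ever observe fully validated
 * state.)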
*/ 2794 if (new_prs && is_prs_invalid(old_prs)) 2795 old_prs = PRS_MEMBER; 2796 2797 if (alloc_cpumasks(NULL, &tmpmask)) 2798 return -ENOMEM; 2799 2800 /* 2801 * Set up effective_xcpus if not properly set yet; it will be cleared 2802 * later if the partition becomes invalid. 2803 */ 2804 if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) { 2805 spin_lock_irq(&callback_lock); 2806 cpumask_and(cs->effective_xcpus, 2807 cs->cpus_allowed, parent->effective_xcpus); 2808 spin_unlock_irq(&callback_lock); 2809 } 2810 2811 err = update_partition_exclusive(cs, new_prs); 2812 if (err) 2813 goto out; 2814 2815 if (!old_prs) { 2816 /* 2817 * cpus_allowed and exclusive_cpus cannot both be empty. 2818 */ 2819 if (xcpus_empty(cs)) { 2820 err = PERR_CPUSEMPTY; 2821 goto out; 2822 } 2823 2824 /* 2825 * If parent is a valid partition, enable a local partition. 2826 * Otherwise, enable a remote partition. 2827 */ 2828 if (is_partition_valid(parent)) { 2829 enum partition_cmd cmd = (new_prs == PRS_ROOT) 2830 ? partcmd_enable : partcmd_enablei; 2831 2832 err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); 2833 } else { 2834 err = remote_partition_enable(cs, new_prs, &tmpmask); 2835 } 2836 } else if (old_prs && new_prs) { 2837 /* 2838 * A change in load balance state only, no change in cpumasks. 2839 */ 2840 new_xcpus_state = true; 2841 } else { 2842 /* 2843 * Switching back to member is always allowed even if it 2844 * disables child partitions. 2845 */ 2846 if (is_remote_partition(cs)) 2847 remote_partition_disable(cs, &tmpmask); 2848 else 2849 update_parent_effective_cpumask(cs, partcmd_disable, 2850 NULL, &tmpmask); 2851 2852 /* 2853 * Invalidation of child partitions will be done in 2854 * update_cpumasks_hier(). 2855 */ 2856 } 2857 out: 2858 /* 2859 * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error 2860 * happens. 2861 */ 2862 if (err) { 2863 new_prs = -new_prs; 2864 update_partition_exclusive(cs, new_prs); 2865 } 2866 2867 spin_lock_irq(&callback_lock); 2868 cs->partition_root_state = new_prs; 2869 WRITE_ONCE(cs->prs_err, err); 2870 if (!is_partition_valid(cs)) 2871 reset_partition_data(cs); 2872 else if (new_xcpus_state) 2873 partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus); 2874 spin_unlock_irq(&callback_lock); 2875 update_unbound_workqueue_cpumask(new_xcpus_state); 2876 2877 /* Force update if switching back to member */ 2878 update_cpumasks_hier(cs, &tmpmask, !new_prs); 2879 2880 /* Update sched domains and load balance flag */ 2881 update_partition_sd_lb(cs, old_prs); 2882 2883 notify_partition_change(cs, old_prs); 2884 if (force_sd_rebuild) 2885 rebuild_sched_domains_locked(); 2886 free_cpumasks(NULL, &tmpmask); 2887 return 0; 2888 } 2889 2890 static struct cpuset *cpuset_attach_old_cs; 2891 2892 /* 2893 * Check to see if a cpuset can accept a new task. 2894 * For v1, cpus_allowed and mems_allowed can't be empty. 2895 * For v2, effective_cpus can't be empty. 2896 * Note that in v1, effective_cpus = cpus_allowed.
2897 */ 2898 static int cpuset_can_attach_check(struct cpuset *cs) 2899 { 2900 if (cpumask_empty(cs->effective_cpus) || 2901 (!is_in_v2_mode() && nodes_empty(cs->mems_allowed))) 2902 return -ENOSPC; 2903 return 0; 2904 } 2905 2906 static void reset_migrate_dl_data(struct cpuset *cs) 2907 { 2908 cs->nr_migrate_dl_tasks = 0; 2909 cs->sum_migrate_dl_bw = 0; 2910 } 2911 2912 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 2913 static int cpuset_can_attach(struct cgroup_taskset *tset) 2914 { 2915 struct cgroup_subsys_state *css; 2916 struct cpuset *cs, *oldcs; 2917 struct task_struct *task; 2918 bool cpus_updated, mems_updated; 2919 int ret; 2920 2921 /* used later by cpuset_attach() */ 2922 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); 2923 oldcs = cpuset_attach_old_cs; 2924 cs = css_cs(css); 2925 2926 mutex_lock(&cpuset_mutex); 2927 2928 /* Check to see if task is allowed in the cpuset */ 2929 ret = cpuset_can_attach_check(cs); 2930 if (ret) 2931 goto out_unlock; 2932 2933 cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); 2934 mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); 2935 2936 cgroup_taskset_for_each(task, css, tset) { 2937 ret = task_can_attach(task); 2938 if (ret) 2939 goto out_unlock; 2940 2941 /* 2942 * Skip rights over task check in v2 when nothing changes, 2943 * migration permission derives from hierarchy ownership in 2944 * cgroup_procs_write_permission()). 2945 */ 2946 if (!cpuset_v2() || (cpus_updated || mems_updated)) { 2947 ret = security_task_setscheduler(task); 2948 if (ret) 2949 goto out_unlock; 2950 } 2951 2952 if (dl_task(task)) { 2953 cs->nr_migrate_dl_tasks++; 2954 cs->sum_migrate_dl_bw += task->dl.dl_bw; 2955 } 2956 } 2957 2958 if (!cs->nr_migrate_dl_tasks) 2959 goto out_success; 2960 2961 if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { 2962 int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); 2963 2964 if (unlikely(cpu >= nr_cpu_ids)) { 2965 reset_migrate_dl_data(cs); 2966 ret = -EINVAL; 2967 goto out_unlock; 2968 } 2969 2970 ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); 2971 if (ret) { 2972 reset_migrate_dl_data(cs); 2973 goto out_unlock; 2974 } 2975 } 2976 2977 out_success: 2978 /* 2979 * Mark attach is in progress. This makes validate_change() fail 2980 * changes which zero cpus/mems_allowed. 2981 */ 2982 cs->attach_in_progress++; 2983 out_unlock: 2984 mutex_unlock(&cpuset_mutex); 2985 return ret; 2986 } 2987 2988 static void cpuset_cancel_attach(struct cgroup_taskset *tset) 2989 { 2990 struct cgroup_subsys_state *css; 2991 struct cpuset *cs; 2992 2993 cgroup_taskset_first(tset, &css); 2994 cs = css_cs(css); 2995 2996 mutex_lock(&cpuset_mutex); 2997 dec_attach_in_progress_locked(cs); 2998 2999 if (cs->nr_migrate_dl_tasks) { 3000 int cpu = cpumask_any(cs->effective_cpus); 3001 3002 dl_bw_free(cpu, cs->sum_migrate_dl_bw); 3003 reset_migrate_dl_data(cs); 3004 } 3005 3006 mutex_unlock(&cpuset_mutex); 3007 } 3008 3009 /* 3010 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task() 3011 * but we can't allocate it dynamically there. Define it global and 3012 * allocate from cpuset_init(). 
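 *
 * (Editorial aside, not part of the original comment: the deadline-bandwidth
 * handling in cpuset_can_attach()/cpuset_cancel_attach() above, completed in
 * cpuset_attach() below, follows the usual three-step migration protocol -
 *
 *	can_attach:	sum the dl_bw of the migrating deadline tasks and
 *			reserve it with dl_bw_alloc() on a destination CPU
 *	cancel_attach:	return the reservation with dl_bw_free()
 *	attach:		transfer nr_deadline_tasks counters from the old
 *			cpuset to the new one
 *
 * so the reservation only sticks when the migration actually completes.)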
3013 */ 3014 static cpumask_var_t cpus_attach; 3015 static nodemask_t cpuset_attach_nodemask_to; 3016 3017 static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) 3018 { 3019 lockdep_assert_held(&cpuset_mutex); 3020 3021 if (cs != &top_cpuset) 3022 guarantee_online_cpus(task, cpus_attach); 3023 else 3024 cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), 3025 subpartitions_cpus); 3026 /* 3027 * can_attach beforehand should guarantee that this doesn't 3028 * fail. TODO: have a better way to handle failure here 3029 */ 3030 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); 3031 3032 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); 3033 cpuset1_update_task_spread_flags(cs, task); 3034 } 3035 3036 static void cpuset_attach(struct cgroup_taskset *tset) 3037 { 3038 struct task_struct *task; 3039 struct task_struct *leader; 3040 struct cgroup_subsys_state *css; 3041 struct cpuset *cs; 3042 struct cpuset *oldcs = cpuset_attach_old_cs; 3043 bool cpus_updated, mems_updated; 3044 3045 cgroup_taskset_first(tset, &css); 3046 cs = css_cs(css); 3047 3048 lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ 3049 mutex_lock(&cpuset_mutex); 3050 cpus_updated = !cpumask_equal(cs->effective_cpus, 3051 oldcs->effective_cpus); 3052 mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); 3053 3054 /* 3055 * In the default hierarchy, enabling cpuset in the child cgroups 3056 * will trigger a number of cpuset_attach() calls with no change 3057 * in effective cpus and mems. In that case, we can optimize out 3058 * by skipping the task iteration and update. 3059 */ 3060 if (cpuset_v2() && !cpus_updated && !mems_updated) { 3061 cpuset_attach_nodemask_to = cs->effective_mems; 3062 goto out; 3063 } 3064 3065 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 3066 3067 cgroup_taskset_for_each(task, css, tset) 3068 cpuset_attach_task(cs, task); 3069 3070 /* 3071 * Change mm for all threadgroup leaders. This is expensive and may 3072 * sleep and should be moved outside migration path proper. Skip it 3073 * if there is no change in effective_mems and CS_MEMORY_MIGRATE is 3074 * not set. 3075 */ 3076 cpuset_attach_nodemask_to = cs->effective_mems; 3077 if (!is_memory_migrate(cs) && !mems_updated) 3078 goto out; 3079 3080 cgroup_taskset_for_each_leader(leader, css, tset) { 3081 struct mm_struct *mm = get_task_mm(leader); 3082 3083 if (mm) { 3084 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 3085 3086 /* 3087 * old_mems_allowed is the same with mems_allowed 3088 * here, except if this task is being moved 3089 * automatically due to hotplug. In that case 3090 * @mems_allowed has been updated and is empty, so 3091 * @old_mems_allowed is the right nodesets that we 3092 * migrate mm from. 3093 */ 3094 if (is_memory_migrate(cs)) 3095 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, 3096 &cpuset_attach_nodemask_to); 3097 else 3098 mmput(mm); 3099 } 3100 } 3101 3102 out: 3103 cs->old_mems_allowed = cpuset_attach_nodemask_to; 3104 3105 if (cs->nr_migrate_dl_tasks) { 3106 cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; 3107 oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; 3108 reset_migrate_dl_data(cs); 3109 } 3110 3111 dec_attach_in_progress_locked(cs); 3112 3113 mutex_unlock(&cpuset_mutex); 3114 } 3115 3116 /* 3117 * Common handling for a write to a "cpus" or "mems" file. 
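 *
 * (Illustrative summary, not part of the original comment: each such write
 * funnels through the same sequence -
 *
 *	cpus_read_lock();
 *	mutex_lock(&cpuset_mutex);
 *	trialcs = alloc_trial_cpuset(cs);
 *	update_cpumask()/update_exclusive_cpumask()/update_nodemask();
 *	if (force_sd_rebuild)
 *		rebuild_sched_domains_locked();
 *	mutex_unlock(&cpuset_mutex);
 *	cpus_read_unlock();
 *	flush_workqueue(cpuset_migrate_mm_wq);	(wait for queued mm migrations)
 *
 * as implemented by cpuset_write_resmask() just below.)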
3118 */ 3119 ssize_t cpuset_write_resmask(struct kernfs_open_file *of, 3120 char *buf, size_t nbytes, loff_t off) 3121 { 3122 struct cpuset *cs = css_cs(of_css(of)); 3123 struct cpuset *trialcs; 3124 int retval = -ENODEV; 3125 3126 buf = strstrip(buf); 3127 cpus_read_lock(); 3128 mutex_lock(&cpuset_mutex); 3129 if (!is_cpuset_online(cs)) 3130 goto out_unlock; 3131 3132 trialcs = alloc_trial_cpuset(cs); 3133 if (!trialcs) { 3134 retval = -ENOMEM; 3135 goto out_unlock; 3136 } 3137 3138 switch (of_cft(of)->private) { 3139 case FILE_CPULIST: 3140 retval = update_cpumask(cs, trialcs, buf); 3141 break; 3142 case FILE_EXCLUSIVE_CPULIST: 3143 retval = update_exclusive_cpumask(cs, trialcs, buf); 3144 break; 3145 case FILE_MEMLIST: 3146 retval = update_nodemask(cs, trialcs, buf); 3147 break; 3148 default: 3149 retval = -EINVAL; 3150 break; 3151 } 3152 3153 free_cpuset(trialcs); 3154 if (force_sd_rebuild) 3155 rebuild_sched_domains_locked(); 3156 out_unlock: 3157 mutex_unlock(&cpuset_mutex); 3158 cpus_read_unlock(); 3159 flush_workqueue(cpuset_migrate_mm_wq); 3160 return retval ?: nbytes; 3161 } 3162 3163 /* 3164 * These ascii lists should be read in a single call, by using a user 3165 * buffer large enough to hold the entire map. If read in smaller 3166 * chunks, there is no guarantee of atomicity. Since the display format 3167 * used, list of ranges of sequential numbers, is variable length, 3168 * and since these maps can change value dynamically, one could read 3169 * gibberish by doing partial reads while a list was changing. 3170 */ 3171 int cpuset_common_seq_show(struct seq_file *sf, void *v) 3172 { 3173 struct cpuset *cs = css_cs(seq_css(sf)); 3174 cpuset_filetype_t type = seq_cft(sf)->private; 3175 int ret = 0; 3176 3177 spin_lock_irq(&callback_lock); 3178 3179 switch (type) { 3180 case FILE_CPULIST: 3181 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); 3182 break; 3183 case FILE_MEMLIST: 3184 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); 3185 break; 3186 case FILE_EFFECTIVE_CPULIST: 3187 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus)); 3188 break; 3189 case FILE_EFFECTIVE_MEMLIST: 3190 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); 3191 break; 3192 case FILE_EXCLUSIVE_CPULIST: 3193 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus)); 3194 break; 3195 case FILE_EFFECTIVE_XCPULIST: 3196 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus)); 3197 break; 3198 case FILE_SUBPARTS_CPULIST: 3199 seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus)); 3200 break; 3201 case FILE_ISOLATED_CPULIST: 3202 seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus)); 3203 break; 3204 default: 3205 ret = -EINVAL; 3206 } 3207 3208 spin_unlock_irq(&callback_lock); 3209 return ret; 3210 } 3211 3212 static int sched_partition_show(struct seq_file *seq, void *v) 3213 { 3214 struct cpuset *cs = css_cs(seq_css(seq)); 3215 const char *err, *type = NULL; 3216 3217 switch (cs->partition_root_state) { 3218 case PRS_ROOT: 3219 seq_puts(seq, "root\n"); 3220 break; 3221 case PRS_ISOLATED: 3222 seq_puts(seq, "isolated\n"); 3223 break; 3224 case PRS_MEMBER: 3225 seq_puts(seq, "member\n"); 3226 break; 3227 case PRS_INVALID_ROOT: 3228 type = "root"; 3229 fallthrough; 3230 case PRS_INVALID_ISOLATED: 3231 if (!type) 3232 type = "isolated"; 3233 err = perr_strings[READ_ONCE(cs->prs_err)]; 3234 if (err) 3235 seq_printf(seq, "%s invalid (%s)\n", type, err); 3236 else 3237 seq_printf(seq, "%s invalid\n", type); 3238 break; 3239 } 3240 
return 0; 3241 } 3242 3243 static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, 3244 size_t nbytes, loff_t off) 3245 { 3246 struct cpuset *cs = css_cs(of_css(of)); 3247 int val; 3248 int retval = -ENODEV; 3249 3250 buf = strstrip(buf); 3251 3252 if (!strcmp(buf, "root")) 3253 val = PRS_ROOT; 3254 else if (!strcmp(buf, "member")) 3255 val = PRS_MEMBER; 3256 else if (!strcmp(buf, "isolated")) 3257 val = PRS_ISOLATED; 3258 else 3259 return -EINVAL; 3260 3261 css_get(&cs->css); 3262 cpus_read_lock(); 3263 mutex_lock(&cpuset_mutex); 3264 if (!is_cpuset_online(cs)) 3265 goto out_unlock; 3266 3267 retval = update_prstate(cs, val); 3268 out_unlock: 3269 mutex_unlock(&cpuset_mutex); 3270 cpus_read_unlock(); 3271 css_put(&cs->css); 3272 return retval ?: nbytes; 3273 } 3274 3275 /* 3276 * This is currently a minimal set for the default hierarchy. It can be 3277 * expanded later on by migrating more features and control files from v1. 3278 */ 3279 static struct cftype dfl_files[] = { 3280 { 3281 .name = "cpus", 3282 .seq_show = cpuset_common_seq_show, 3283 .write = cpuset_write_resmask, 3284 .max_write_len = (100U + 6 * NR_CPUS), 3285 .private = FILE_CPULIST, 3286 .flags = CFTYPE_NOT_ON_ROOT, 3287 }, 3288 3289 { 3290 .name = "mems", 3291 .seq_show = cpuset_common_seq_show, 3292 .write = cpuset_write_resmask, 3293 .max_write_len = (100U + 6 * MAX_NUMNODES), 3294 .private = FILE_MEMLIST, 3295 .flags = CFTYPE_NOT_ON_ROOT, 3296 }, 3297 3298 { 3299 .name = "cpus.effective", 3300 .seq_show = cpuset_common_seq_show, 3301 .private = FILE_EFFECTIVE_CPULIST, 3302 }, 3303 3304 { 3305 .name = "mems.effective", 3306 .seq_show = cpuset_common_seq_show, 3307 .private = FILE_EFFECTIVE_MEMLIST, 3308 }, 3309 3310 { 3311 .name = "cpus.partition", 3312 .seq_show = sched_partition_show, 3313 .write = sched_partition_write, 3314 .private = FILE_PARTITION_ROOT, 3315 .flags = CFTYPE_NOT_ON_ROOT, 3316 .file_offset = offsetof(struct cpuset, partition_file), 3317 }, 3318 3319 { 3320 .name = "cpus.exclusive", 3321 .seq_show = cpuset_common_seq_show, 3322 .write = cpuset_write_resmask, 3323 .max_write_len = (100U + 6 * NR_CPUS), 3324 .private = FILE_EXCLUSIVE_CPULIST, 3325 .flags = CFTYPE_NOT_ON_ROOT, 3326 }, 3327 3328 { 3329 .name = "cpus.exclusive.effective", 3330 .seq_show = cpuset_common_seq_show, 3331 .private = FILE_EFFECTIVE_XCPULIST, 3332 .flags = CFTYPE_NOT_ON_ROOT, 3333 }, 3334 3335 { 3336 .name = "cpus.subpartitions", 3337 .seq_show = cpuset_common_seq_show, 3338 .private = FILE_SUBPARTS_CPULIST, 3339 .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, 3340 }, 3341 3342 { 3343 .name = "cpus.isolated", 3344 .seq_show = cpuset_common_seq_show, 3345 .private = FILE_ISOLATED_CPULIST, 3346 .flags = CFTYPE_ONLY_ON_ROOT, 3347 }, 3348 3349 { } /* terminate */ 3350 }; 3351 3352 3353 /** 3354 * cpuset_css_alloc - Allocate a cpuset css 3355 * @parent_css: Parent css of the control group that the new cpuset will be 3356 * part of 3357 * Return: cpuset css on success, -ENOMEM on failure. 3358 * 3359 * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return 3360 * top cpuset css otherwise. 
3361 */ 3362 static struct cgroup_subsys_state * 3363 cpuset_css_alloc(struct cgroup_subsys_state *parent_css) 3364 { 3365 struct cpuset *cs; 3366 3367 if (!parent_css) 3368 return &top_cpuset.css; 3369 3370 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 3371 if (!cs) 3372 return ERR_PTR(-ENOMEM); 3373 3374 if (alloc_cpumasks(cs, NULL)) { 3375 kfree(cs); 3376 return ERR_PTR(-ENOMEM); 3377 } 3378 3379 __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 3380 fmeter_init(&cs->fmeter); 3381 cs->relax_domain_level = -1; 3382 INIT_LIST_HEAD(&cs->remote_sibling); 3383 3384 /* Set CS_MEMORY_MIGRATE for default hierarchy */ 3385 if (cpuset_v2()) 3386 __set_bit(CS_MEMORY_MIGRATE, &cs->flags); 3387 3388 return &cs->css; 3389 } 3390 3391 static int cpuset_css_online(struct cgroup_subsys_state *css) 3392 { 3393 struct cpuset *cs = css_cs(css); 3394 struct cpuset *parent = parent_cs(cs); 3395 struct cpuset *tmp_cs; 3396 struct cgroup_subsys_state *pos_css; 3397 3398 if (!parent) 3399 return 0; 3400 3401 cpus_read_lock(); 3402 mutex_lock(&cpuset_mutex); 3403 3404 set_bit(CS_ONLINE, &cs->flags); 3405 if (is_spread_page(parent)) 3406 set_bit(CS_SPREAD_PAGE, &cs->flags); 3407 if (is_spread_slab(parent)) 3408 set_bit(CS_SPREAD_SLAB, &cs->flags); 3409 /* 3410 * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated 3411 */ 3412 if (cpuset_v2() && !is_sched_load_balance(parent)) 3413 clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 3414 3415 cpuset_inc(); 3416 3417 spin_lock_irq(&callback_lock); 3418 if (is_in_v2_mode()) { 3419 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 3420 cs->effective_mems = parent->effective_mems; 3421 } 3422 spin_unlock_irq(&callback_lock); 3423 3424 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 3425 goto out_unlock; 3426 3427 /* 3428 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is 3429 * set. This flag handling is implemented in cgroup core for 3430 * historical reasons - the flag may be specified during mount. 3431 * 3432 * Currently, if any sibling cpusets have exclusive cpus or mem, we 3433 * refuse to clone the configuration - thereby refusing the task to 3434 * be entered, and as a result refusing the sys_unshare() or 3435 * clone() which initiated it. If this becomes a problem for some 3436 * users who wish to allow that scenario, then this could be 3437 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive 3438 * (and likewise for mems) to the new cgroup. 3439 */ 3440 rcu_read_lock(); 3441 cpuset_for_each_child(tmp_cs, pos_css, parent) { 3442 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { 3443 rcu_read_unlock(); 3444 goto out_unlock; 3445 } 3446 } 3447 rcu_read_unlock(); 3448 3449 spin_lock_irq(&callback_lock); 3450 cs->mems_allowed = parent->mems_allowed; 3451 cs->effective_mems = parent->mems_allowed; 3452 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); 3453 cpumask_copy(cs->effective_cpus, parent->cpus_allowed); 3454 spin_unlock_irq(&callback_lock); 3455 out_unlock: 3456 mutex_unlock(&cpuset_mutex); 3457 cpus_read_unlock(); 3458 return 0; 3459 } 3460 3461 /* 3462 * If the cpuset being removed has its flag 'sched_load_balance' 3463 * enabled, then simulate turning sched_load_balance off, which 3464 * will call rebuild_sched_domains_locked(). That is not needed 3465 * in the default hierarchy where only changes in partition 3466 * will cause repartitioning. 3467 * 3468 * If the cpuset has the 'sched.partition' flag enabled, simulate 3469 * turning 'sched.partition" off. 
3470 */ 3471 3472 static void cpuset_css_offline(struct cgroup_subsys_state *css) 3473 { 3474 struct cpuset *cs = css_cs(css); 3475 3476 cpus_read_lock(); 3477 mutex_lock(&cpuset_mutex); 3478 3479 if (is_partition_valid(cs)) 3480 update_prstate(cs, 0); 3481 3482 if (!cpuset_v2() && is_sched_load_balance(cs)) 3483 cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 3484 3485 cpuset_dec(); 3486 clear_bit(CS_ONLINE, &cs->flags); 3487 3488 mutex_unlock(&cpuset_mutex); 3489 cpus_read_unlock(); 3490 } 3491 3492 static void cpuset_css_free(struct cgroup_subsys_state *css) 3493 { 3494 struct cpuset *cs = css_cs(css); 3495 3496 free_cpuset(cs); 3497 } 3498 3499 static void cpuset_bind(struct cgroup_subsys_state *root_css) 3500 { 3501 mutex_lock(&cpuset_mutex); 3502 spin_lock_irq(&callback_lock); 3503 3504 if (is_in_v2_mode()) { 3505 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); 3506 cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask); 3507 top_cpuset.mems_allowed = node_possible_map; 3508 } else { 3509 cpumask_copy(top_cpuset.cpus_allowed, 3510 top_cpuset.effective_cpus); 3511 top_cpuset.mems_allowed = top_cpuset.effective_mems; 3512 } 3513 3514 spin_unlock_irq(&callback_lock); 3515 mutex_unlock(&cpuset_mutex); 3516 } 3517 3518 /* 3519 * In case the child is cloned into a cpuset different from its parent, 3520 * additional checks are done to see if the move is allowed. 3521 */ 3522 static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) 3523 { 3524 struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); 3525 bool same_cs; 3526 int ret; 3527 3528 rcu_read_lock(); 3529 same_cs = (cs == task_cs(current)); 3530 rcu_read_unlock(); 3531 3532 if (same_cs) 3533 return 0; 3534 3535 lockdep_assert_held(&cgroup_mutex); 3536 mutex_lock(&cpuset_mutex); 3537 3538 /* Check to see if task is allowed in the cpuset */ 3539 ret = cpuset_can_attach_check(cs); 3540 if (ret) 3541 goto out_unlock; 3542 3543 ret = task_can_attach(task); 3544 if (ret) 3545 goto out_unlock; 3546 3547 ret = security_task_setscheduler(task); 3548 if (ret) 3549 goto out_unlock; 3550 3551 /* 3552 * Mark attach is in progress. This makes validate_change() fail 3553 * changes which zero cpus/mems_allowed. 3554 */ 3555 cs->attach_in_progress++; 3556 out_unlock: 3557 mutex_unlock(&cpuset_mutex); 3558 return ret; 3559 } 3560 3561 static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) 3562 { 3563 struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); 3564 bool same_cs; 3565 3566 rcu_read_lock(); 3567 same_cs = (cs == task_cs(current)); 3568 rcu_read_unlock(); 3569 3570 if (same_cs) 3571 return; 3572 3573 dec_attach_in_progress(cs); 3574 } 3575 3576 /* 3577 * Make sure the new task conform to the current state of its parent, 3578 * which could have been changed by cpuset just after it inherits the 3579 * state from the parent and before it sits on the cgroup's task list. 
3580 */ 3581 static void cpuset_fork(struct task_struct *task) 3582 { 3583 struct cpuset *cs; 3584 bool same_cs; 3585 3586 rcu_read_lock(); 3587 cs = task_cs(task); 3588 same_cs = (cs == task_cs(current)); 3589 rcu_read_unlock(); 3590 3591 if (same_cs) { 3592 if (cs == &top_cpuset) 3593 return; 3594 3595 set_cpus_allowed_ptr(task, current->cpus_ptr); 3596 task->mems_allowed = current->mems_allowed; 3597 return; 3598 } 3599 3600 /* CLONE_INTO_CGROUP */ 3601 mutex_lock(&cpuset_mutex); 3602 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 3603 cpuset_attach_task(cs, task); 3604 3605 dec_attach_in_progress_locked(cs); 3606 mutex_unlock(&cpuset_mutex); 3607 } 3608 3609 struct cgroup_subsys cpuset_cgrp_subsys = { 3610 .css_alloc = cpuset_css_alloc, 3611 .css_online = cpuset_css_online, 3612 .css_offline = cpuset_css_offline, 3613 .css_free = cpuset_css_free, 3614 .can_attach = cpuset_can_attach, 3615 .cancel_attach = cpuset_cancel_attach, 3616 .attach = cpuset_attach, 3617 .post_attach = cpuset_post_attach, 3618 .bind = cpuset_bind, 3619 .can_fork = cpuset_can_fork, 3620 .cancel_fork = cpuset_cancel_fork, 3621 .fork = cpuset_fork, 3622 #ifdef CONFIG_CPUSETS_V1 3623 .legacy_cftypes = cpuset1_files, 3624 #endif 3625 .dfl_cftypes = dfl_files, 3626 .early_init = true, 3627 .threaded = true, 3628 }; 3629 3630 /** 3631 * cpuset_init - initialize cpusets at system boot 3632 * 3633 * Description: Initialize top_cpuset 3634 **/ 3635 3636 int __init cpuset_init(void) 3637 { 3638 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); 3639 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); 3640 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); 3641 BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); 3642 BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); 3643 BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); 3644 3645 cpumask_setall(top_cpuset.cpus_allowed); 3646 nodes_setall(top_cpuset.mems_allowed); 3647 cpumask_setall(top_cpuset.effective_cpus); 3648 cpumask_setall(top_cpuset.effective_xcpus); 3649 cpumask_setall(top_cpuset.exclusive_cpus); 3650 nodes_setall(top_cpuset.effective_mems); 3651 3652 fmeter_init(&top_cpuset.fmeter); 3653 INIT_LIST_HEAD(&remote_children); 3654 3655 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); 3656 3657 have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN); 3658 if (have_boot_isolcpus) { 3659 BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL)); 3660 cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); 3661 cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus); 3662 } 3663 3664 return 0; 3665 } 3666 3667 static void 3668 hotplug_update_tasks(struct cpuset *cs, 3669 struct cpumask *new_cpus, nodemask_t *new_mems, 3670 bool cpus_updated, bool mems_updated) 3671 { 3672 /* A partition root is allowed to have empty effective cpus */ 3673 if (cpumask_empty(new_cpus) && !is_partition_valid(cs)) 3674 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); 3675 if (nodes_empty(*new_mems)) 3676 *new_mems = parent_cs(cs)->effective_mems; 3677 3678 spin_lock_irq(&callback_lock); 3679 cpumask_copy(cs->effective_cpus, new_cpus); 3680 cs->effective_mems = *new_mems; 3681 spin_unlock_irq(&callback_lock); 3682 3683 if (cpus_updated) 3684 cpuset_update_tasks_cpumask(cs, new_cpus); 3685 if (mems_updated) 3686 cpuset_update_tasks_nodemask(cs); 3687 } 3688 3689 void cpuset_force_rebuild(void) 3690 { 3691 force_sd_rebuild = true; 3692 } 3693 3694 /** 3695 * 
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hot-unplug
 * @cs: cpuset of interest
 * @tmp: the tmpmasks structure pointer
 *
 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
 * all its tasks are moved to the nearest ancestor with both resources.
 */
static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated;
	bool mems_updated;
	bool remote;
	int partcmd = -1;
	struct cpuset *parent;
retry:
	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

	mutex_lock(&cpuset_mutex);

	/*
	 * We have raced with task attaching. We wait until attaching
	 * is finished, so we won't attach a task to an empty cpuset.
	 */
	if (cs->attach_in_progress) {
		mutex_unlock(&cpuset_mutex);
		goto retry;
	}

	parent = parent_cs(cs);
	compute_effective_cpumask(&new_cpus, cs, parent);
	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);

	if (!tmp || !cs->partition_root_state)
		goto update_tasks;

	/*
	 * Compute effective_cpus for valid partition root, may invalidate
	 * child partition roots if necessary.
	 */
	remote = is_remote_partition(cs);
	if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
		compute_partition_effective_cpumask(cs, &new_cpus);

	if (remote && cpumask_empty(&new_cpus) &&
	    partition_is_populated(cs, NULL)) {
		remote_partition_disable(cs, tmp);
		compute_effective_cpumask(&new_cpus, cs, parent);
		remote = false;
		cpuset_force_rebuild();
	}

	/*
	 * Force the partition to become invalid if either one of
	 * the following conditions hold:
	 * 1) empty effective cpus but not valid empty partition.
	 * 2) parent is invalid or doesn't grant any cpus to child
	 *    partitions.
	 */
	if (is_local_partition(cs) && (!is_partition_valid(parent) ||
				tasks_nocpu_error(parent, cs, &new_cpus)))
		partcmd = partcmd_invalidate;
	/*
	 * On the other hand, an invalid partition root may be transitioned
	 * back to a regular one.
	 */
	else if (is_partition_valid(parent) && is_partition_invalid(cs))
		partcmd = partcmd_update;

	if (partcmd >= 0) {
		update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
		if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
			compute_partition_effective_cpumask(cs, &new_cpus);
			cpuset_force_rebuild();
		}
	}

update_tasks:
	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
	mems_updated = !nodes_equal(new_mems, cs->effective_mems);
	if (!cpus_updated && !mems_updated)
		goto unlock;	/* Hotplug doesn't affect this cpuset */

	if (mems_updated)
		check_insane_mems_config(&new_mems);

	if (is_in_v2_mode())
		hotplug_update_tasks(cs, &new_cpus, &new_mems,
				     cpus_updated, mems_updated);
	else
		cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems,
					     cpus_updated, mems_updated);

unlock:
	mutex_unlock(&cpuset_mutex);
}

/**
 * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.
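 *
 * (Illustrative note, not part of the original comment: both hotplug paths
 * funnel into this function, cpuset_update_active_cpus() for CPU hot(un)plug
 * and cpuset_track_online_nodes() for memory hotplug; note the
 * lockdep_assert_cpus_held() check on entry.)
 *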
 * The top_cpuset is always synchronized to cpu_active_mask and N_MEMORY,
 * which is necessary in order to make cpusets transparent (of no effect)
 * on systems that are actively using CPU hotplug but making no active use
 * of cpusets.
 *
 * Non-root cpusets are only affected by offlining. If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored. We don't modify
 * cpusets across suspend/resume cycles at all.
 *
 * CPU / memory hotplug is handled synchronously.
 */
static void cpuset_handle_hotplug(void)
{
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated, mems_updated;
	bool on_dfl = is_in_v2_mode();
	struct tmpmasks tmp, *ptmp = NULL;

	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
		ptmp = &tmp;

	lockdep_assert_cpus_held();
	mutex_lock(&cpuset_mutex);

	/* fetch the available cpus/mems and find out which changed how */
	cpumask_copy(&new_cpus, cpu_active_mask);
	new_mems = node_states[N_MEMORY];

	/*
	 * If subpartitions_cpus is populated, it is likely that the check
	 * below will produce a false positive on cpus_updated when the cpu
	 * list isn't changed. It is extra work, but it is better to be safe.
	 */
	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
		       !cpumask_empty(subpartitions_cpus);
	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

	/* For v1, synchronize cpus_allowed to cpu_active_mask */
	if (cpus_updated) {
		cpuset_force_rebuild();
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
		/*
		 * Make sure that CPUs allocated to child partitions
		 * do not show up in effective_cpus. If no CPU is left,
		 * we clear the subpartitions_cpus & let the child partitions
		 * fight for the CPUs again.
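		 *
		 * For example (illustrative values, not from the source):
		 * with new_cpus = 0-7 and subpartitions_cpus = 6-7, the
		 * effective_cpus of top_cpuset becomes 0-5; if instead every
		 * CPU in new_cpus were claimed by sub-partitions, nr_subparts
		 * is reset to 0 and subpartitions_cpus is cleared so that the
		 * child partitions renegotiate their CPUs.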
		 */
		if (!cpumask_empty(subpartitions_cpus)) {
			if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
				top_cpuset.nr_subparts = 0;
				cpumask_clear(subpartitions_cpus);
			} else {
				cpumask_andnot(&new_cpus, &new_cpus,
					       subpartitions_cpus);
			}
		}
		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
		spin_unlock_irq(&callback_lock);
		/* we don't mess with cpumasks of tasks in top_cpuset */
	}

	/* synchronize mems_allowed to N_MEMORY */
	if (mems_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			top_cpuset.mems_allowed = new_mems;
		top_cpuset.effective_mems = new_mems;
		spin_unlock_irq(&callback_lock);
		cpuset_update_tasks_nodemask(&top_cpuset);
	}

	mutex_unlock(&cpuset_mutex);

	/* if cpus or mems changed, we need to propagate to descendants */
	if (cpus_updated || mems_updated) {
		struct cpuset *cs;
		struct cgroup_subsys_state *pos_css;

		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			if (cs == &top_cpuset || !css_tryget_online(&cs->css))
				continue;
			rcu_read_unlock();

			cpuset_hotplug_update_tasks(cs, ptmp);

			rcu_read_lock();
			css_put(&cs->css);
		}
		rcu_read_unlock();
	}

	/* rebuild sched domains if necessary */
	if (force_sd_rebuild)
		rebuild_sched_domains_cpuslocked();

	free_cpumasks(NULL, ptmp);
}

void cpuset_update_active_cpus(void)
{
	/*
	 * We're inside a cpu hotplug critical region which usually nests
	 * inside cgroup synchronization. Hotplug processing is handled
	 * synchronously here; see cpuset_handle_hotplug().
	 */
	cpuset_handle_hotplug();
}

/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
				     unsigned long action, void *arg)
{
	cpuset_handle_hotplug();
	return NOTIFY_OK;
}

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 */
void __init cpuset_init_smp(void)
{
	/*
	 * cpus_allowed/mems_allowed set to v2 values in the initial
	 * cpuset_bind() call will be reset to v1 values in another
	 * cpuset_bind() call when v1 cpuset is mounted.
	 */
	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;

	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
	top_cpuset.effective_mems = node_states[N_MEMORY];

	hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);

	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
	BUG_ON(!cpuset_migrate_mm_wq);
}

/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk. Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * tasks cpuset, except when the task is in the top cpuset.
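 *
 * A minimal usage sketch (illustrative only, not taken from a real caller):
 *
 *	cpumask_var_t mask;
 *
 *	if (alloc_cpumask_var(&mask, GFP_KERNEL)) {
 *		cpuset_cpus_allowed(tsk, mask);
 *		// mask now holds a non-empty set of CPUs @tsk may run on,
 *		// e.g. as a bound when validating an affinity request
 *		free_cpumask_var(mask);
 *	}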
 **/

void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
	unsigned long flags;
	struct cpuset *cs;

	spin_lock_irqsave(&callback_lock, flags);
	rcu_read_lock();

	cs = task_cs(tsk);
	if (cs != &top_cpuset)
		guarantee_online_cpus(tsk, pmask);
	/*
	 * Tasks in the top cpuset won't get their cpumasks updated
	 * when a hotplug online/offline event happens. So we include all
	 * offline cpus in the allowed cpu list.
	 */
	if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
		const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);

		/*
		 * We first exclude cpus allocated to partitions. If there is no
		 * allowable online cpu left, we fall back to all possible cpus.
		 */
		cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
		if (!cpumask_intersects(pmask, cpu_online_mask))
			cpumask_copy(pmask, possible_mask);
	}

	rcu_read_unlock();
	spin_unlock_irqrestore(&callback_lock, flags);
}

/**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
 *
 * Description: In the case that the scheduler cannot find an allowed cpu in
 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
 * which will not contain a sane cpumask during cases such as cpu hotplugging.
 * This is the absolute last resort for the scheduler and it is only used if
 * _every_ other avenue has been traveled.
 *
 * Returns true if the affinity of @tsk was changed, false otherwise.
 **/

bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
	const struct cpumask *cs_mask;
	bool changed = false;

	rcu_read_lock();
	cs_mask = task_cs(tsk)->cpus_allowed;
	if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
		do_set_cpus_allowed(tsk, cs_mask);
		changed = true;
	}
	rcu_read_unlock();

	/*
	 * We own tsk->cpus_allowed, nobody can change it under us.
	 *
	 * But we read cs->cpus_allowed locklessly and thus can
	 * race with cgroup_attach_task() or update_cpumask() and get
	 * the wrong tsk->cpus_allowed. However, both cases imply the
	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
	 * which takes task_rq_lock().
	 *
	 * If we are called after it dropped the lock we must see all
	 * changes in task_cs(tsk)->cpus_allowed. Otherwise we can temporarily
	 * set any mask even if it is not right from task_cs() pov,
	 * the pending set_cpus_allowed_ptr() will fix things.
	 *
	 * select_fallback_rq() will fix things up and set cpu_possible_mask
	 * if required.
	 */
	return changed;
}

void __init cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}

/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk. Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * tasks cpuset.
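 *
 * A minimal usage sketch (illustrative only; 'nid' stands for some candidate
 * node id the caller is considering):
 *
 *	nodemask_t nodes = cpuset_mems_allowed(tsk);
 *
 *	if (!node_isset(nid, nodes))
 *		nid = first_node(nodes);	// fall back to an allowed node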
 **/

nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	nodemask_t mask;
	unsigned long flags;

	spin_lock_irqsave(&callback_lock, flags);
	rcu_read_lock();
	guarantee_online_mems(task_cs(tsk), &mask);
	rcu_read_unlock();
	spin_unlock_irqrestore(&callback_lock, flags);

	return mask;
}

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset. Call holding
 * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
		cs = parent_cs(cs);
	return cs;
}

/*
 * cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate. If @node is set in
 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
 * yes. If current has access to memory reserves as an oom victim, yes.
 * Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock. The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current tasks mems_allowed came up empty on the first pass over
 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_lock.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags. That logic and the checks below have the combined
 * effect that:
 *	in_interrupt - any node ok (current task context irrelevant)
 *	GFP_ATOMIC   - any node ok
 *	tsk_is_oom_victim   - any node ok
 *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *	GFP_USER     - only nodes in current tasks mems allowed ok.
 */
bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
	struct cpuset *cs;	/* current cpuset ancestors */
	bool allowed;		/* is allocation in zone z allowed? */
	unsigned long flags;

	if (in_interrupt())
		return true;
	if (node_isset(node, current->mems_allowed))
		return true;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(tsk_is_oom_victim(current)))
		return true;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return false;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return true;

	/* Not hardwall and node outside mems_allowed: scan up cpusets */
	spin_lock_irqsave(&callback_lock, flags);

	rcu_read_lock();
	cs = nearest_hardwall_ancestor(task_cs(current));
	allowed = node_isset(node, cs->mems_allowed);
	rcu_read_unlock();

	spin_unlock_irqrestore(&callback_lock, flags);
	return allowed;
}

/**
 * cpuset_spread_node() - On which node to begin search for a page
 * @rotor: round robin rotor
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the tasks mems_allowed nodes.
 *
 * We don't have to worry about the returned node being offline
 * because "it can't happen", and even if it did, it would be ok.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online. So it
 * should not be possible for the following code to return an
 * offline node. But if it did, that would be ok, as this routine
 * is not returning the node where the allocation must be, only
 * the node where the search should start. The zonelist passed to
 * __alloc_pages() will include all nodes. If the slab allocator
 * is passed an offline node, it will fall back to the local node.
 * See kmem_cache_alloc_node().
 */
static int cpuset_spread_node(int *rotor)
{
	return *rotor = next_node_in(*rotor, current->mems_allowed);
}

/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 */
int cpuset_mem_spread_node(void)
{
	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
		current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}

/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2. Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
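 *
 * Illustrative check (a sketch, not a quote of the OOM killer code;
 * 'victim' is a hypothetical candidate task):
 *
 *	if (!cpuset_mems_allowed_intersects(current, victim))
 *		// killing 'victim' frees memory only on nodes that current
 *		// cannot use, so it is a poor candidate for this OOM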
 **/

int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}

/**
 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
 *
 * Description: Prints current's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */
void cpuset_print_current_mems_allowed(void)
{
	struct cgroup *cgrp;

	rcu_read_lock();

	cgrp = task_cs(current)->css.cgroup;
	pr_cont(",cpuset=");
	pr_cont_cgroup_name(cgrp);
	pr_cont(",mems_allowed=%*pbl",
		nodemask_pr_args(&current->mems_allowed));

	rcu_read_unlock();
}

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print tasks cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
 *    anyway.
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	rcu_read_lock();
	spin_lock_irq(&css_set_lock);
	css = task_css(tsk, cpuset_cgrp_id);
	retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
				       current->nsproxy->cgroup_ns);
	spin_unlock_irq(&css_set_lock);
	rcu_read_unlock();

	if (retval == -E2BIG)
		retval = -ENAMETOOLONG;
	if (retval < 0)
		goto out_free;
	seq_puts(m, buf);
	seq_putc(m, '\n');
	retval = 0;
out_free:
	kfree(buf);
out:
	return retval;
}
#endif	/* CONFIG_PROC_PID_CPUSET */

/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
	seq_printf(m, "Mems_allowed:\t%*pb\n",
		   nodemask_pr_args(&task->mems_allowed));
	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
		   nodemask_pr_args(&task->mems_allowed));
}
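
/*
 * Example (illustrative) output of the interfaces above on a system where
 * the task's cpuset allows memory nodes 0-1; the cgroup path and node list
 * are made up and depend on the actual hierarchy and NUMA topology:
 *
 *	$ cat /proc/self/cpuset
 *	/user.slice/user-1000.slice
 *
 *	$ grep Mems_allowed_list /proc/self/status
 *	Mems_allowed_list:	0-1
 */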