1 /* 2 * kernel/cpuset.c 3 * 4 * Processor and Memory placement constraints for sets of tasks. 5 * 6 * Copyright (C) 2003 BULL SA. 7 * Copyright (C) 2004-2007 Silicon Graphics, Inc. 8 * Copyright (C) 2006 Google, Inc 9 * 10 * Portions derived from Patrick Mochel's sysfs code. 11 * sysfs is Copyright (c) 2001-3 Patrick Mochel 12 * 13 * 2003-10-10 Written by Simon Derr. 14 * 2003-10-22 Updates by Stephen Hemminger. 15 * 2004 May-July Rework by Paul Jackson. 16 * 2006 Rework by Paul Menage to use generic cgroups 17 * 2008 Rework of the scheduler domains and CPU hotplug handling 18 * by Max Krasnyansky 19 * 20 * This file is subject to the terms and conditions of the GNU General Public 21 * License. See the file COPYING in the main directory of the Linux 22 * distribution for more details. 23 */ 24 #include "cpuset-internal.h" 25 26 #include <linux/init.h> 27 #include <linux/interrupt.h> 28 #include <linux/kernel.h> 29 #include <linux/mempolicy.h> 30 #include <linux/mm.h> 31 #include <linux/memory.h> 32 #include <linux/export.h> 33 #include <linux/rcupdate.h> 34 #include <linux/sched.h> 35 #include <linux/sched/deadline.h> 36 #include <linux/sched/mm.h> 37 #include <linux/sched/task.h> 38 #include <linux/security.h> 39 #include <linux/oom.h> 40 #include <linux/sched/isolation.h> 41 #include <linux/wait.h> 42 #include <linux/workqueue.h> 43 44 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); 45 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); 46 47 /* 48 * There could be abnormal cpuset configurations for cpu or memory 49 * node binding, add this key to provide a quick low-cost judgment 50 * of the situation. 51 */ 52 DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); 53 54 static const char * const perr_strings[] = { 55 [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", 56 [PERR_INVPARENT] = "Parent is an invalid partition root", 57 [PERR_NOTPART] = "Parent is not a partition root", 58 [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", 59 [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", 60 [PERR_HOTPLUG] = "No cpu available due to hotplug", 61 [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", 62 [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", 63 [PERR_ACCESS] = "Enable partition not permitted", 64 [PERR_REMOTE] = "Have remote partition underneath", 65 }; 66 67 /* 68 * For local partitions, update to subpartitions_cpus & isolated_cpus is done 69 * in update_parent_effective_cpumask(). For remote partitions, it is done in 70 * the remote_partition_*() and remote_cpus_update() helpers. 71 */ 72 /* 73 * Exclusive CPUs distributed out to local or remote sub-partitions of 74 * top_cpuset 75 */ 76 static cpumask_var_t subpartitions_cpus; 77 78 /* 79 * Exclusive CPUs in isolated partitions 80 */ 81 static cpumask_var_t isolated_cpus; 82 83 /* 84 * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot 85 */ 86 static cpumask_var_t boot_hk_cpus; 87 static bool have_boot_isolcpus; 88 89 /* List of remote partition root children */ 90 static struct list_head remote_children; 91 92 /* 93 * A flag to force sched domain rebuild at the end of an operation. 94 * It can be set in 95 * - update_partition_sd_lb() 96 * - update_cpumasks_hier() 97 * - cpuset_update_flag() 98 * - cpuset_hotplug_update_tasks() 99 * - cpuset_handle_hotplug() 100 * 101 * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. 102 * 103 * Note that update_relax_domain_level() in cpuset-v1.c can still call 104 * rebuild_sched_domains_locked() directly without using this flag. 105 */ 106 static bool force_sd_rebuild; 107 108 /* 109 * Partition root states: 110 * 111 * 0 - member (not a partition root) 112 * 1 - partition root 113 * 2 - partition root without load balancing (isolated) 114 * -1 - invalid partition root 115 * -2 - invalid isolated partition root 116 * 117 * There are 2 types of partitions - local or remote. Local partitions are 118 * those whose parents are partition root themselves. Setting of 119 * cpuset.cpus.exclusive are optional in setting up local partitions. 120 * Remote partitions are those whose parents are not partition roots. Passing 121 * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor 122 * nodes are mandatory in creating a remote partition. 123 * 124 * For simplicity, a local partition can be created under a local or remote 125 * partition but a remote partition cannot have any partition root in its 126 * ancestor chain except the cgroup root. 127 */ 128 #define PRS_MEMBER 0 129 #define PRS_ROOT 1 130 #define PRS_ISOLATED 2 131 #define PRS_INVALID_ROOT -1 132 #define PRS_INVALID_ISOLATED -2 133 134 static inline bool is_prs_invalid(int prs_state) 135 { 136 return prs_state < 0; 137 } 138 139 /* 140 * Temporary cpumasks for working with partitions that are passed among 141 * functions to avoid memory allocation in inner functions. 142 */ 143 struct tmpmasks { 144 cpumask_var_t addmask, delmask; /* For partition root */ 145 cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ 146 }; 147 148 void inc_dl_tasks_cs(struct task_struct *p) 149 { 150 struct cpuset *cs = task_cs(p); 151 152 cs->nr_deadline_tasks++; 153 } 154 155 void dec_dl_tasks_cs(struct task_struct *p) 156 { 157 struct cpuset *cs = task_cs(p); 158 159 cs->nr_deadline_tasks--; 160 } 161 162 static inline int is_partition_valid(const struct cpuset *cs) 163 { 164 return cs->partition_root_state > 0; 165 } 166 167 static inline int is_partition_invalid(const struct cpuset *cs) 168 { 169 return cs->partition_root_state < 0; 170 } 171 172 /* 173 * Callers should hold callback_lock to modify partition_root_state. 174 */ 175 static inline void make_partition_invalid(struct cpuset *cs) 176 { 177 if (cs->partition_root_state > 0) 178 cs->partition_root_state = -cs->partition_root_state; 179 } 180 181 /* 182 * Send notification event of whenever partition_root_state changes. 183 */ 184 static inline void notify_partition_change(struct cpuset *cs, int old_prs) 185 { 186 if (old_prs == cs->partition_root_state) 187 return; 188 cgroup_file_notify(&cs->partition_file); 189 190 /* Reset prs_err if not invalid */ 191 if (is_partition_valid(cs)) 192 WRITE_ONCE(cs->prs_err, PERR_NONE); 193 } 194 195 /* 196 * The top_cpuset is always synchronized to cpu_active_mask and we should avoid 197 * using cpu_online_mask as much as possible. An active CPU is always an online 198 * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ 199 * during hotplug operations. A CPU is marked active at the last stage of CPU 200 * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code 201 * will be called to update the sched domains so that the scheduler can move 202 * a normal task to a newly active CPU or remove tasks away from a newly 203 * inactivated CPU. The online bit is set much earlier in the CPU bringup 204 * process and cleared much later in CPU teardown. 205 * 206 * If cpu_online_mask is used while a hotunplug operation is happening in 207 * parallel, we may leave an offline CPU in cpu_allowed or some other masks. 208 */ 209 static struct cpuset top_cpuset = { 210 .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | 211 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), 212 .partition_root_state = PRS_ROOT, 213 .relax_domain_level = -1, 214 .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), 215 }; 216 217 /* 218 * There are two global locks guarding cpuset structures - cpuset_mutex and 219 * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel 220 * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset 221 * structures. Note that cpuset_mutex needs to be a mutex as it is used in 222 * paths that rely on priority inheritance (e.g. scheduler - on RT) for 223 * correctness. 224 * 225 * A task must hold both locks to modify cpusets. If a task holds 226 * cpuset_mutex, it blocks others, ensuring that it is the only task able to 227 * also acquire callback_lock and be able to modify cpusets. It can perform 228 * various checks on the cpuset structure first, knowing nothing will change. 229 * It can also allocate memory while just holding cpuset_mutex. While it is 230 * performing these checks, various callback routines can briefly acquire 231 * callback_lock to query cpusets. Once it is ready to make the changes, it 232 * takes callback_lock, blocking everyone else. 233 * 234 * Calls to the kernel memory allocator can not be made while holding 235 * callback_lock, as that would risk double tripping on callback_lock 236 * from one of the callbacks into the cpuset code from within 237 * __alloc_pages(). 238 * 239 * If a task is only holding callback_lock, then it has read-only 240 * access to cpusets. 241 * 242 * Now, the task_struct fields mems_allowed and mempolicy may be changed 243 * by other task, we use alloc_lock in the task_struct fields to protect 244 * them. 245 * 246 * The cpuset_common_seq_show() handlers only hold callback_lock across 247 * small pieces of code, such as when reading out possibly multi-word 248 * cpumasks and nodemasks. 249 */ 250 251 static DEFINE_MUTEX(cpuset_mutex); 252 253 void cpuset_lock(void) 254 { 255 mutex_lock(&cpuset_mutex); 256 } 257 258 void cpuset_unlock(void) 259 { 260 mutex_unlock(&cpuset_mutex); 261 } 262 263 static DEFINE_SPINLOCK(callback_lock); 264 265 void cpuset_callback_lock_irq(void) 266 { 267 spin_lock_irq(&callback_lock); 268 } 269 270 void cpuset_callback_unlock_irq(void) 271 { 272 spin_unlock_irq(&callback_lock); 273 } 274 275 static struct workqueue_struct *cpuset_migrate_mm_wq; 276 277 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); 278 279 static inline void check_insane_mems_config(nodemask_t *nodes) 280 { 281 if (!cpusets_insane_config() && 282 movable_only_nodes(nodes)) { 283 static_branch_enable(&cpusets_insane_config_key); 284 pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" 285 "Cpuset allocations might fail even with a lot of memory available.\n", 286 nodemask_pr_args(nodes)); 287 } 288 } 289 290 /* 291 * decrease cs->attach_in_progress. 292 * wake_up cpuset_attach_wq if cs->attach_in_progress==0. 293 */ 294 static inline void dec_attach_in_progress_locked(struct cpuset *cs) 295 { 296 lockdep_assert_held(&cpuset_mutex); 297 298 cs->attach_in_progress--; 299 if (!cs->attach_in_progress) 300 wake_up(&cpuset_attach_wq); 301 } 302 303 static inline void dec_attach_in_progress(struct cpuset *cs) 304 { 305 mutex_lock(&cpuset_mutex); 306 dec_attach_in_progress_locked(cs); 307 mutex_unlock(&cpuset_mutex); 308 } 309 310 static inline bool cpuset_v2(void) 311 { 312 return !IS_ENABLED(CONFIG_CPUSETS_V1) || 313 cgroup_subsys_on_dfl(cpuset_cgrp_subsys); 314 } 315 316 /* 317 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when 318 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting 319 * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. 320 * With v2 behavior, "cpus" and "mems" are always what the users have 321 * requested and won't be changed by hotplug events. Only the effective 322 * cpus or mems will be affected. 323 */ 324 static inline bool is_in_v2_mode(void) 325 { 326 return cpuset_v2() || 327 (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); 328 } 329 330 /** 331 * partition_is_populated - check if partition has tasks 332 * @cs: partition root to be checked 333 * @excluded_child: a child cpuset to be excluded in task checking 334 * Return: true if there are tasks, false otherwise 335 * 336 * It is assumed that @cs is a valid partition root. @excluded_child should 337 * be non-NULL when this cpuset is going to become a partition itself. 338 */ 339 static inline bool partition_is_populated(struct cpuset *cs, 340 struct cpuset *excluded_child) 341 { 342 struct cgroup_subsys_state *css; 343 struct cpuset *child; 344 345 if (cs->css.cgroup->nr_populated_csets) 346 return true; 347 if (!excluded_child && !cs->nr_subparts) 348 return cgroup_is_populated(cs->css.cgroup); 349 350 rcu_read_lock(); 351 cpuset_for_each_child(child, css, cs) { 352 if (child == excluded_child) 353 continue; 354 if (is_partition_valid(child)) 355 continue; 356 if (cgroup_is_populated(child->css.cgroup)) { 357 rcu_read_unlock(); 358 return true; 359 } 360 } 361 rcu_read_unlock(); 362 return false; 363 } 364 365 /* 366 * Return in pmask the portion of a task's cpusets's cpus_allowed that 367 * are online and are capable of running the task. If none are found, 368 * walk up the cpuset hierarchy until we find one that does have some 369 * appropriate cpus. 370 * 371 * One way or another, we guarantee to return some non-empty subset 372 * of cpu_active_mask. 373 * 374 * Call with callback_lock or cpuset_mutex held. 375 */ 376 static void guarantee_active_cpus(struct task_struct *tsk, 377 struct cpumask *pmask) 378 { 379 const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); 380 struct cpuset *cs; 381 382 if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask))) 383 cpumask_copy(pmask, cpu_active_mask); 384 385 rcu_read_lock(); 386 cs = task_cs(tsk); 387 388 while (!cpumask_intersects(cs->effective_cpus, pmask)) 389 cs = parent_cs(cs); 390 391 cpumask_and(pmask, pmask, cs->effective_cpus); 392 rcu_read_unlock(); 393 } 394 395 /* 396 * Return in *pmask the portion of a cpusets's mems_allowed that 397 * are online, with memory. If none are online with memory, walk 398 * up the cpuset hierarchy until we find one that does have some 399 * online mems. The top cpuset always has some mems online. 400 * 401 * One way or another, we guarantee to return some non-empty subset 402 * of node_states[N_MEMORY]. 403 * 404 * Call with callback_lock or cpuset_mutex held. 405 */ 406 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) 407 { 408 while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) 409 cs = parent_cs(cs); 410 nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); 411 } 412 413 /** 414 * alloc_cpumasks - allocate three cpumasks for cpuset 415 * @cs: the cpuset that have cpumasks to be allocated. 416 * @tmp: the tmpmasks structure pointer 417 * Return: 0 if successful, -ENOMEM otherwise. 418 * 419 * Only one of the two input arguments should be non-NULL. 420 */ 421 static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) 422 { 423 cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4; 424 425 if (cs) { 426 pmask1 = &cs->cpus_allowed; 427 pmask2 = &cs->effective_cpus; 428 pmask3 = &cs->effective_xcpus; 429 pmask4 = &cs->exclusive_cpus; 430 } else { 431 pmask1 = &tmp->new_cpus; 432 pmask2 = &tmp->addmask; 433 pmask3 = &tmp->delmask; 434 pmask4 = NULL; 435 } 436 437 if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) 438 return -ENOMEM; 439 440 if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) 441 goto free_one; 442 443 if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) 444 goto free_two; 445 446 if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL)) 447 goto free_three; 448 449 450 return 0; 451 452 free_three: 453 free_cpumask_var(*pmask3); 454 free_two: 455 free_cpumask_var(*pmask2); 456 free_one: 457 free_cpumask_var(*pmask1); 458 return -ENOMEM; 459 } 460 461 /** 462 * free_cpumasks - free cpumasks in a tmpmasks structure 463 * @cs: the cpuset that have cpumasks to be free. 464 * @tmp: the tmpmasks structure pointer 465 */ 466 static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) 467 { 468 if (cs) { 469 free_cpumask_var(cs->cpus_allowed); 470 free_cpumask_var(cs->effective_cpus); 471 free_cpumask_var(cs->effective_xcpus); 472 free_cpumask_var(cs->exclusive_cpus); 473 } 474 if (tmp) { 475 free_cpumask_var(tmp->new_cpus); 476 free_cpumask_var(tmp->addmask); 477 free_cpumask_var(tmp->delmask); 478 } 479 } 480 481 /** 482 * alloc_trial_cpuset - allocate a trial cpuset 483 * @cs: the cpuset that the trial cpuset duplicates 484 */ 485 static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) 486 { 487 struct cpuset *trial; 488 489 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); 490 if (!trial) 491 return NULL; 492 493 if (alloc_cpumasks(trial, NULL)) { 494 kfree(trial); 495 return NULL; 496 } 497 498 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); 499 cpumask_copy(trial->effective_cpus, cs->effective_cpus); 500 cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); 501 cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); 502 return trial; 503 } 504 505 /** 506 * free_cpuset - free the cpuset 507 * @cs: the cpuset to be freed 508 */ 509 static inline void free_cpuset(struct cpuset *cs) 510 { 511 free_cpumasks(cs, NULL); 512 kfree(cs); 513 } 514 515 /* Return user specified exclusive CPUs */ 516 static inline struct cpumask *user_xcpus(struct cpuset *cs) 517 { 518 return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed 519 : cs->exclusive_cpus; 520 } 521 522 static inline bool xcpus_empty(struct cpuset *cs) 523 { 524 return cpumask_empty(cs->cpus_allowed) && 525 cpumask_empty(cs->exclusive_cpus); 526 } 527 528 /* 529 * cpusets_are_exclusive() - check if two cpusets are exclusive 530 * 531 * Return true if exclusive, false if not 532 */ 533 static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) 534 { 535 struct cpumask *xcpus1 = user_xcpus(cs1); 536 struct cpumask *xcpus2 = user_xcpus(cs2); 537 538 if (cpumask_intersects(xcpus1, xcpus2)) 539 return false; 540 return true; 541 } 542 543 /* 544 * validate_change() - Used to validate that any proposed cpuset change 545 * follows the structural rules for cpusets. 546 * 547 * If we replaced the flag and mask values of the current cpuset 548 * (cur) with those values in the trial cpuset (trial), would 549 * our various subset and exclusive rules still be valid? Presumes 550 * cpuset_mutex held. 551 * 552 * 'cur' is the address of an actual, in-use cpuset. Operations 553 * such as list traversal that depend on the actual address of the 554 * cpuset in the list must use cur below, not trial. 555 * 556 * 'trial' is the address of bulk structure copy of cur, with 557 * perhaps one or more of the fields cpus_allowed, mems_allowed, 558 * or flags changed to new, trial values. 559 * 560 * Return 0 if valid, -errno if not. 561 */ 562 563 static int validate_change(struct cpuset *cur, struct cpuset *trial) 564 { 565 struct cgroup_subsys_state *css; 566 struct cpuset *c, *par; 567 int ret = 0; 568 569 rcu_read_lock(); 570 571 if (!is_in_v2_mode()) 572 ret = cpuset1_validate_change(cur, trial); 573 if (ret) 574 goto out; 575 576 /* Remaining checks don't apply to root cpuset */ 577 if (cur == &top_cpuset) 578 goto out; 579 580 par = parent_cs(cur); 581 582 /* 583 * Cpusets with tasks - existing or newly being attached - can't 584 * be changed to have empty cpus_allowed or mems_allowed. 585 */ 586 ret = -ENOSPC; 587 if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { 588 if (!cpumask_empty(cur->cpus_allowed) && 589 cpumask_empty(trial->cpus_allowed)) 590 goto out; 591 if (!nodes_empty(cur->mems_allowed) && 592 nodes_empty(trial->mems_allowed)) 593 goto out; 594 } 595 596 /* 597 * We can't shrink if we won't have enough room for SCHED_DEADLINE 598 * tasks. This check is not done when scheduling is disabled as the 599 * users should know what they are doing. 600 * 601 * For v1, effective_cpus == cpus_allowed & user_xcpus() returns 602 * cpus_allowed. 603 * 604 * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only 605 * for non-isolated partition root. At this point, the target 606 * effective_cpus isn't computed yet. user_xcpus() is the best 607 * approximation. 608 * 609 * TBD: May need to precompute the real effective_cpus here in case 610 * incorrect scheduling of SCHED_DEADLINE tasks in a partition 611 * becomes an issue. 612 */ 613 ret = -EBUSY; 614 if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) && 615 !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial))) 616 goto out; 617 618 /* 619 * If either I or some sibling (!= me) is exclusive, we can't 620 * overlap. exclusive_cpus cannot overlap with each other if set. 621 */ 622 ret = -EINVAL; 623 cpuset_for_each_child(c, css, par) { 624 bool txset, cxset; /* Are exclusive_cpus set? */ 625 626 if (c == cur) 627 continue; 628 629 txset = !cpumask_empty(trial->exclusive_cpus); 630 cxset = !cpumask_empty(c->exclusive_cpus); 631 if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) || 632 (txset && cxset)) { 633 if (!cpusets_are_exclusive(trial, c)) 634 goto out; 635 } else if (txset || cxset) { 636 struct cpumask *xcpus, *acpus; 637 638 /* 639 * When just one of the exclusive_cpus's is set, 640 * cpus_allowed of the other cpuset, if set, cannot be 641 * a subset of it or none of those CPUs will be 642 * available if these exclusive CPUs are activated. 643 */ 644 if (txset) { 645 xcpus = trial->exclusive_cpus; 646 acpus = c->cpus_allowed; 647 } else { 648 xcpus = c->exclusive_cpus; 649 acpus = trial->cpus_allowed; 650 } 651 if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus)) 652 goto out; 653 } 654 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && 655 nodes_intersects(trial->mems_allowed, c->mems_allowed)) 656 goto out; 657 } 658 659 ret = 0; 660 out: 661 rcu_read_unlock(); 662 return ret; 663 } 664 665 #ifdef CONFIG_SMP 666 /* 667 * Helper routine for generate_sched_domains(). 668 * Do cpusets a, b have overlapping effective cpus_allowed masks? 669 */ 670 static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 671 { 672 return cpumask_intersects(a->effective_cpus, b->effective_cpus); 673 } 674 675 static void 676 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) 677 { 678 if (dattr->relax_domain_level < c->relax_domain_level) 679 dattr->relax_domain_level = c->relax_domain_level; 680 return; 681 } 682 683 static void update_domain_attr_tree(struct sched_domain_attr *dattr, 684 struct cpuset *root_cs) 685 { 686 struct cpuset *cp; 687 struct cgroup_subsys_state *pos_css; 688 689 rcu_read_lock(); 690 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { 691 /* skip the whole subtree if @cp doesn't have any CPU */ 692 if (cpumask_empty(cp->cpus_allowed)) { 693 pos_css = css_rightmost_descendant(pos_css); 694 continue; 695 } 696 697 if (is_sched_load_balance(cp)) 698 update_domain_attr(dattr, cp); 699 } 700 rcu_read_unlock(); 701 } 702 703 /* Must be called with cpuset_mutex held. */ 704 static inline int nr_cpusets(void) 705 { 706 /* jump label reference count + the top-level cpuset */ 707 return static_key_count(&cpusets_enabled_key.key) + 1; 708 } 709 710 /* 711 * generate_sched_domains() 712 * 713 * This function builds a partial partition of the systems CPUs 714 * A 'partial partition' is a set of non-overlapping subsets whose 715 * union is a subset of that set. 716 * The output of this function needs to be passed to kernel/sched/core.c 717 * partition_sched_domains() routine, which will rebuild the scheduler's 718 * load balancing domains (sched domains) as specified by that partial 719 * partition. 720 * 721 * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst 722 * for a background explanation of this. 723 * 724 * Does not return errors, on the theory that the callers of this 725 * routine would rather not worry about failures to rebuild sched 726 * domains when operating in the severe memory shortage situations 727 * that could cause allocation failures below. 728 * 729 * Must be called with cpuset_mutex held. 730 * 731 * The three key local variables below are: 732 * cp - cpuset pointer, used (together with pos_css) to perform a 733 * top-down scan of all cpusets. For our purposes, rebuilding 734 * the schedulers sched domains, we can ignore !is_sched_load_ 735 * balance cpusets. 736 * csa - (for CpuSet Array) Array of pointers to all the cpusets 737 * that need to be load balanced, for convenient iterative 738 * access by the subsequent code that finds the best partition, 739 * i.e the set of domains (subsets) of CPUs such that the 740 * cpus_allowed of every cpuset marked is_sched_load_balance 741 * is a subset of one of these domains, while there are as 742 * many such domains as possible, each as small as possible. 743 * doms - Conversion of 'csa' to an array of cpumasks, for passing to 744 * the kernel/sched/core.c routine partition_sched_domains() in a 745 * convenient format, that can be easily compared to the prior 746 * value to determine what partition elements (sched domains) 747 * were changed (added or removed.) 748 * 749 * Finding the best partition (set of domains): 750 * The double nested loops below over i, j scan over the load 751 * balanced cpusets (using the array of cpuset pointers in csa[]) 752 * looking for pairs of cpusets that have overlapping cpus_allowed 753 * and merging them using a union-find algorithm. 754 * 755 * The union of the cpus_allowed masks from the set of all cpusets 756 * having the same root then form the one element of the partition 757 * (one sched domain) to be passed to partition_sched_domains(). 758 * 759 */ 760 static int generate_sched_domains(cpumask_var_t **domains, 761 struct sched_domain_attr **attributes) 762 { 763 struct cpuset *cp; /* top-down scan of cpusets */ 764 struct cpuset **csa; /* array of all cpuset ptrs */ 765 int csn; /* how many cpuset ptrs in csa so far */ 766 int i, j; /* indices for partition finding loops */ 767 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ 768 struct sched_domain_attr *dattr; /* attributes for custom domains */ 769 int ndoms = 0; /* number of sched domains in result */ 770 int nslot; /* next empty doms[] struct cpumask slot */ 771 struct cgroup_subsys_state *pos_css; 772 bool root_load_balance = is_sched_load_balance(&top_cpuset); 773 bool cgrpv2 = cpuset_v2(); 774 int nslot_update; 775 776 doms = NULL; 777 dattr = NULL; 778 csa = NULL; 779 780 /* Special case for the 99% of systems with one, full, sched domain */ 781 if (root_load_balance && cpumask_empty(subpartitions_cpus)) { 782 single_root_domain: 783 ndoms = 1; 784 doms = alloc_sched_domains(ndoms); 785 if (!doms) 786 goto done; 787 788 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); 789 if (dattr) { 790 *dattr = SD_ATTR_INIT; 791 update_domain_attr_tree(dattr, &top_cpuset); 792 } 793 cpumask_and(doms[0], top_cpuset.effective_cpus, 794 housekeeping_cpumask(HK_TYPE_DOMAIN)); 795 796 goto done; 797 } 798 799 csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); 800 if (!csa) 801 goto done; 802 csn = 0; 803 804 rcu_read_lock(); 805 if (root_load_balance) 806 csa[csn++] = &top_cpuset; 807 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { 808 if (cp == &top_cpuset) 809 continue; 810 811 if (cgrpv2) 812 goto v2; 813 814 /* 815 * v1: 816 * Continue traversing beyond @cp iff @cp has some CPUs and 817 * isn't load balancing. The former is obvious. The 818 * latter: All child cpusets contain a subset of the 819 * parent's cpus, so just skip them, and then we call 820 * update_domain_attr_tree() to calc relax_domain_level of 821 * the corresponding sched domain. 822 */ 823 if (!cpumask_empty(cp->cpus_allowed) && 824 !(is_sched_load_balance(cp) && 825 cpumask_intersects(cp->cpus_allowed, 826 housekeeping_cpumask(HK_TYPE_DOMAIN)))) 827 continue; 828 829 if (is_sched_load_balance(cp) && 830 !cpumask_empty(cp->effective_cpus)) 831 csa[csn++] = cp; 832 833 /* skip @cp's subtree */ 834 pos_css = css_rightmost_descendant(pos_css); 835 continue; 836 837 v2: 838 /* 839 * Only valid partition roots that are not isolated and with 840 * non-empty effective_cpus will be saved into csn[]. 841 */ 842 if ((cp->partition_root_state == PRS_ROOT) && 843 !cpumask_empty(cp->effective_cpus)) 844 csa[csn++] = cp; 845 846 /* 847 * Skip @cp's subtree if not a partition root and has no 848 * exclusive CPUs to be granted to child cpusets. 849 */ 850 if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus)) 851 pos_css = css_rightmost_descendant(pos_css); 852 } 853 rcu_read_unlock(); 854 855 /* 856 * If there are only isolated partitions underneath the cgroup root, 857 * we can optimize out unneeded sched domains scanning. 858 */ 859 if (root_load_balance && (csn == 1)) 860 goto single_root_domain; 861 862 for (i = 0; i < csn; i++) 863 uf_node_init(&csa[i]->node); 864 865 /* Merge overlapping cpusets */ 866 for (i = 0; i < csn; i++) { 867 for (j = i + 1; j < csn; j++) { 868 if (cpusets_overlap(csa[i], csa[j])) { 869 /* 870 * Cgroup v2 shouldn't pass down overlapping 871 * partition root cpusets. 872 */ 873 WARN_ON_ONCE(cgrpv2); 874 uf_union(&csa[i]->node, &csa[j]->node); 875 } 876 } 877 } 878 879 /* Count the total number of domains */ 880 for (i = 0; i < csn; i++) { 881 if (uf_find(&csa[i]->node) == &csa[i]->node) 882 ndoms++; 883 } 884 885 /* 886 * Now we know how many domains to create. 887 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 888 */ 889 doms = alloc_sched_domains(ndoms); 890 if (!doms) 891 goto done; 892 893 /* 894 * The rest of the code, including the scheduler, can deal with 895 * dattr==NULL case. No need to abort if alloc fails. 896 */ 897 dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), 898 GFP_KERNEL); 899 900 /* 901 * Cgroup v2 doesn't support domain attributes, just set all of them 902 * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a 903 * subset of HK_TYPE_DOMAIN housekeeping CPUs. 904 */ 905 if (cgrpv2) { 906 for (i = 0; i < ndoms; i++) { 907 /* 908 * The top cpuset may contain some boot time isolated 909 * CPUs that need to be excluded from the sched domain. 910 */ 911 if (csa[i] == &top_cpuset) 912 cpumask_and(doms[i], csa[i]->effective_cpus, 913 housekeeping_cpumask(HK_TYPE_DOMAIN)); 914 else 915 cpumask_copy(doms[i], csa[i]->effective_cpus); 916 if (dattr) 917 dattr[i] = SD_ATTR_INIT; 918 } 919 goto done; 920 } 921 922 for (nslot = 0, i = 0; i < csn; i++) { 923 nslot_update = 0; 924 for (j = i; j < csn; j++) { 925 if (uf_find(&csa[j]->node) == &csa[i]->node) { 926 struct cpumask *dp = doms[nslot]; 927 928 if (i == j) { 929 nslot_update = 1; 930 cpumask_clear(dp); 931 if (dattr) 932 *(dattr + nslot) = SD_ATTR_INIT; 933 } 934 cpumask_or(dp, dp, csa[j]->effective_cpus); 935 cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); 936 if (dattr) 937 update_domain_attr_tree(dattr + nslot, csa[j]); 938 } 939 } 940 if (nslot_update) 941 nslot++; 942 } 943 BUG_ON(nslot != ndoms); 944 945 done: 946 kfree(csa); 947 948 /* 949 * Fallback to the default domain if kmalloc() failed. 950 * See comments in partition_sched_domains(). 951 */ 952 if (doms == NULL) 953 ndoms = 1; 954 955 *domains = doms; 956 *attributes = dattr; 957 return ndoms; 958 } 959 960 static void dl_update_tasks_root_domain(struct cpuset *cs) 961 { 962 struct css_task_iter it; 963 struct task_struct *task; 964 965 if (cs->nr_deadline_tasks == 0) 966 return; 967 968 css_task_iter_start(&cs->css, 0, &it); 969 970 while ((task = css_task_iter_next(&it))) 971 dl_add_task_root_domain(task); 972 973 css_task_iter_end(&it); 974 } 975 976 void dl_rebuild_rd_accounting(void) 977 { 978 struct cpuset *cs = NULL; 979 struct cgroup_subsys_state *pos_css; 980 int cpu; 981 u64 cookie = ++dl_cookie; 982 983 lockdep_assert_held(&cpuset_mutex); 984 lockdep_assert_cpus_held(); 985 lockdep_assert_held(&sched_domains_mutex); 986 987 rcu_read_lock(); 988 989 for_each_possible_cpu(cpu) { 990 if (dl_bw_visited(cpu, cookie)) 991 continue; 992 993 dl_clear_root_domain_cpu(cpu); 994 } 995 996 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { 997 998 if (cpumask_empty(cs->effective_cpus)) { 999 pos_css = css_rightmost_descendant(pos_css); 1000 continue; 1001 } 1002 1003 css_get(&cs->css); 1004 1005 rcu_read_unlock(); 1006 1007 dl_update_tasks_root_domain(cs); 1008 1009 rcu_read_lock(); 1010 css_put(&cs->css); 1011 } 1012 rcu_read_unlock(); 1013 } 1014 1015 /* 1016 * Rebuild scheduler domains. 1017 * 1018 * If the flag 'sched_load_balance' of any cpuset with non-empty 1019 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset 1020 * which has that flag enabled, or if any cpuset with a non-empty 1021 * 'cpus' is removed, then call this routine to rebuild the 1022 * scheduler's dynamic sched domains. 1023 * 1024 * Call with cpuset_mutex held. Takes cpus_read_lock(). 1025 */ 1026 void rebuild_sched_domains_locked(void) 1027 { 1028 struct cgroup_subsys_state *pos_css; 1029 struct sched_domain_attr *attr; 1030 cpumask_var_t *doms; 1031 struct cpuset *cs; 1032 int ndoms; 1033 1034 lockdep_assert_cpus_held(); 1035 lockdep_assert_held(&cpuset_mutex); 1036 force_sd_rebuild = false; 1037 1038 /* 1039 * If we have raced with CPU hotplug, return early to avoid 1040 * passing doms with offlined cpu to partition_sched_domains(). 1041 * Anyways, cpuset_handle_hotplug() will rebuild sched domains. 1042 * 1043 * With no CPUs in any subpartitions, top_cpuset's effective CPUs 1044 * should be the same as the active CPUs, so checking only top_cpuset 1045 * is enough to detect racing CPU offlines. 1046 */ 1047 if (cpumask_empty(subpartitions_cpus) && 1048 !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) 1049 return; 1050 1051 /* 1052 * With subpartition CPUs, however, the effective CPUs of a partition 1053 * root should be only a subset of the active CPUs. Since a CPU in any 1054 * partition root could be offlined, all must be checked. 1055 */ 1056 if (!cpumask_empty(subpartitions_cpus)) { 1057 rcu_read_lock(); 1058 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { 1059 if (!is_partition_valid(cs)) { 1060 pos_css = css_rightmost_descendant(pos_css); 1061 continue; 1062 } 1063 if (!cpumask_subset(cs->effective_cpus, 1064 cpu_active_mask)) { 1065 rcu_read_unlock(); 1066 return; 1067 } 1068 } 1069 rcu_read_unlock(); 1070 } 1071 1072 /* Generate domain masks and attrs */ 1073 ndoms = generate_sched_domains(&doms, &attr); 1074 1075 /* Have scheduler rebuild the domains */ 1076 partition_sched_domains(ndoms, doms, attr); 1077 } 1078 #else /* !CONFIG_SMP */ 1079 void rebuild_sched_domains_locked(void) 1080 { 1081 } 1082 #endif /* CONFIG_SMP */ 1083 1084 static void rebuild_sched_domains_cpuslocked(void) 1085 { 1086 mutex_lock(&cpuset_mutex); 1087 rebuild_sched_domains_locked(); 1088 mutex_unlock(&cpuset_mutex); 1089 } 1090 1091 void rebuild_sched_domains(void) 1092 { 1093 cpus_read_lock(); 1094 rebuild_sched_domains_cpuslocked(); 1095 cpus_read_unlock(); 1096 } 1097 1098 void cpuset_reset_sched_domains(void) 1099 { 1100 mutex_lock(&cpuset_mutex); 1101 partition_sched_domains(1, NULL, NULL); 1102 mutex_unlock(&cpuset_mutex); 1103 } 1104 1105 /** 1106 * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 1107 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 1108 * @new_cpus: the temp variable for the new effective_cpus mask 1109 * 1110 * Iterate through each task of @cs updating its cpus_allowed to the 1111 * effective cpuset's. As this function is called with cpuset_mutex held, 1112 * cpuset membership stays stable. 1113 * 1114 * For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus 1115 * to make sure all offline CPUs are also included as hotplug code won't 1116 * update cpumasks for tasks in top_cpuset. 1117 * 1118 * As task_cpu_possible_mask() can be task dependent in arm64, we have to 1119 * do cpu masking per task instead of doing it once for all. 1120 */ 1121 void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) 1122 { 1123 struct css_task_iter it; 1124 struct task_struct *task; 1125 bool top_cs = cs == &top_cpuset; 1126 1127 css_task_iter_start(&cs->css, 0, &it); 1128 while ((task = css_task_iter_next(&it))) { 1129 const struct cpumask *possible_mask = task_cpu_possible_mask(task); 1130 1131 if (top_cs) { 1132 /* 1133 * PF_NO_SETAFFINITY tasks are ignored. 1134 * All per cpu kthreads should have PF_NO_SETAFFINITY 1135 * flag set, see kthread_set_per_cpu(). 1136 */ 1137 if (task->flags & PF_NO_SETAFFINITY) 1138 continue; 1139 cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); 1140 } else { 1141 cpumask_and(new_cpus, possible_mask, cs->effective_cpus); 1142 } 1143 set_cpus_allowed_ptr(task, new_cpus); 1144 } 1145 css_task_iter_end(&it); 1146 } 1147 1148 /** 1149 * compute_effective_cpumask - Compute the effective cpumask of the cpuset 1150 * @new_cpus: the temp variable for the new effective_cpus mask 1151 * @cs: the cpuset the need to recompute the new effective_cpus mask 1152 * @parent: the parent cpuset 1153 * 1154 * The result is valid only if the given cpuset isn't a partition root. 1155 */ 1156 static void compute_effective_cpumask(struct cpumask *new_cpus, 1157 struct cpuset *cs, struct cpuset *parent) 1158 { 1159 cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); 1160 } 1161 1162 /* 1163 * Commands for update_parent_effective_cpumask 1164 */ 1165 enum partition_cmd { 1166 partcmd_enable, /* Enable partition root */ 1167 partcmd_enablei, /* Enable isolated partition root */ 1168 partcmd_disable, /* Disable partition root */ 1169 partcmd_update, /* Update parent's effective_cpus */ 1170 partcmd_invalidate, /* Make partition invalid */ 1171 }; 1172 1173 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, 1174 struct tmpmasks *tmp); 1175 1176 /* 1177 * Update partition exclusive flag 1178 * 1179 * Return: 0 if successful, an error code otherwise 1180 */ 1181 static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs) 1182 { 1183 bool exclusive = (new_prs > PRS_MEMBER); 1184 1185 if (exclusive && !is_cpu_exclusive(cs)) { 1186 if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1)) 1187 return PERR_NOTEXCL; 1188 } else if (!exclusive && is_cpu_exclusive(cs)) { 1189 /* Turning off CS_CPU_EXCLUSIVE will not return error */ 1190 cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0); 1191 } 1192 return 0; 1193 } 1194 1195 /* 1196 * Update partition load balance flag and/or rebuild sched domain 1197 * 1198 * Changing load balance flag will automatically call 1199 * rebuild_sched_domains_locked(). 1200 * This function is for cgroup v2 only. 1201 */ 1202 static void update_partition_sd_lb(struct cpuset *cs, int old_prs) 1203 { 1204 int new_prs = cs->partition_root_state; 1205 bool rebuild_domains = (new_prs > 0) || (old_prs > 0); 1206 bool new_lb; 1207 1208 /* 1209 * If cs is not a valid partition root, the load balance state 1210 * will follow its parent. 1211 */ 1212 if (new_prs > 0) { 1213 new_lb = (new_prs != PRS_ISOLATED); 1214 } else { 1215 new_lb = is_sched_load_balance(parent_cs(cs)); 1216 } 1217 if (new_lb != !!is_sched_load_balance(cs)) { 1218 rebuild_domains = true; 1219 if (new_lb) 1220 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1221 else 1222 clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1223 } 1224 1225 if (rebuild_domains) 1226 cpuset_force_rebuild(); 1227 } 1228 1229 /* 1230 * tasks_nocpu_error - Return true if tasks will have no effective_cpus 1231 */ 1232 static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs, 1233 struct cpumask *xcpus) 1234 { 1235 /* 1236 * A populated partition (cs or parent) can't have empty effective_cpus 1237 */ 1238 return (cpumask_subset(parent->effective_cpus, xcpus) && 1239 partition_is_populated(parent, cs)) || 1240 (!cpumask_intersects(xcpus, cpu_active_mask) && 1241 partition_is_populated(cs, NULL)); 1242 } 1243 1244 static void reset_partition_data(struct cpuset *cs) 1245 { 1246 struct cpuset *parent = parent_cs(cs); 1247 1248 if (!cpuset_v2()) 1249 return; 1250 1251 lockdep_assert_held(&callback_lock); 1252 1253 cs->nr_subparts = 0; 1254 if (cpumask_empty(cs->exclusive_cpus)) { 1255 cpumask_clear(cs->effective_xcpus); 1256 if (is_cpu_exclusive(cs)) 1257 clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); 1258 } 1259 if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed)) 1260 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 1261 } 1262 1263 /* 1264 * isolated_cpus_update - Update the isolated_cpus mask 1265 * @old_prs: old partition_root_state 1266 * @new_prs: new partition_root_state 1267 * @xcpus: exclusive CPUs with state change 1268 */ 1269 static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus) 1270 { 1271 WARN_ON_ONCE(old_prs == new_prs); 1272 if (new_prs == PRS_ISOLATED) 1273 cpumask_or(isolated_cpus, isolated_cpus, xcpus); 1274 else 1275 cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); 1276 } 1277 1278 /* 1279 * partition_xcpus_add - Add new exclusive CPUs to partition 1280 * @new_prs: new partition_root_state 1281 * @parent: parent cpuset 1282 * @xcpus: exclusive CPUs to be added 1283 * Return: true if isolated_cpus modified, false otherwise 1284 * 1285 * Remote partition if parent == NULL 1286 */ 1287 static bool partition_xcpus_add(int new_prs, struct cpuset *parent, 1288 struct cpumask *xcpus) 1289 { 1290 bool isolcpus_updated; 1291 1292 WARN_ON_ONCE(new_prs < 0); 1293 lockdep_assert_held(&callback_lock); 1294 if (!parent) 1295 parent = &top_cpuset; 1296 1297 1298 if (parent == &top_cpuset) 1299 cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); 1300 1301 isolcpus_updated = (new_prs != parent->partition_root_state); 1302 if (isolcpus_updated) 1303 isolated_cpus_update(parent->partition_root_state, new_prs, 1304 xcpus); 1305 1306 cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); 1307 return isolcpus_updated; 1308 } 1309 1310 /* 1311 * partition_xcpus_del - Remove exclusive CPUs from partition 1312 * @old_prs: old partition_root_state 1313 * @parent: parent cpuset 1314 * @xcpus: exclusive CPUs to be removed 1315 * Return: true if isolated_cpus modified, false otherwise 1316 * 1317 * Remote partition if parent == NULL 1318 */ 1319 static bool partition_xcpus_del(int old_prs, struct cpuset *parent, 1320 struct cpumask *xcpus) 1321 { 1322 bool isolcpus_updated; 1323 1324 WARN_ON_ONCE(old_prs < 0); 1325 lockdep_assert_held(&callback_lock); 1326 if (!parent) 1327 parent = &top_cpuset; 1328 1329 if (parent == &top_cpuset) 1330 cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); 1331 1332 isolcpus_updated = (old_prs != parent->partition_root_state); 1333 if (isolcpus_updated) 1334 isolated_cpus_update(old_prs, parent->partition_root_state, 1335 xcpus); 1336 1337 cpumask_and(xcpus, xcpus, cpu_active_mask); 1338 cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); 1339 return isolcpus_updated; 1340 } 1341 1342 static void update_unbound_workqueue_cpumask(bool isolcpus_updated) 1343 { 1344 int ret; 1345 1346 lockdep_assert_cpus_held(); 1347 1348 if (!isolcpus_updated) 1349 return; 1350 1351 ret = workqueue_unbound_exclude_cpumask(isolated_cpus); 1352 WARN_ON_ONCE(ret < 0); 1353 } 1354 1355 /** 1356 * cpuset_cpu_is_isolated - Check if the given CPU is isolated 1357 * @cpu: the CPU number to be checked 1358 * Return: true if CPU is used in an isolated partition, false otherwise 1359 */ 1360 bool cpuset_cpu_is_isolated(int cpu) 1361 { 1362 return cpumask_test_cpu(cpu, isolated_cpus); 1363 } 1364 EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); 1365 1366 /* 1367 * compute_effective_exclusive_cpumask - compute effective exclusive CPUs 1368 * @cs: cpuset 1369 * @xcpus: effective exclusive CPUs value to be set 1370 * @real_cs: the real cpuset (can be NULL) 1371 * Return: 0 if there is no sibling conflict, > 0 otherwise 1372 * 1373 * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to 1374 * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus 1375 * as well. The provision of real_cs means that a cpumask is being changed and 1376 * the given cs is a trial one. 1377 */ 1378 static int compute_effective_exclusive_cpumask(struct cpuset *cs, 1379 struct cpumask *xcpus, 1380 struct cpuset *real_cs) 1381 { 1382 struct cgroup_subsys_state *css; 1383 struct cpuset *parent = parent_cs(cs); 1384 struct cpuset *sibling; 1385 int retval = 0; 1386 1387 if (!xcpus) 1388 xcpus = cs->effective_xcpus; 1389 1390 cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); 1391 1392 if (!real_cs) { 1393 if (!cpumask_empty(cs->exclusive_cpus)) 1394 return 0; 1395 } else { 1396 cs = real_cs; 1397 } 1398 1399 /* 1400 * Exclude exclusive CPUs from siblings 1401 */ 1402 rcu_read_lock(); 1403 cpuset_for_each_child(sibling, css, parent) { 1404 if (sibling == cs) 1405 continue; 1406 1407 if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) { 1408 cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus); 1409 retval++; 1410 continue; 1411 } 1412 if (cpumask_intersects(xcpus, sibling->effective_xcpus)) { 1413 cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus); 1414 retval++; 1415 } 1416 } 1417 rcu_read_unlock(); 1418 return retval; 1419 } 1420 1421 static inline bool is_remote_partition(struct cpuset *cs) 1422 { 1423 return !list_empty(&cs->remote_sibling); 1424 } 1425 1426 static inline bool is_local_partition(struct cpuset *cs) 1427 { 1428 return is_partition_valid(cs) && !is_remote_partition(cs); 1429 } 1430 1431 /* 1432 * remote_partition_enable - Enable current cpuset as a remote partition root 1433 * @cs: the cpuset to update 1434 * @new_prs: new partition_root_state 1435 * @tmp: temporary masks 1436 * Return: 0 if successful, errcode if error 1437 * 1438 * Enable the current cpuset to become a remote partition root taking CPUs 1439 * directly from the top cpuset. cpuset_mutex must be held by the caller. 1440 */ 1441 static int remote_partition_enable(struct cpuset *cs, int new_prs, 1442 struct tmpmasks *tmp) 1443 { 1444 bool isolcpus_updated; 1445 1446 /* 1447 * The user must have sysadmin privilege. 1448 */ 1449 if (!capable(CAP_SYS_ADMIN)) 1450 return PERR_ACCESS; 1451 1452 /* 1453 * The requested exclusive_cpus must not be allocated to other 1454 * partitions and it can't use up all the root's effective_cpus. 1455 * 1456 * The effective_xcpus mask can contain offline CPUs, but there must 1457 * be at least one or more online CPUs present before it can be enabled. 1458 * 1459 * Note that creating a remote partition with any local partition root 1460 * above it or remote partition root underneath it is not allowed. 1461 */ 1462 compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL); 1463 WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); 1464 if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || 1465 cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) 1466 return PERR_INVCPUS; 1467 1468 spin_lock_irq(&callback_lock); 1469 isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); 1470 list_add(&cs->remote_sibling, &remote_children); 1471 cpumask_copy(cs->effective_xcpus, tmp->new_cpus); 1472 spin_unlock_irq(&callback_lock); 1473 update_unbound_workqueue_cpumask(isolcpus_updated); 1474 cpuset_force_rebuild(); 1475 cs->prs_err = 0; 1476 1477 /* 1478 * Propagate changes in top_cpuset's effective_cpus down the hierarchy. 1479 */ 1480 cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); 1481 update_sibling_cpumasks(&top_cpuset, NULL, tmp); 1482 return 0; 1483 } 1484 1485 /* 1486 * remote_partition_disable - Remove current cpuset from remote partition list 1487 * @cs: the cpuset to update 1488 * @tmp: temporary masks 1489 * 1490 * The effective_cpus is also updated. 1491 * 1492 * cpuset_mutex must be held by the caller. 1493 */ 1494 static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) 1495 { 1496 bool isolcpus_updated; 1497 1498 WARN_ON_ONCE(!is_remote_partition(cs)); 1499 WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); 1500 1501 spin_lock_irq(&callback_lock); 1502 list_del_init(&cs->remote_sibling); 1503 isolcpus_updated = partition_xcpus_del(cs->partition_root_state, 1504 NULL, cs->effective_xcpus); 1505 if (cs->prs_err) 1506 cs->partition_root_state = -cs->partition_root_state; 1507 else 1508 cs->partition_root_state = PRS_MEMBER; 1509 1510 /* effective_xcpus may need to be changed */ 1511 compute_effective_exclusive_cpumask(cs, NULL, NULL); 1512 reset_partition_data(cs); 1513 spin_unlock_irq(&callback_lock); 1514 update_unbound_workqueue_cpumask(isolcpus_updated); 1515 cpuset_force_rebuild(); 1516 1517 /* 1518 * Propagate changes in top_cpuset's effective_cpus down the hierarchy. 1519 */ 1520 cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); 1521 update_sibling_cpumasks(&top_cpuset, NULL, tmp); 1522 } 1523 1524 /* 1525 * remote_cpus_update - cpus_exclusive change of remote partition 1526 * @cs: the cpuset to be updated 1527 * @xcpus: the new exclusive_cpus mask, if non-NULL 1528 * @excpus: the new effective_xcpus mask 1529 * @tmp: temporary masks 1530 * 1531 * top_cpuset and subpartitions_cpus will be updated or partition can be 1532 * invalidated. 1533 */ 1534 static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, 1535 struct cpumask *excpus, struct tmpmasks *tmp) 1536 { 1537 bool adding, deleting; 1538 int prs = cs->partition_root_state; 1539 int isolcpus_updated = 0; 1540 1541 if (WARN_ON_ONCE(!is_remote_partition(cs))) 1542 return; 1543 1544 WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); 1545 1546 if (cpumask_empty(excpus)) { 1547 cs->prs_err = PERR_CPUSEMPTY; 1548 goto invalidate; 1549 } 1550 1551 adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus); 1552 deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus); 1553 1554 /* 1555 * Additions of remote CPUs is only allowed if those CPUs are 1556 * not allocated to other partitions and there are effective_cpus 1557 * left in the top cpuset. 1558 */ 1559 if (adding) { 1560 WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus)); 1561 if (!capable(CAP_SYS_ADMIN)) 1562 cs->prs_err = PERR_ACCESS; 1563 else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) || 1564 cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)) 1565 cs->prs_err = PERR_NOCPUS; 1566 if (cs->prs_err) 1567 goto invalidate; 1568 } 1569 1570 spin_lock_irq(&callback_lock); 1571 if (adding) 1572 isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); 1573 if (deleting) 1574 isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); 1575 /* 1576 * Need to update effective_xcpus and exclusive_cpus now as 1577 * update_sibling_cpumasks() below may iterate back to the same cs. 1578 */ 1579 cpumask_copy(cs->effective_xcpus, excpus); 1580 if (xcpus) 1581 cpumask_copy(cs->exclusive_cpus, xcpus); 1582 spin_unlock_irq(&callback_lock); 1583 update_unbound_workqueue_cpumask(isolcpus_updated); 1584 if (adding || deleting) 1585 cpuset_force_rebuild(); 1586 1587 /* 1588 * Propagate changes in top_cpuset's effective_cpus down the hierarchy. 1589 */ 1590 cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); 1591 update_sibling_cpumasks(&top_cpuset, NULL, tmp); 1592 return; 1593 1594 invalidate: 1595 remote_partition_disable(cs, tmp); 1596 } 1597 1598 /* 1599 * prstate_housekeeping_conflict - check for partition & housekeeping conflicts 1600 * @prstate: partition root state to be checked 1601 * @new_cpus: cpu mask 1602 * Return: true if there is conflict, false otherwise 1603 * 1604 * CPUs outside of boot_hk_cpus, if defined, can only be used in an 1605 * isolated partition. 1606 */ 1607 static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) 1608 { 1609 if (!have_boot_isolcpus) 1610 return false; 1611 1612 if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) 1613 return true; 1614 1615 return false; 1616 } 1617 1618 /** 1619 * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset 1620 * @cs: The cpuset that requests change in partition root state 1621 * @cmd: Partition root state change command 1622 * @newmask: Optional new cpumask for partcmd_update 1623 * @tmp: Temporary addmask and delmask 1624 * Return: 0 or a partition root state error code 1625 * 1626 * For partcmd_enable*, the cpuset is being transformed from a non-partition 1627 * root to a partition root. The effective_xcpus (cpus_allowed if 1628 * effective_xcpus not set) mask of the given cpuset will be taken away from 1629 * parent's effective_cpus. The function will return 0 if all the CPUs listed 1630 * in effective_xcpus can be granted or an error code will be returned. 1631 * 1632 * For partcmd_disable, the cpuset is being transformed from a partition 1633 * root back to a non-partition root. Any CPUs in effective_xcpus will be 1634 * given back to parent's effective_cpus. 0 will always be returned. 1635 * 1636 * For partcmd_update, if the optional newmask is specified, the cpu list is 1637 * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is 1638 * assumed to remain the same. The cpuset should either be a valid or invalid 1639 * partition root. The partition root state may change from valid to invalid 1640 * or vice versa. An error code will be returned if transitioning from 1641 * invalid to valid violates the exclusivity rule. 1642 * 1643 * For partcmd_invalidate, the current partition will be made invalid. 1644 * 1645 * The partcmd_enable* and partcmd_disable commands are used by 1646 * update_prstate(). An error code may be returned and the caller will check 1647 * for error. 1648 * 1649 * The partcmd_update command is used by update_cpumasks_hier() with newmask 1650 * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used 1651 * by update_cpumask() with NULL newmask. In both cases, the callers won't 1652 * check for error and so partition_root_state and prs_err will be updated 1653 * directly. 1654 */ 1655 static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, 1656 struct cpumask *newmask, 1657 struct tmpmasks *tmp) 1658 { 1659 struct cpuset *parent = parent_cs(cs); 1660 int adding; /* Adding cpus to parent's effective_cpus */ 1661 int deleting; /* Deleting cpus from parent's effective_cpus */ 1662 int old_prs, new_prs; 1663 int part_error = PERR_NONE; /* Partition error? */ 1664 int subparts_delta = 0; 1665 int isolcpus_updated = 0; 1666 struct cpumask *xcpus = user_xcpus(cs); 1667 bool nocpu; 1668 1669 lockdep_assert_held(&cpuset_mutex); 1670 WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */ 1671 1672 /* 1673 * new_prs will only be changed for the partcmd_update and 1674 * partcmd_invalidate commands. 1675 */ 1676 adding = deleting = false; 1677 old_prs = new_prs = cs->partition_root_state; 1678 1679 if (cmd == partcmd_invalidate) { 1680 if (is_prs_invalid(old_prs)) 1681 return 0; 1682 1683 /* 1684 * Make the current partition invalid. 1685 */ 1686 if (is_partition_valid(parent)) 1687 adding = cpumask_and(tmp->addmask, 1688 xcpus, parent->effective_xcpus); 1689 if (old_prs > 0) { 1690 new_prs = -old_prs; 1691 subparts_delta--; 1692 } 1693 goto write_error; 1694 } 1695 1696 /* 1697 * The parent must be a partition root. 1698 * The new cpumask, if present, or the current cpus_allowed must 1699 * not be empty. 1700 */ 1701 if (!is_partition_valid(parent)) { 1702 return is_partition_invalid(parent) 1703 ? PERR_INVPARENT : PERR_NOTPART; 1704 } 1705 if (!newmask && xcpus_empty(cs)) 1706 return PERR_CPUSEMPTY; 1707 1708 nocpu = tasks_nocpu_error(parent, cs, xcpus); 1709 1710 if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { 1711 /* 1712 * Need to call compute_effective_exclusive_cpumask() in case 1713 * exclusive_cpus not set. Sibling conflict should only happen 1714 * if exclusive_cpus isn't set. 1715 */ 1716 xcpus = tmp->delmask; 1717 if (compute_effective_exclusive_cpumask(cs, xcpus, NULL)) 1718 WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); 1719 1720 /* 1721 * Enabling partition root is not allowed if its 1722 * effective_xcpus is empty. 1723 */ 1724 if (cpumask_empty(xcpus)) 1725 return PERR_INVCPUS; 1726 1727 if (prstate_housekeeping_conflict(new_prs, xcpus)) 1728 return PERR_HKEEPING; 1729 1730 /* 1731 * A parent can be left with no CPU as long as there is no 1732 * task directly associated with the parent partition. 1733 */ 1734 if (nocpu) 1735 return PERR_NOCPUS; 1736 1737 /* 1738 * This function will only be called when all the preliminary 1739 * checks have passed. At this point, the following condition 1740 * should hold. 1741 * 1742 * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus 1743 * 1744 * Warn if it is not the case. 1745 */ 1746 cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask); 1747 WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); 1748 1749 deleting = true; 1750 subparts_delta++; 1751 new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; 1752 } else if (cmd == partcmd_disable) { 1753 /* 1754 * May need to add cpus back to parent's effective_cpus 1755 * (and maybe removed from subpartitions_cpus/isolated_cpus) 1756 * for valid partition root. xcpus may contain CPUs that 1757 * shouldn't be removed from the two global cpumasks. 1758 */ 1759 if (is_partition_valid(cs)) { 1760 cpumask_copy(tmp->addmask, cs->effective_xcpus); 1761 adding = true; 1762 subparts_delta--; 1763 } 1764 new_prs = PRS_MEMBER; 1765 } else if (newmask) { 1766 /* 1767 * Empty cpumask is not allowed 1768 */ 1769 if (cpumask_empty(newmask)) { 1770 part_error = PERR_CPUSEMPTY; 1771 goto write_error; 1772 } 1773 1774 /* Check newmask again, whether cpus are available for parent/cs */ 1775 nocpu |= tasks_nocpu_error(parent, cs, newmask); 1776 1777 /* 1778 * partcmd_update with newmask: 1779 * 1780 * Compute add/delete mask to/from effective_cpus 1781 * 1782 * For valid partition: 1783 * addmask = exclusive_cpus & ~newmask 1784 * & parent->effective_xcpus 1785 * delmask = newmask & ~exclusive_cpus 1786 * & parent->effective_xcpus 1787 * 1788 * For invalid partition: 1789 * delmask = newmask & parent->effective_xcpus 1790 */ 1791 if (is_prs_invalid(old_prs)) { 1792 adding = false; 1793 deleting = cpumask_and(tmp->delmask, 1794 newmask, parent->effective_xcpus); 1795 } else { 1796 cpumask_andnot(tmp->addmask, xcpus, newmask); 1797 adding = cpumask_and(tmp->addmask, tmp->addmask, 1798 parent->effective_xcpus); 1799 1800 cpumask_andnot(tmp->delmask, newmask, xcpus); 1801 deleting = cpumask_and(tmp->delmask, tmp->delmask, 1802 parent->effective_xcpus); 1803 } 1804 /* 1805 * The new CPUs to be removed from parent's effective CPUs 1806 * must be present. 1807 */ 1808 if (deleting) { 1809 cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask); 1810 WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); 1811 } 1812 1813 /* 1814 * Make partition invalid if parent's effective_cpus could 1815 * become empty and there are tasks in the parent. 1816 */ 1817 if (nocpu && (!adding || 1818 !cpumask_intersects(tmp->addmask, cpu_active_mask))) { 1819 part_error = PERR_NOCPUS; 1820 deleting = false; 1821 adding = cpumask_and(tmp->addmask, 1822 xcpus, parent->effective_xcpus); 1823 } 1824 } else { 1825 /* 1826 * partcmd_update w/o newmask 1827 * 1828 * delmask = effective_xcpus & parent->effective_cpus 1829 * 1830 * This can be called from: 1831 * 1) update_cpumasks_hier() 1832 * 2) cpuset_hotplug_update_tasks() 1833 * 1834 * Check to see if it can be transitioned from valid to 1835 * invalid partition or vice versa. 1836 * 1837 * A partition error happens when parent has tasks and all 1838 * its effective CPUs will have to be distributed out. 1839 */ 1840 WARN_ON_ONCE(!is_partition_valid(parent)); 1841 if (nocpu) { 1842 part_error = PERR_NOCPUS; 1843 if (is_partition_valid(cs)) 1844 adding = cpumask_and(tmp->addmask, 1845 xcpus, parent->effective_xcpus); 1846 } else if (is_partition_invalid(cs) && 1847 cpumask_subset(xcpus, parent->effective_xcpus)) { 1848 struct cgroup_subsys_state *css; 1849 struct cpuset *child; 1850 bool exclusive = true; 1851 1852 /* 1853 * Convert invalid partition to valid has to 1854 * pass the cpu exclusivity test. 1855 */ 1856 rcu_read_lock(); 1857 cpuset_for_each_child(child, css, parent) { 1858 if (child == cs) 1859 continue; 1860 if (!cpusets_are_exclusive(cs, child)) { 1861 exclusive = false; 1862 break; 1863 } 1864 } 1865 rcu_read_unlock(); 1866 if (exclusive) 1867 deleting = cpumask_and(tmp->delmask, 1868 xcpus, parent->effective_cpus); 1869 else 1870 part_error = PERR_NOTEXCL; 1871 } 1872 } 1873 1874 write_error: 1875 if (part_error) 1876 WRITE_ONCE(cs->prs_err, part_error); 1877 1878 if (cmd == partcmd_update) { 1879 /* 1880 * Check for possible transition between valid and invalid 1881 * partition root. 1882 */ 1883 switch (cs->partition_root_state) { 1884 case PRS_ROOT: 1885 case PRS_ISOLATED: 1886 if (part_error) { 1887 new_prs = -old_prs; 1888 subparts_delta--; 1889 } 1890 break; 1891 case PRS_INVALID_ROOT: 1892 case PRS_INVALID_ISOLATED: 1893 if (!part_error) { 1894 new_prs = -old_prs; 1895 subparts_delta++; 1896 } 1897 break; 1898 } 1899 } 1900 1901 if (!adding && !deleting && (new_prs == old_prs)) 1902 return 0; 1903 1904 /* 1905 * Transitioning between invalid to valid or vice versa may require 1906 * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update, 1907 * validate_change() has already been successfully called and 1908 * CPU lists in cs haven't been updated yet. So defer it to later. 1909 */ 1910 if ((old_prs != new_prs) && (cmd != partcmd_update)) { 1911 int err = update_partition_exclusive_flag(cs, new_prs); 1912 1913 if (err) 1914 return err; 1915 } 1916 1917 /* 1918 * Change the parent's effective_cpus & effective_xcpus (top cpuset 1919 * only). 1920 * 1921 * Newly added CPUs will be removed from effective_cpus and 1922 * newly deleted ones will be added back to effective_cpus. 1923 */ 1924 spin_lock_irq(&callback_lock); 1925 if (old_prs != new_prs) { 1926 cs->partition_root_state = new_prs; 1927 if (new_prs <= 0) 1928 cs->nr_subparts = 0; 1929 } 1930 /* 1931 * Adding to parent's effective_cpus means deletion CPUs from cs 1932 * and vice versa. 1933 */ 1934 if (adding) 1935 isolcpus_updated += partition_xcpus_del(old_prs, parent, 1936 tmp->addmask); 1937 if (deleting) 1938 isolcpus_updated += partition_xcpus_add(new_prs, parent, 1939 tmp->delmask); 1940 1941 if (is_partition_valid(parent)) { 1942 parent->nr_subparts += subparts_delta; 1943 WARN_ON_ONCE(parent->nr_subparts < 0); 1944 } 1945 spin_unlock_irq(&callback_lock); 1946 update_unbound_workqueue_cpumask(isolcpus_updated); 1947 1948 if ((old_prs != new_prs) && (cmd == partcmd_update)) 1949 update_partition_exclusive_flag(cs, new_prs); 1950 1951 if (adding || deleting) { 1952 cpuset_update_tasks_cpumask(parent, tmp->addmask); 1953 update_sibling_cpumasks(parent, cs, tmp); 1954 } 1955 1956 /* 1957 * For partcmd_update without newmask, it is being called from 1958 * cpuset_handle_hotplug(). Update the load balance flag and 1959 * scheduling domain accordingly. 1960 */ 1961 if ((cmd == partcmd_update) && !newmask) 1962 update_partition_sd_lb(cs, old_prs); 1963 1964 notify_partition_change(cs, old_prs); 1965 return 0; 1966 } 1967 1968 /** 1969 * compute_partition_effective_cpumask - compute effective_cpus for partition 1970 * @cs: partition root cpuset 1971 * @new_ecpus: previously computed effective_cpus to be updated 1972 * 1973 * Compute the effective_cpus of a partition root by scanning effective_xcpus 1974 * of child partition roots and excluding their effective_xcpus. 1975 * 1976 * This has the side effect of invalidating valid child partition roots, 1977 * if necessary. Since it is called from either cpuset_hotplug_update_tasks() 1978 * or update_cpumasks_hier() where parent and children are modified 1979 * successively, we don't need to call update_parent_effective_cpumask() 1980 * and the child's effective_cpus will be updated in later iterations. 1981 * 1982 * Note that rcu_read_lock() is assumed to be held. 1983 */ 1984 static void compute_partition_effective_cpumask(struct cpuset *cs, 1985 struct cpumask *new_ecpus) 1986 { 1987 struct cgroup_subsys_state *css; 1988 struct cpuset *child; 1989 bool populated = partition_is_populated(cs, NULL); 1990 1991 /* 1992 * Check child partition roots to see if they should be 1993 * invalidated when 1994 * 1) child effective_xcpus not a subset of new 1995 * excluisve_cpus 1996 * 2) All the effective_cpus will be used up and cp 1997 * has tasks 1998 */ 1999 compute_effective_exclusive_cpumask(cs, new_ecpus, NULL); 2000 cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); 2001 2002 rcu_read_lock(); 2003 cpuset_for_each_child(child, css, cs) { 2004 if (!is_partition_valid(child)) 2005 continue; 2006 2007 /* 2008 * There shouldn't be a remote partition underneath another 2009 * partition root. 2010 */ 2011 WARN_ON_ONCE(is_remote_partition(child)); 2012 child->prs_err = 0; 2013 if (!cpumask_subset(child->effective_xcpus, 2014 cs->effective_xcpus)) 2015 child->prs_err = PERR_INVCPUS; 2016 else if (populated && 2017 cpumask_subset(new_ecpus, child->effective_xcpus)) 2018 child->prs_err = PERR_NOCPUS; 2019 2020 if (child->prs_err) { 2021 int old_prs = child->partition_root_state; 2022 2023 /* 2024 * Invalidate child partition 2025 */ 2026 spin_lock_irq(&callback_lock); 2027 make_partition_invalid(child); 2028 cs->nr_subparts--; 2029 child->nr_subparts = 0; 2030 spin_unlock_irq(&callback_lock); 2031 notify_partition_change(child, old_prs); 2032 continue; 2033 } 2034 cpumask_andnot(new_ecpus, new_ecpus, 2035 child->effective_xcpus); 2036 } 2037 rcu_read_unlock(); 2038 } 2039 2040 /* 2041 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree 2042 * @cs: the cpuset to consider 2043 * @tmp: temp variables for calculating effective_cpus & partition setup 2044 * @force: don't skip any descendant cpusets if set 2045 * 2046 * When configured cpumask is changed, the effective cpumasks of this cpuset 2047 * and all its descendants need to be updated. 2048 * 2049 * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. 2050 * 2051 * Called with cpuset_mutex held 2052 */ 2053 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, 2054 bool force) 2055 { 2056 struct cpuset *cp; 2057 struct cgroup_subsys_state *pos_css; 2058 bool need_rebuild_sched_domains = false; 2059 int old_prs, new_prs; 2060 2061 rcu_read_lock(); 2062 cpuset_for_each_descendant_pre(cp, pos_css, cs) { 2063 struct cpuset *parent = parent_cs(cp); 2064 bool remote = is_remote_partition(cp); 2065 bool update_parent = false; 2066 2067 old_prs = new_prs = cp->partition_root_state; 2068 2069 /* 2070 * For child remote partition root (!= cs), we need to call 2071 * remote_cpus_update() if effective_xcpus will be changed. 2072 * Otherwise, we can skip the whole subtree. 2073 * 2074 * remote_cpus_update() will reuse tmp->new_cpus only after 2075 * its value is being processed. 2076 */ 2077 if (remote && (cp != cs)) { 2078 compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL); 2079 if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { 2080 pos_css = css_rightmost_descendant(pos_css); 2081 continue; 2082 } 2083 rcu_read_unlock(); 2084 remote_cpus_update(cp, NULL, tmp->new_cpus, tmp); 2085 rcu_read_lock(); 2086 2087 /* Remote partition may be invalidated */ 2088 new_prs = cp->partition_root_state; 2089 remote = (new_prs == old_prs); 2090 } 2091 2092 if (remote || (is_partition_valid(parent) && is_partition_valid(cp))) 2093 compute_partition_effective_cpumask(cp, tmp->new_cpus); 2094 else 2095 compute_effective_cpumask(tmp->new_cpus, cp, parent); 2096 2097 if (remote) 2098 goto get_css; /* Ready to update cpuset data */ 2099 2100 /* 2101 * A partition with no effective_cpus is allowed as long as 2102 * there is no task associated with it. Call 2103 * update_parent_effective_cpumask() to check it. 2104 */ 2105 if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) { 2106 update_parent = true; 2107 goto update_parent_effective; 2108 } 2109 2110 /* 2111 * If it becomes empty, inherit the effective mask of the 2112 * parent, which is guaranteed to have some CPUs unless 2113 * it is a partition root that has explicitly distributed 2114 * out all its CPUs. 2115 */ 2116 if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) 2117 cpumask_copy(tmp->new_cpus, parent->effective_cpus); 2118 2119 /* 2120 * Skip the whole subtree if 2121 * 1) the cpumask remains the same, 2122 * 2) has no partition root state, 2123 * 3) force flag not set, and 2124 * 4) for v2 load balance state same as its parent. 2125 */ 2126 if (!cp->partition_root_state && !force && 2127 cpumask_equal(tmp->new_cpus, cp->effective_cpus) && 2128 (!cpuset_v2() || 2129 (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { 2130 pos_css = css_rightmost_descendant(pos_css); 2131 continue; 2132 } 2133 2134 update_parent_effective: 2135 /* 2136 * update_parent_effective_cpumask() should have been called 2137 * for cs already in update_cpumask(). We should also call 2138 * cpuset_update_tasks_cpumask() again for tasks in the parent 2139 * cpuset if the parent's effective_cpus changes. 2140 */ 2141 if ((cp != cs) && old_prs) { 2142 switch (parent->partition_root_state) { 2143 case PRS_ROOT: 2144 case PRS_ISOLATED: 2145 update_parent = true; 2146 break; 2147 2148 default: 2149 /* 2150 * When parent is not a partition root or is 2151 * invalid, child partition roots become 2152 * invalid too. 2153 */ 2154 if (is_partition_valid(cp)) 2155 new_prs = -cp->partition_root_state; 2156 WRITE_ONCE(cp->prs_err, 2157 is_partition_invalid(parent) 2158 ? PERR_INVPARENT : PERR_NOTPART); 2159 break; 2160 } 2161 } 2162 get_css: 2163 if (!css_tryget_online(&cp->css)) 2164 continue; 2165 rcu_read_unlock(); 2166 2167 if (update_parent) { 2168 update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp); 2169 /* 2170 * The cpuset partition_root_state may become 2171 * invalid. Capture it. 2172 */ 2173 new_prs = cp->partition_root_state; 2174 } 2175 2176 spin_lock_irq(&callback_lock); 2177 cpumask_copy(cp->effective_cpus, tmp->new_cpus); 2178 cp->partition_root_state = new_prs; 2179 if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) 2180 compute_effective_exclusive_cpumask(cp, NULL, NULL); 2181 2182 /* 2183 * Make sure effective_xcpus is properly set for a valid 2184 * partition root. 2185 */ 2186 if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) 2187 cpumask_and(cp->effective_xcpus, 2188 cp->cpus_allowed, parent->effective_xcpus); 2189 else if (new_prs < 0) 2190 reset_partition_data(cp); 2191 spin_unlock_irq(&callback_lock); 2192 2193 notify_partition_change(cp, old_prs); 2194 2195 WARN_ON(!is_in_v2_mode() && 2196 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); 2197 2198 cpuset_update_tasks_cpumask(cp, cp->effective_cpus); 2199 2200 /* 2201 * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE 2202 * from parent if current cpuset isn't a valid partition root 2203 * and their load balance states differ. 2204 */ 2205 if (cpuset_v2() && !is_partition_valid(cp) && 2206 (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { 2207 if (is_sched_load_balance(parent)) 2208 set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); 2209 else 2210 clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); 2211 } 2212 2213 /* 2214 * On legacy hierarchy, if the effective cpumask of any non- 2215 * empty cpuset is changed, we need to rebuild sched domains. 2216 * On default hierarchy, the cpuset needs to be a partition 2217 * root as well. 2218 */ 2219 if (!cpumask_empty(cp->cpus_allowed) && 2220 is_sched_load_balance(cp) && 2221 (!cpuset_v2() || is_partition_valid(cp))) 2222 need_rebuild_sched_domains = true; 2223 2224 rcu_read_lock(); 2225 css_put(&cp->css); 2226 } 2227 rcu_read_unlock(); 2228 2229 if (need_rebuild_sched_domains) 2230 cpuset_force_rebuild(); 2231 } 2232 2233 /** 2234 * update_sibling_cpumasks - Update siblings cpumasks 2235 * @parent: Parent cpuset 2236 * @cs: Current cpuset 2237 * @tmp: Temp variables 2238 */ 2239 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, 2240 struct tmpmasks *tmp) 2241 { 2242 struct cpuset *sibling; 2243 struct cgroup_subsys_state *pos_css; 2244 2245 lockdep_assert_held(&cpuset_mutex); 2246 2247 /* 2248 * Check all its siblings and call update_cpumasks_hier() 2249 * if their effective_cpus will need to be changed. 2250 * 2251 * It is possible a change in parent's effective_cpus 2252 * due to a change in a child partition's effective_xcpus will impact 2253 * its siblings even if they do not inherit parent's effective_cpus 2254 * directly. 2255 * 2256 * The update_cpumasks_hier() function may sleep. So we have to 2257 * release the RCU read lock before calling it. 2258 */ 2259 rcu_read_lock(); 2260 cpuset_for_each_child(sibling, pos_css, parent) { 2261 if (sibling == cs) 2262 continue; 2263 if (!is_partition_valid(sibling)) { 2264 compute_effective_cpumask(tmp->new_cpus, sibling, 2265 parent); 2266 if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) 2267 continue; 2268 } else if (is_remote_partition(sibling)) { 2269 /* 2270 * Change in a sibling cpuset won't affect a remote 2271 * partition root. 2272 */ 2273 continue; 2274 } 2275 2276 if (!css_tryget_online(&sibling->css)) 2277 continue; 2278 2279 rcu_read_unlock(); 2280 update_cpumasks_hier(sibling, tmp, false); 2281 rcu_read_lock(); 2282 css_put(&sibling->css); 2283 } 2284 rcu_read_unlock(); 2285 } 2286 2287 /** 2288 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 2289 * @cs: the cpuset to consider 2290 * @trialcs: trial cpuset 2291 * @buf: buffer of cpu numbers written to this cpuset 2292 */ 2293 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 2294 const char *buf) 2295 { 2296 int retval; 2297 struct tmpmasks tmp; 2298 struct cpuset *parent = parent_cs(cs); 2299 bool invalidate = false; 2300 bool force = false; 2301 int old_prs = cs->partition_root_state; 2302 2303 /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */ 2304 if (cs == &top_cpuset) 2305 return -EACCES; 2306 2307 /* 2308 * An empty cpus_allowed is ok only if the cpuset has no tasks. 2309 * Since cpulist_parse() fails on an empty mask, we special case 2310 * that parsing. The validate_change() call ensures that cpusets 2311 * with tasks have cpus. 2312 */ 2313 if (!*buf) { 2314 cpumask_clear(trialcs->cpus_allowed); 2315 if (cpumask_empty(trialcs->exclusive_cpus)) 2316 cpumask_clear(trialcs->effective_xcpus); 2317 } else { 2318 retval = cpulist_parse(buf, trialcs->cpus_allowed); 2319 if (retval < 0) 2320 return retval; 2321 2322 if (!cpumask_subset(trialcs->cpus_allowed, 2323 top_cpuset.cpus_allowed)) 2324 return -EINVAL; 2325 2326 /* 2327 * When exclusive_cpus isn't explicitly set, it is constrained 2328 * by cpus_allowed and parent's effective_xcpus. Otherwise, 2329 * trialcs->effective_xcpus is used as a temporary cpumask 2330 * for checking validity of the partition root. 2331 */ 2332 trialcs->partition_root_state = PRS_MEMBER; 2333 if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) 2334 compute_effective_exclusive_cpumask(trialcs, NULL, cs); 2335 } 2336 2337 /* Nothing to do if the cpus didn't change */ 2338 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) 2339 return 0; 2340 2341 if (alloc_cpumasks(NULL, &tmp)) 2342 return -ENOMEM; 2343 2344 if (old_prs) { 2345 if (is_partition_valid(cs) && 2346 cpumask_empty(trialcs->effective_xcpus)) { 2347 invalidate = true; 2348 cs->prs_err = PERR_INVCPUS; 2349 } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { 2350 invalidate = true; 2351 cs->prs_err = PERR_HKEEPING; 2352 } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { 2353 invalidate = true; 2354 cs->prs_err = PERR_NOCPUS; 2355 } 2356 } 2357 2358 /* 2359 * Check all the descendants in update_cpumasks_hier() if 2360 * effective_xcpus is to be changed. 2361 */ 2362 force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); 2363 2364 retval = validate_change(cs, trialcs); 2365 2366 if ((retval == -EINVAL) && cpuset_v2()) { 2367 struct cgroup_subsys_state *css; 2368 struct cpuset *cp; 2369 2370 /* 2371 * The -EINVAL error code indicates that partition sibling 2372 * CPU exclusivity rule has been violated. We still allow 2373 * the cpumask change to proceed while invalidating the 2374 * partition. However, any conflicting sibling partitions 2375 * have to be marked as invalid too. 2376 */ 2377 invalidate = true; 2378 rcu_read_lock(); 2379 cpuset_for_each_child(cp, css, parent) { 2380 struct cpumask *xcpus = user_xcpus(trialcs); 2381 2382 if (is_partition_valid(cp) && 2383 cpumask_intersects(xcpus, cp->effective_xcpus)) { 2384 rcu_read_unlock(); 2385 update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp); 2386 rcu_read_lock(); 2387 } 2388 } 2389 rcu_read_unlock(); 2390 retval = 0; 2391 } 2392 2393 if (retval < 0) 2394 goto out_free; 2395 2396 if (is_partition_valid(cs) || 2397 (is_partition_invalid(cs) && !invalidate)) { 2398 struct cpumask *xcpus = trialcs->effective_xcpus; 2399 2400 if (cpumask_empty(xcpus) && is_partition_invalid(cs)) 2401 xcpus = trialcs->cpus_allowed; 2402 2403 /* 2404 * Call remote_cpus_update() to handle valid remote partition 2405 */ 2406 if (is_remote_partition(cs)) 2407 remote_cpus_update(cs, NULL, xcpus, &tmp); 2408 else if (invalidate) 2409 update_parent_effective_cpumask(cs, partcmd_invalidate, 2410 NULL, &tmp); 2411 else 2412 update_parent_effective_cpumask(cs, partcmd_update, 2413 xcpus, &tmp); 2414 } 2415 2416 spin_lock_irq(&callback_lock); 2417 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 2418 cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); 2419 if ((old_prs > 0) && !is_partition_valid(cs)) 2420 reset_partition_data(cs); 2421 spin_unlock_irq(&callback_lock); 2422 2423 /* effective_cpus/effective_xcpus will be updated here */ 2424 update_cpumasks_hier(cs, &tmp, force); 2425 2426 /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ 2427 if (cs->partition_root_state) 2428 update_partition_sd_lb(cs, old_prs); 2429 out_free: 2430 free_cpumasks(NULL, &tmp); 2431 return retval; 2432 } 2433 2434 /** 2435 * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset 2436 * @cs: the cpuset to consider 2437 * @trialcs: trial cpuset 2438 * @buf: buffer of cpu numbers written to this cpuset 2439 * 2440 * The tasks' cpumask will be updated if cs is a valid partition root. 2441 */ 2442 static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, 2443 const char *buf) 2444 { 2445 int retval; 2446 struct tmpmasks tmp; 2447 struct cpuset *parent = parent_cs(cs); 2448 bool invalidate = false; 2449 bool force = false; 2450 int old_prs = cs->partition_root_state; 2451 2452 if (!*buf) { 2453 cpumask_clear(trialcs->exclusive_cpus); 2454 cpumask_clear(trialcs->effective_xcpus); 2455 } else { 2456 retval = cpulist_parse(buf, trialcs->exclusive_cpus); 2457 if (retval < 0) 2458 return retval; 2459 } 2460 2461 /* Nothing to do if the CPUs didn't change */ 2462 if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) 2463 return 0; 2464 2465 if (*buf) { 2466 trialcs->partition_root_state = PRS_MEMBER; 2467 /* 2468 * Reject the change if there is exclusive CPUs conflict with 2469 * the siblings. 2470 */ 2471 if (compute_effective_exclusive_cpumask(trialcs, NULL, cs)) 2472 return -EINVAL; 2473 } 2474 2475 /* 2476 * Check all the descendants in update_cpumasks_hier() if 2477 * effective_xcpus is to be changed. 2478 */ 2479 force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); 2480 2481 retval = validate_change(cs, trialcs); 2482 if (retval) 2483 return retval; 2484 2485 if (alloc_cpumasks(NULL, &tmp)) 2486 return -ENOMEM; 2487 2488 if (old_prs) { 2489 if (cpumask_empty(trialcs->effective_xcpus)) { 2490 invalidate = true; 2491 cs->prs_err = PERR_INVCPUS; 2492 } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { 2493 invalidate = true; 2494 cs->prs_err = PERR_HKEEPING; 2495 } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { 2496 invalidate = true; 2497 cs->prs_err = PERR_NOCPUS; 2498 } 2499 2500 if (is_remote_partition(cs)) { 2501 if (invalidate) 2502 remote_partition_disable(cs, &tmp); 2503 else 2504 remote_cpus_update(cs, trialcs->exclusive_cpus, 2505 trialcs->effective_xcpus, &tmp); 2506 } else if (invalidate) { 2507 update_parent_effective_cpumask(cs, partcmd_invalidate, 2508 NULL, &tmp); 2509 } else { 2510 update_parent_effective_cpumask(cs, partcmd_update, 2511 trialcs->effective_xcpus, &tmp); 2512 } 2513 } 2514 spin_lock_irq(&callback_lock); 2515 cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); 2516 cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); 2517 if ((old_prs > 0) && !is_partition_valid(cs)) 2518 reset_partition_data(cs); 2519 spin_unlock_irq(&callback_lock); 2520 2521 /* 2522 * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus 2523 * of the subtree when it is a valid partition root or effective_xcpus 2524 * is updated. 2525 */ 2526 if (is_partition_valid(cs) || force) 2527 update_cpumasks_hier(cs, &tmp, force); 2528 2529 /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ 2530 if (cs->partition_root_state) 2531 update_partition_sd_lb(cs, old_prs); 2532 2533 free_cpumasks(NULL, &tmp); 2534 return 0; 2535 } 2536 2537 /* 2538 * Migrate memory region from one set of nodes to another. This is 2539 * performed asynchronously as it can be called from process migration path 2540 * holding locks involved in process management. All mm migrations are 2541 * performed in the queued order and can be waited for by flushing 2542 * cpuset_migrate_mm_wq. 2543 */ 2544 2545 struct cpuset_migrate_mm_work { 2546 struct work_struct work; 2547 struct mm_struct *mm; 2548 nodemask_t from; 2549 nodemask_t to; 2550 }; 2551 2552 static void cpuset_migrate_mm_workfn(struct work_struct *work) 2553 { 2554 struct cpuset_migrate_mm_work *mwork = 2555 container_of(work, struct cpuset_migrate_mm_work, work); 2556 2557 /* on a wq worker, no need to worry about %current's mems_allowed */ 2558 do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); 2559 mmput(mwork->mm); 2560 kfree(mwork); 2561 } 2562 2563 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, 2564 const nodemask_t *to) 2565 { 2566 struct cpuset_migrate_mm_work *mwork; 2567 2568 if (nodes_equal(*from, *to)) { 2569 mmput(mm); 2570 return; 2571 } 2572 2573 mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); 2574 if (mwork) { 2575 mwork->mm = mm; 2576 mwork->from = *from; 2577 mwork->to = *to; 2578 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); 2579 queue_work(cpuset_migrate_mm_wq, &mwork->work); 2580 } else { 2581 mmput(mm); 2582 } 2583 } 2584 2585 static void cpuset_post_attach(void) 2586 { 2587 flush_workqueue(cpuset_migrate_mm_wq); 2588 } 2589 2590 /* 2591 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy 2592 * @tsk: the task to change 2593 * @newmems: new nodes that the task will be set 2594 * 2595 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed 2596 * and rebind an eventual tasks' mempolicy. If the task is allocating in 2597 * parallel, it might temporarily see an empty intersection, which results in 2598 * a seqlock check and retry before OOM or allocation failure. 2599 */ 2600 static void cpuset_change_task_nodemask(struct task_struct *tsk, 2601 nodemask_t *newmems) 2602 { 2603 task_lock(tsk); 2604 2605 local_irq_disable(); 2606 write_seqcount_begin(&tsk->mems_allowed_seq); 2607 2608 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 2609 mpol_rebind_task(tsk, newmems); 2610 tsk->mems_allowed = *newmems; 2611 2612 write_seqcount_end(&tsk->mems_allowed_seq); 2613 local_irq_enable(); 2614 2615 task_unlock(tsk); 2616 } 2617 2618 static void *cpuset_being_rebound; 2619 2620 /** 2621 * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 2622 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 2623 * 2624 * Iterate through each task of @cs updating its mems_allowed to the 2625 * effective cpuset's. As this function is called with cpuset_mutex held, 2626 * cpuset membership stays stable. 2627 */ 2628 void cpuset_update_tasks_nodemask(struct cpuset *cs) 2629 { 2630 static nodemask_t newmems; /* protected by cpuset_mutex */ 2631 struct css_task_iter it; 2632 struct task_struct *task; 2633 2634 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 2635 2636 guarantee_online_mems(cs, &newmems); 2637 2638 /* 2639 * The mpol_rebind_mm() call takes mmap_lock, which we couldn't 2640 * take while holding tasklist_lock. Forks can happen - the 2641 * mpol_dup() cpuset_being_rebound check will catch such forks, 2642 * and rebind their vma mempolicies too. Because we still hold 2643 * the global cpuset_mutex, we know that no other rebind effort 2644 * will be contending for the global variable cpuset_being_rebound. 2645 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 2646 * is idempotent. Also migrate pages in each mm to new nodes. 2647 */ 2648 css_task_iter_start(&cs->css, 0, &it); 2649 while ((task = css_task_iter_next(&it))) { 2650 struct mm_struct *mm; 2651 bool migrate; 2652 2653 cpuset_change_task_nodemask(task, &newmems); 2654 2655 mm = get_task_mm(task); 2656 if (!mm) 2657 continue; 2658 2659 migrate = is_memory_migrate(cs); 2660 2661 mpol_rebind_mm(mm, &cs->mems_allowed); 2662 if (migrate) 2663 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); 2664 else 2665 mmput(mm); 2666 } 2667 css_task_iter_end(&it); 2668 2669 /* 2670 * All the tasks' nodemasks have been updated, update 2671 * cs->old_mems_allowed. 2672 */ 2673 cs->old_mems_allowed = newmems; 2674 2675 /* We're done rebinding vmas to this cpuset's new mems_allowed. */ 2676 cpuset_being_rebound = NULL; 2677 } 2678 2679 /* 2680 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree 2681 * @cs: the cpuset to consider 2682 * @new_mems: a temp variable for calculating new effective_mems 2683 * 2684 * When configured nodemask is changed, the effective nodemasks of this cpuset 2685 * and all its descendants need to be updated. 2686 * 2687 * On legacy hierarchy, effective_mems will be the same with mems_allowed. 2688 * 2689 * Called with cpuset_mutex held 2690 */ 2691 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) 2692 { 2693 struct cpuset *cp; 2694 struct cgroup_subsys_state *pos_css; 2695 2696 rcu_read_lock(); 2697 cpuset_for_each_descendant_pre(cp, pos_css, cs) { 2698 struct cpuset *parent = parent_cs(cp); 2699 2700 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); 2701 2702 /* 2703 * If it becomes empty, inherit the effective mask of the 2704 * parent, which is guaranteed to have some MEMs. 2705 */ 2706 if (is_in_v2_mode() && nodes_empty(*new_mems)) 2707 *new_mems = parent->effective_mems; 2708 2709 /* Skip the whole subtree if the nodemask remains the same. */ 2710 if (nodes_equal(*new_mems, cp->effective_mems)) { 2711 pos_css = css_rightmost_descendant(pos_css); 2712 continue; 2713 } 2714 2715 if (!css_tryget_online(&cp->css)) 2716 continue; 2717 rcu_read_unlock(); 2718 2719 spin_lock_irq(&callback_lock); 2720 cp->effective_mems = *new_mems; 2721 spin_unlock_irq(&callback_lock); 2722 2723 WARN_ON(!is_in_v2_mode() && 2724 !nodes_equal(cp->mems_allowed, cp->effective_mems)); 2725 2726 cpuset_update_tasks_nodemask(cp); 2727 2728 rcu_read_lock(); 2729 css_put(&cp->css); 2730 } 2731 rcu_read_unlock(); 2732 } 2733 2734 /* 2735 * Handle user request to change the 'mems' memory placement 2736 * of a cpuset. Needs to validate the request, update the 2737 * cpusets mems_allowed, and for each task in the cpuset, 2738 * update mems_allowed and rebind task's mempolicy and any vma 2739 * mempolicies and if the cpuset is marked 'memory_migrate', 2740 * migrate the tasks pages to the new memory. 2741 * 2742 * Call with cpuset_mutex held. May take callback_lock during call. 2743 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 2744 * lock each such tasks mm->mmap_lock, scan its vma's and rebind 2745 * their mempolicies to the cpusets new mems_allowed. 2746 */ 2747 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 2748 const char *buf) 2749 { 2750 int retval; 2751 2752 /* 2753 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 2754 * it's read-only 2755 */ 2756 if (cs == &top_cpuset) { 2757 retval = -EACCES; 2758 goto done; 2759 } 2760 2761 /* 2762 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 2763 * Since nodelist_parse() fails on an empty mask, we special case 2764 * that parsing. The validate_change() call ensures that cpusets 2765 * with tasks have memory. 2766 */ 2767 if (!*buf) { 2768 nodes_clear(trialcs->mems_allowed); 2769 } else { 2770 retval = nodelist_parse(buf, trialcs->mems_allowed); 2771 if (retval < 0) 2772 goto done; 2773 2774 if (!nodes_subset(trialcs->mems_allowed, 2775 top_cpuset.mems_allowed)) { 2776 retval = -EINVAL; 2777 goto done; 2778 } 2779 } 2780 2781 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { 2782 retval = 0; /* Too easy - nothing to do */ 2783 goto done; 2784 } 2785 retval = validate_change(cs, trialcs); 2786 if (retval < 0) 2787 goto done; 2788 2789 check_insane_mems_config(&trialcs->mems_allowed); 2790 2791 spin_lock_irq(&callback_lock); 2792 cs->mems_allowed = trialcs->mems_allowed; 2793 spin_unlock_irq(&callback_lock); 2794 2795 /* use trialcs->mems_allowed as a temp variable */ 2796 update_nodemasks_hier(cs, &trialcs->mems_allowed); 2797 done: 2798 return retval; 2799 } 2800 2801 bool current_cpuset_is_being_rebound(void) 2802 { 2803 bool ret; 2804 2805 rcu_read_lock(); 2806 ret = task_cs(current) == cpuset_being_rebound; 2807 rcu_read_unlock(); 2808 2809 return ret; 2810 } 2811 2812 /* 2813 * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag 2814 * bit: the bit to update (see cpuset_flagbits_t) 2815 * cs: the cpuset to update 2816 * turning_on: whether the flag is being set or cleared 2817 * 2818 * Call with cpuset_mutex held. 2819 */ 2820 2821 int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 2822 int turning_on) 2823 { 2824 struct cpuset *trialcs; 2825 int balance_flag_changed; 2826 int spread_flag_changed; 2827 int err; 2828 2829 trialcs = alloc_trial_cpuset(cs); 2830 if (!trialcs) 2831 return -ENOMEM; 2832 2833 if (turning_on) 2834 set_bit(bit, &trialcs->flags); 2835 else 2836 clear_bit(bit, &trialcs->flags); 2837 2838 err = validate_change(cs, trialcs); 2839 if (err < 0) 2840 goto out; 2841 2842 balance_flag_changed = (is_sched_load_balance(cs) != 2843 is_sched_load_balance(trialcs)); 2844 2845 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) 2846 || (is_spread_page(cs) != is_spread_page(trialcs))); 2847 2848 spin_lock_irq(&callback_lock); 2849 cs->flags = trialcs->flags; 2850 spin_unlock_irq(&callback_lock); 2851 2852 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) { 2853 if (cpuset_v2()) 2854 cpuset_force_rebuild(); 2855 else 2856 rebuild_sched_domains_locked(); 2857 } 2858 2859 if (spread_flag_changed) 2860 cpuset1_update_tasks_flags(cs); 2861 out: 2862 free_cpuset(trialcs); 2863 return err; 2864 } 2865 2866 /** 2867 * update_prstate - update partition_root_state 2868 * @cs: the cpuset to update 2869 * @new_prs: new partition root state 2870 * Return: 0 if successful, != 0 if error 2871 * 2872 * Call with cpuset_mutex held. 2873 */ 2874 static int update_prstate(struct cpuset *cs, int new_prs) 2875 { 2876 int err = PERR_NONE, old_prs = cs->partition_root_state; 2877 struct cpuset *parent = parent_cs(cs); 2878 struct tmpmasks tmpmask; 2879 bool isolcpus_updated = false; 2880 2881 if (old_prs == new_prs) 2882 return 0; 2883 2884 /* 2885 * Treat a previously invalid partition root as if it is a "member". 2886 */ 2887 if (new_prs && is_prs_invalid(old_prs)) 2888 old_prs = PRS_MEMBER; 2889 2890 if (alloc_cpumasks(NULL, &tmpmask)) 2891 return -ENOMEM; 2892 2893 err = update_partition_exclusive_flag(cs, new_prs); 2894 if (err) 2895 goto out; 2896 2897 if (!old_prs) { 2898 /* 2899 * cpus_allowed and exclusive_cpus cannot be both empty. 2900 */ 2901 if (xcpus_empty(cs)) { 2902 err = PERR_CPUSEMPTY; 2903 goto out; 2904 } 2905 2906 /* 2907 * We don't support the creation of a new local partition with 2908 * a remote partition underneath it. This unsupported 2909 * setting can happen only if parent is the top_cpuset because 2910 * a remote partition cannot be created underneath an existing 2911 * local or remote partition. 2912 */ 2913 if ((parent == &top_cpuset) && 2914 cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) { 2915 err = PERR_REMOTE; 2916 goto out; 2917 } 2918 2919 /* 2920 * If parent is valid partition, enable local partiion. 2921 * Otherwise, enable a remote partition. 2922 */ 2923 if (is_partition_valid(parent)) { 2924 enum partition_cmd cmd = (new_prs == PRS_ROOT) 2925 ? partcmd_enable : partcmd_enablei; 2926 2927 err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); 2928 } else { 2929 err = remote_partition_enable(cs, new_prs, &tmpmask); 2930 } 2931 } else if (old_prs && new_prs) { 2932 /* 2933 * A change in load balance state only, no change in cpumasks. 2934 * Need to update isolated_cpus. 2935 */ 2936 isolcpus_updated = true; 2937 } else { 2938 /* 2939 * Switching back to member is always allowed even if it 2940 * disables child partitions. 2941 */ 2942 if (is_remote_partition(cs)) 2943 remote_partition_disable(cs, &tmpmask); 2944 else 2945 update_parent_effective_cpumask(cs, partcmd_disable, 2946 NULL, &tmpmask); 2947 2948 /* 2949 * Invalidation of child partitions will be done in 2950 * update_cpumasks_hier(). 2951 */ 2952 } 2953 out: 2954 /* 2955 * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error 2956 * happens. 2957 */ 2958 if (err) { 2959 new_prs = -new_prs; 2960 update_partition_exclusive_flag(cs, new_prs); 2961 } 2962 2963 spin_lock_irq(&callback_lock); 2964 cs->partition_root_state = new_prs; 2965 WRITE_ONCE(cs->prs_err, err); 2966 if (!is_partition_valid(cs)) 2967 reset_partition_data(cs); 2968 else if (isolcpus_updated) 2969 isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); 2970 spin_unlock_irq(&callback_lock); 2971 update_unbound_workqueue_cpumask(isolcpus_updated); 2972 2973 /* Force update if switching back to member & update effective_xcpus */ 2974 update_cpumasks_hier(cs, &tmpmask, !new_prs); 2975 2976 /* A newly created partition must have effective_xcpus set */ 2977 WARN_ON_ONCE(!old_prs && (new_prs > 0) 2978 && cpumask_empty(cs->effective_xcpus)); 2979 2980 /* Update sched domains and load balance flag */ 2981 update_partition_sd_lb(cs, old_prs); 2982 2983 notify_partition_change(cs, old_prs); 2984 if (force_sd_rebuild) 2985 rebuild_sched_domains_locked(); 2986 free_cpumasks(NULL, &tmpmask); 2987 return 0; 2988 } 2989 2990 static struct cpuset *cpuset_attach_old_cs; 2991 2992 /* 2993 * Check to see if a cpuset can accept a new task 2994 * For v1, cpus_allowed and mems_allowed can't be empty. 2995 * For v2, effective_cpus can't be empty. 2996 * Note that in v1, effective_cpus = cpus_allowed. 2997 */ 2998 static int cpuset_can_attach_check(struct cpuset *cs) 2999 { 3000 if (cpumask_empty(cs->effective_cpus) || 3001 (!is_in_v2_mode() && nodes_empty(cs->mems_allowed))) 3002 return -ENOSPC; 3003 return 0; 3004 } 3005 3006 static void reset_migrate_dl_data(struct cpuset *cs) 3007 { 3008 cs->nr_migrate_dl_tasks = 0; 3009 cs->sum_migrate_dl_bw = 0; 3010 } 3011 3012 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 3013 static int cpuset_can_attach(struct cgroup_taskset *tset) 3014 { 3015 struct cgroup_subsys_state *css; 3016 struct cpuset *cs, *oldcs; 3017 struct task_struct *task; 3018 bool cpus_updated, mems_updated; 3019 int ret; 3020 3021 /* used later by cpuset_attach() */ 3022 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); 3023 oldcs = cpuset_attach_old_cs; 3024 cs = css_cs(css); 3025 3026 mutex_lock(&cpuset_mutex); 3027 3028 /* Check to see if task is allowed in the cpuset */ 3029 ret = cpuset_can_attach_check(cs); 3030 if (ret) 3031 goto out_unlock; 3032 3033 cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); 3034 mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); 3035 3036 cgroup_taskset_for_each(task, css, tset) { 3037 ret = task_can_attach(task); 3038 if (ret) 3039 goto out_unlock; 3040 3041 /* 3042 * Skip rights over task check in v2 when nothing changes, 3043 * migration permission derives from hierarchy ownership in 3044 * cgroup_procs_write_permission()). 3045 */ 3046 if (!cpuset_v2() || (cpus_updated || mems_updated)) { 3047 ret = security_task_setscheduler(task); 3048 if (ret) 3049 goto out_unlock; 3050 } 3051 3052 if (dl_task(task)) { 3053 cs->nr_migrate_dl_tasks++; 3054 cs->sum_migrate_dl_bw += task->dl.dl_bw; 3055 } 3056 } 3057 3058 if (!cs->nr_migrate_dl_tasks) 3059 goto out_success; 3060 3061 if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { 3062 int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); 3063 3064 if (unlikely(cpu >= nr_cpu_ids)) { 3065 reset_migrate_dl_data(cs); 3066 ret = -EINVAL; 3067 goto out_unlock; 3068 } 3069 3070 ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); 3071 if (ret) { 3072 reset_migrate_dl_data(cs); 3073 goto out_unlock; 3074 } 3075 } 3076 3077 out_success: 3078 /* 3079 * Mark attach is in progress. This makes validate_change() fail 3080 * changes which zero cpus/mems_allowed. 3081 */ 3082 cs->attach_in_progress++; 3083 out_unlock: 3084 mutex_unlock(&cpuset_mutex); 3085 return ret; 3086 } 3087 3088 static void cpuset_cancel_attach(struct cgroup_taskset *tset) 3089 { 3090 struct cgroup_subsys_state *css; 3091 struct cpuset *cs; 3092 3093 cgroup_taskset_first(tset, &css); 3094 cs = css_cs(css); 3095 3096 mutex_lock(&cpuset_mutex); 3097 dec_attach_in_progress_locked(cs); 3098 3099 if (cs->nr_migrate_dl_tasks) { 3100 int cpu = cpumask_any(cs->effective_cpus); 3101 3102 dl_bw_free(cpu, cs->sum_migrate_dl_bw); 3103 reset_migrate_dl_data(cs); 3104 } 3105 3106 mutex_unlock(&cpuset_mutex); 3107 } 3108 3109 /* 3110 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task() 3111 * but we can't allocate it dynamically there. Define it global and 3112 * allocate from cpuset_init(). 3113 */ 3114 static cpumask_var_t cpus_attach; 3115 static nodemask_t cpuset_attach_nodemask_to; 3116 3117 static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) 3118 { 3119 lockdep_assert_held(&cpuset_mutex); 3120 3121 if (cs != &top_cpuset) 3122 guarantee_active_cpus(task, cpus_attach); 3123 else 3124 cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), 3125 subpartitions_cpus); 3126 /* 3127 * can_attach beforehand should guarantee that this doesn't 3128 * fail. TODO: have a better way to handle failure here 3129 */ 3130 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); 3131 3132 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); 3133 cpuset1_update_task_spread_flags(cs, task); 3134 } 3135 3136 static void cpuset_attach(struct cgroup_taskset *tset) 3137 { 3138 struct task_struct *task; 3139 struct task_struct *leader; 3140 struct cgroup_subsys_state *css; 3141 struct cpuset *cs; 3142 struct cpuset *oldcs = cpuset_attach_old_cs; 3143 bool cpus_updated, mems_updated; 3144 3145 cgroup_taskset_first(tset, &css); 3146 cs = css_cs(css); 3147 3148 lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ 3149 mutex_lock(&cpuset_mutex); 3150 cpus_updated = !cpumask_equal(cs->effective_cpus, 3151 oldcs->effective_cpus); 3152 mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); 3153 3154 /* 3155 * In the default hierarchy, enabling cpuset in the child cgroups 3156 * will trigger a number of cpuset_attach() calls with no change 3157 * in effective cpus and mems. In that case, we can optimize out 3158 * by skipping the task iteration and update. 3159 */ 3160 if (cpuset_v2() && !cpus_updated && !mems_updated) { 3161 cpuset_attach_nodemask_to = cs->effective_mems; 3162 goto out; 3163 } 3164 3165 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 3166 3167 cgroup_taskset_for_each(task, css, tset) 3168 cpuset_attach_task(cs, task); 3169 3170 /* 3171 * Change mm for all threadgroup leaders. This is expensive and may 3172 * sleep and should be moved outside migration path proper. Skip it 3173 * if there is no change in effective_mems and CS_MEMORY_MIGRATE is 3174 * not set. 3175 */ 3176 cpuset_attach_nodemask_to = cs->effective_mems; 3177 if (!is_memory_migrate(cs) && !mems_updated) 3178 goto out; 3179 3180 cgroup_taskset_for_each_leader(leader, css, tset) { 3181 struct mm_struct *mm = get_task_mm(leader); 3182 3183 if (mm) { 3184 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 3185 3186 /* 3187 * old_mems_allowed is the same with mems_allowed 3188 * here, except if this task is being moved 3189 * automatically due to hotplug. In that case 3190 * @mems_allowed has been updated and is empty, so 3191 * @old_mems_allowed is the right nodesets that we 3192 * migrate mm from. 3193 */ 3194 if (is_memory_migrate(cs)) 3195 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, 3196 &cpuset_attach_nodemask_to); 3197 else 3198 mmput(mm); 3199 } 3200 } 3201 3202 out: 3203 cs->old_mems_allowed = cpuset_attach_nodemask_to; 3204 3205 if (cs->nr_migrate_dl_tasks) { 3206 cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; 3207 oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; 3208 reset_migrate_dl_data(cs); 3209 } 3210 3211 dec_attach_in_progress_locked(cs); 3212 3213 mutex_unlock(&cpuset_mutex); 3214 } 3215 3216 /* 3217 * Common handling for a write to a "cpus" or "mems" file. 3218 */ 3219 ssize_t cpuset_write_resmask(struct kernfs_open_file *of, 3220 char *buf, size_t nbytes, loff_t off) 3221 { 3222 struct cpuset *cs = css_cs(of_css(of)); 3223 struct cpuset *trialcs; 3224 int retval = -ENODEV; 3225 3226 buf = strstrip(buf); 3227 cpus_read_lock(); 3228 mutex_lock(&cpuset_mutex); 3229 if (!is_cpuset_online(cs)) 3230 goto out_unlock; 3231 3232 trialcs = alloc_trial_cpuset(cs); 3233 if (!trialcs) { 3234 retval = -ENOMEM; 3235 goto out_unlock; 3236 } 3237 3238 switch (of_cft(of)->private) { 3239 case FILE_CPULIST: 3240 retval = update_cpumask(cs, trialcs, buf); 3241 break; 3242 case FILE_EXCLUSIVE_CPULIST: 3243 retval = update_exclusive_cpumask(cs, trialcs, buf); 3244 break; 3245 case FILE_MEMLIST: 3246 retval = update_nodemask(cs, trialcs, buf); 3247 break; 3248 default: 3249 retval = -EINVAL; 3250 break; 3251 } 3252 3253 free_cpuset(trialcs); 3254 if (force_sd_rebuild) 3255 rebuild_sched_domains_locked(); 3256 out_unlock: 3257 mutex_unlock(&cpuset_mutex); 3258 cpus_read_unlock(); 3259 flush_workqueue(cpuset_migrate_mm_wq); 3260 return retval ?: nbytes; 3261 } 3262 3263 /* 3264 * These ascii lists should be read in a single call, by using a user 3265 * buffer large enough to hold the entire map. If read in smaller 3266 * chunks, there is no guarantee of atomicity. Since the display format 3267 * used, list of ranges of sequential numbers, is variable length, 3268 * and since these maps can change value dynamically, one could read 3269 * gibberish by doing partial reads while a list was changing. 3270 */ 3271 int cpuset_common_seq_show(struct seq_file *sf, void *v) 3272 { 3273 struct cpuset *cs = css_cs(seq_css(sf)); 3274 cpuset_filetype_t type = seq_cft(sf)->private; 3275 int ret = 0; 3276 3277 spin_lock_irq(&callback_lock); 3278 3279 switch (type) { 3280 case FILE_CPULIST: 3281 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); 3282 break; 3283 case FILE_MEMLIST: 3284 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); 3285 break; 3286 case FILE_EFFECTIVE_CPULIST: 3287 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus)); 3288 break; 3289 case FILE_EFFECTIVE_MEMLIST: 3290 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); 3291 break; 3292 case FILE_EXCLUSIVE_CPULIST: 3293 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus)); 3294 break; 3295 case FILE_EFFECTIVE_XCPULIST: 3296 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus)); 3297 break; 3298 case FILE_SUBPARTS_CPULIST: 3299 seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus)); 3300 break; 3301 case FILE_ISOLATED_CPULIST: 3302 seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus)); 3303 break; 3304 default: 3305 ret = -EINVAL; 3306 } 3307 3308 spin_unlock_irq(&callback_lock); 3309 return ret; 3310 } 3311 3312 static int cpuset_partition_show(struct seq_file *seq, void *v) 3313 { 3314 struct cpuset *cs = css_cs(seq_css(seq)); 3315 const char *err, *type = NULL; 3316 3317 switch (cs->partition_root_state) { 3318 case PRS_ROOT: 3319 seq_puts(seq, "root\n"); 3320 break; 3321 case PRS_ISOLATED: 3322 seq_puts(seq, "isolated\n"); 3323 break; 3324 case PRS_MEMBER: 3325 seq_puts(seq, "member\n"); 3326 break; 3327 case PRS_INVALID_ROOT: 3328 type = "root"; 3329 fallthrough; 3330 case PRS_INVALID_ISOLATED: 3331 if (!type) 3332 type = "isolated"; 3333 err = perr_strings[READ_ONCE(cs->prs_err)]; 3334 if (err) 3335 seq_printf(seq, "%s invalid (%s)\n", type, err); 3336 else 3337 seq_printf(seq, "%s invalid\n", type); 3338 break; 3339 } 3340 return 0; 3341 } 3342 3343 static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, 3344 size_t nbytes, loff_t off) 3345 { 3346 struct cpuset *cs = css_cs(of_css(of)); 3347 int val; 3348 int retval = -ENODEV; 3349 3350 buf = strstrip(buf); 3351 3352 if (!strcmp(buf, "root")) 3353 val = PRS_ROOT; 3354 else if (!strcmp(buf, "member")) 3355 val = PRS_MEMBER; 3356 else if (!strcmp(buf, "isolated")) 3357 val = PRS_ISOLATED; 3358 else 3359 return -EINVAL; 3360 3361 css_get(&cs->css); 3362 cpus_read_lock(); 3363 mutex_lock(&cpuset_mutex); 3364 if (is_cpuset_online(cs)) 3365 retval = update_prstate(cs, val); 3366 mutex_unlock(&cpuset_mutex); 3367 cpus_read_unlock(); 3368 css_put(&cs->css); 3369 return retval ?: nbytes; 3370 } 3371 3372 /* 3373 * This is currently a minimal set for the default hierarchy. It can be 3374 * expanded later on by migrating more features and control files from v1. 3375 */ 3376 static struct cftype dfl_files[] = { 3377 { 3378 .name = "cpus", 3379 .seq_show = cpuset_common_seq_show, 3380 .write = cpuset_write_resmask, 3381 .max_write_len = (100U + 6 * NR_CPUS), 3382 .private = FILE_CPULIST, 3383 .flags = CFTYPE_NOT_ON_ROOT, 3384 }, 3385 3386 { 3387 .name = "mems", 3388 .seq_show = cpuset_common_seq_show, 3389 .write = cpuset_write_resmask, 3390 .max_write_len = (100U + 6 * MAX_NUMNODES), 3391 .private = FILE_MEMLIST, 3392 .flags = CFTYPE_NOT_ON_ROOT, 3393 }, 3394 3395 { 3396 .name = "cpus.effective", 3397 .seq_show = cpuset_common_seq_show, 3398 .private = FILE_EFFECTIVE_CPULIST, 3399 }, 3400 3401 { 3402 .name = "mems.effective", 3403 .seq_show = cpuset_common_seq_show, 3404 .private = FILE_EFFECTIVE_MEMLIST, 3405 }, 3406 3407 { 3408 .name = "cpus.partition", 3409 .seq_show = cpuset_partition_show, 3410 .write = cpuset_partition_write, 3411 .private = FILE_PARTITION_ROOT, 3412 .flags = CFTYPE_NOT_ON_ROOT, 3413 .file_offset = offsetof(struct cpuset, partition_file), 3414 }, 3415 3416 { 3417 .name = "cpus.exclusive", 3418 .seq_show = cpuset_common_seq_show, 3419 .write = cpuset_write_resmask, 3420 .max_write_len = (100U + 6 * NR_CPUS), 3421 .private = FILE_EXCLUSIVE_CPULIST, 3422 .flags = CFTYPE_NOT_ON_ROOT, 3423 }, 3424 3425 { 3426 .name = "cpus.exclusive.effective", 3427 .seq_show = cpuset_common_seq_show, 3428 .private = FILE_EFFECTIVE_XCPULIST, 3429 .flags = CFTYPE_NOT_ON_ROOT, 3430 }, 3431 3432 { 3433 .name = "cpus.subpartitions", 3434 .seq_show = cpuset_common_seq_show, 3435 .private = FILE_SUBPARTS_CPULIST, 3436 .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, 3437 }, 3438 3439 { 3440 .name = "cpus.isolated", 3441 .seq_show = cpuset_common_seq_show, 3442 .private = FILE_ISOLATED_CPULIST, 3443 .flags = CFTYPE_ONLY_ON_ROOT, 3444 }, 3445 3446 { } /* terminate */ 3447 }; 3448 3449 3450 /** 3451 * cpuset_css_alloc - Allocate a cpuset css 3452 * @parent_css: Parent css of the control group that the new cpuset will be 3453 * part of 3454 * Return: cpuset css on success, -ENOMEM on failure. 3455 * 3456 * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return 3457 * top cpuset css otherwise. 3458 */ 3459 static struct cgroup_subsys_state * 3460 cpuset_css_alloc(struct cgroup_subsys_state *parent_css) 3461 { 3462 struct cpuset *cs; 3463 3464 if (!parent_css) 3465 return &top_cpuset.css; 3466 3467 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 3468 if (!cs) 3469 return ERR_PTR(-ENOMEM); 3470 3471 if (alloc_cpumasks(cs, NULL)) { 3472 kfree(cs); 3473 return ERR_PTR(-ENOMEM); 3474 } 3475 3476 __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 3477 fmeter_init(&cs->fmeter); 3478 cs->relax_domain_level = -1; 3479 INIT_LIST_HEAD(&cs->remote_sibling); 3480 3481 /* Set CS_MEMORY_MIGRATE for default hierarchy */ 3482 if (cpuset_v2()) 3483 __set_bit(CS_MEMORY_MIGRATE, &cs->flags); 3484 3485 return &cs->css; 3486 } 3487 3488 static int cpuset_css_online(struct cgroup_subsys_state *css) 3489 { 3490 struct cpuset *cs = css_cs(css); 3491 struct cpuset *parent = parent_cs(cs); 3492 struct cpuset *tmp_cs; 3493 struct cgroup_subsys_state *pos_css; 3494 3495 if (!parent) 3496 return 0; 3497 3498 cpus_read_lock(); 3499 mutex_lock(&cpuset_mutex); 3500 3501 set_bit(CS_ONLINE, &cs->flags); 3502 if (is_spread_page(parent)) 3503 set_bit(CS_SPREAD_PAGE, &cs->flags); 3504 if (is_spread_slab(parent)) 3505 set_bit(CS_SPREAD_SLAB, &cs->flags); 3506 /* 3507 * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated 3508 */ 3509 if (cpuset_v2() && !is_sched_load_balance(parent)) 3510 clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 3511 3512 cpuset_inc(); 3513 3514 spin_lock_irq(&callback_lock); 3515 if (is_in_v2_mode()) { 3516 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 3517 cs->effective_mems = parent->effective_mems; 3518 } 3519 spin_unlock_irq(&callback_lock); 3520 3521 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 3522 goto out_unlock; 3523 3524 /* 3525 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is 3526 * set. This flag handling is implemented in cgroup core for 3527 * historical reasons - the flag may be specified during mount. 3528 * 3529 * Currently, if any sibling cpusets have exclusive cpus or mem, we 3530 * refuse to clone the configuration - thereby refusing the task to 3531 * be entered, and as a result refusing the sys_unshare() or 3532 * clone() which initiated it. If this becomes a problem for some 3533 * users who wish to allow that scenario, then this could be 3534 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive 3535 * (and likewise for mems) to the new cgroup. 3536 */ 3537 rcu_read_lock(); 3538 cpuset_for_each_child(tmp_cs, pos_css, parent) { 3539 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { 3540 rcu_read_unlock(); 3541 goto out_unlock; 3542 } 3543 } 3544 rcu_read_unlock(); 3545 3546 spin_lock_irq(&callback_lock); 3547 cs->mems_allowed = parent->mems_allowed; 3548 cs->effective_mems = parent->mems_allowed; 3549 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); 3550 cpumask_copy(cs->effective_cpus, parent->cpus_allowed); 3551 spin_unlock_irq(&callback_lock); 3552 out_unlock: 3553 mutex_unlock(&cpuset_mutex); 3554 cpus_read_unlock(); 3555 return 0; 3556 } 3557 3558 /* 3559 * If the cpuset being removed has its flag 'sched_load_balance' 3560 * enabled, then simulate turning sched_load_balance off, which 3561 * will call rebuild_sched_domains_locked(). That is not needed 3562 * in the default hierarchy where only changes in partition 3563 * will cause repartitioning. 3564 */ 3565 static void cpuset_css_offline(struct cgroup_subsys_state *css) 3566 { 3567 struct cpuset *cs = css_cs(css); 3568 3569 cpus_read_lock(); 3570 mutex_lock(&cpuset_mutex); 3571 3572 if (!cpuset_v2() && is_sched_load_balance(cs)) 3573 cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 3574 3575 cpuset_dec(); 3576 clear_bit(CS_ONLINE, &cs->flags); 3577 3578 mutex_unlock(&cpuset_mutex); 3579 cpus_read_unlock(); 3580 } 3581 3582 /* 3583 * If a dying cpuset has the 'cpus.partition' enabled, turn it off by 3584 * changing it back to member to free its exclusive CPUs back to the pool to 3585 * be used by other online cpusets. 3586 */ 3587 static void cpuset_css_killed(struct cgroup_subsys_state *css) 3588 { 3589 struct cpuset *cs = css_cs(css); 3590 3591 cpus_read_lock(); 3592 mutex_lock(&cpuset_mutex); 3593 3594 /* Reset valid partition back to member */ 3595 if (is_partition_valid(cs)) 3596 update_prstate(cs, PRS_MEMBER); 3597 3598 mutex_unlock(&cpuset_mutex); 3599 cpus_read_unlock(); 3600 3601 } 3602 3603 static void cpuset_css_free(struct cgroup_subsys_state *css) 3604 { 3605 struct cpuset *cs = css_cs(css); 3606 3607 free_cpuset(cs); 3608 } 3609 3610 static void cpuset_bind(struct cgroup_subsys_state *root_css) 3611 { 3612 mutex_lock(&cpuset_mutex); 3613 spin_lock_irq(&callback_lock); 3614 3615 if (is_in_v2_mode()) { 3616 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); 3617 cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask); 3618 top_cpuset.mems_allowed = node_possible_map; 3619 } else { 3620 cpumask_copy(top_cpuset.cpus_allowed, 3621 top_cpuset.effective_cpus); 3622 top_cpuset.mems_allowed = top_cpuset.effective_mems; 3623 } 3624 3625 spin_unlock_irq(&callback_lock); 3626 mutex_unlock(&cpuset_mutex); 3627 } 3628 3629 /* 3630 * In case the child is cloned into a cpuset different from its parent, 3631 * additional checks are done to see if the move is allowed. 3632 */ 3633 static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) 3634 { 3635 struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); 3636 bool same_cs; 3637 int ret; 3638 3639 rcu_read_lock(); 3640 same_cs = (cs == task_cs(current)); 3641 rcu_read_unlock(); 3642 3643 if (same_cs) 3644 return 0; 3645 3646 lockdep_assert_held(&cgroup_mutex); 3647 mutex_lock(&cpuset_mutex); 3648 3649 /* Check to see if task is allowed in the cpuset */ 3650 ret = cpuset_can_attach_check(cs); 3651 if (ret) 3652 goto out_unlock; 3653 3654 ret = task_can_attach(task); 3655 if (ret) 3656 goto out_unlock; 3657 3658 ret = security_task_setscheduler(task); 3659 if (ret) 3660 goto out_unlock; 3661 3662 /* 3663 * Mark attach is in progress. This makes validate_change() fail 3664 * changes which zero cpus/mems_allowed. 3665 */ 3666 cs->attach_in_progress++; 3667 out_unlock: 3668 mutex_unlock(&cpuset_mutex); 3669 return ret; 3670 } 3671 3672 static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) 3673 { 3674 struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); 3675 bool same_cs; 3676 3677 rcu_read_lock(); 3678 same_cs = (cs == task_cs(current)); 3679 rcu_read_unlock(); 3680 3681 if (same_cs) 3682 return; 3683 3684 dec_attach_in_progress(cs); 3685 } 3686 3687 /* 3688 * Make sure the new task conform to the current state of its parent, 3689 * which could have been changed by cpuset just after it inherits the 3690 * state from the parent and before it sits on the cgroup's task list. 3691 */ 3692 static void cpuset_fork(struct task_struct *task) 3693 { 3694 struct cpuset *cs; 3695 bool same_cs; 3696 3697 rcu_read_lock(); 3698 cs = task_cs(task); 3699 same_cs = (cs == task_cs(current)); 3700 rcu_read_unlock(); 3701 3702 if (same_cs) { 3703 if (cs == &top_cpuset) 3704 return; 3705 3706 set_cpus_allowed_ptr(task, current->cpus_ptr); 3707 task->mems_allowed = current->mems_allowed; 3708 return; 3709 } 3710 3711 /* CLONE_INTO_CGROUP */ 3712 mutex_lock(&cpuset_mutex); 3713 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 3714 cpuset_attach_task(cs, task); 3715 3716 dec_attach_in_progress_locked(cs); 3717 mutex_unlock(&cpuset_mutex); 3718 } 3719 3720 struct cgroup_subsys cpuset_cgrp_subsys = { 3721 .css_alloc = cpuset_css_alloc, 3722 .css_online = cpuset_css_online, 3723 .css_offline = cpuset_css_offline, 3724 .css_killed = cpuset_css_killed, 3725 .css_free = cpuset_css_free, 3726 .can_attach = cpuset_can_attach, 3727 .cancel_attach = cpuset_cancel_attach, 3728 .attach = cpuset_attach, 3729 .post_attach = cpuset_post_attach, 3730 .bind = cpuset_bind, 3731 .can_fork = cpuset_can_fork, 3732 .cancel_fork = cpuset_cancel_fork, 3733 .fork = cpuset_fork, 3734 #ifdef CONFIG_CPUSETS_V1 3735 .legacy_cftypes = cpuset1_files, 3736 #endif 3737 .dfl_cftypes = dfl_files, 3738 .early_init = true, 3739 .threaded = true, 3740 }; 3741 3742 /** 3743 * cpuset_init - initialize cpusets at system boot 3744 * 3745 * Description: Initialize top_cpuset 3746 **/ 3747 3748 int __init cpuset_init(void) 3749 { 3750 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); 3751 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); 3752 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); 3753 BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); 3754 BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); 3755 BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); 3756 3757 cpumask_setall(top_cpuset.cpus_allowed); 3758 nodes_setall(top_cpuset.mems_allowed); 3759 cpumask_setall(top_cpuset.effective_cpus); 3760 cpumask_setall(top_cpuset.effective_xcpus); 3761 cpumask_setall(top_cpuset.exclusive_cpus); 3762 nodes_setall(top_cpuset.effective_mems); 3763 3764 fmeter_init(&top_cpuset.fmeter); 3765 INIT_LIST_HEAD(&remote_children); 3766 3767 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); 3768 3769 have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN); 3770 if (have_boot_isolcpus) { 3771 BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL)); 3772 cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); 3773 cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus); 3774 } 3775 3776 return 0; 3777 } 3778 3779 static void 3780 hotplug_update_tasks(struct cpuset *cs, 3781 struct cpumask *new_cpus, nodemask_t *new_mems, 3782 bool cpus_updated, bool mems_updated) 3783 { 3784 /* A partition root is allowed to have empty effective cpus */ 3785 if (cpumask_empty(new_cpus) && !is_partition_valid(cs)) 3786 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); 3787 if (nodes_empty(*new_mems)) 3788 *new_mems = parent_cs(cs)->effective_mems; 3789 3790 spin_lock_irq(&callback_lock); 3791 cpumask_copy(cs->effective_cpus, new_cpus); 3792 cs->effective_mems = *new_mems; 3793 spin_unlock_irq(&callback_lock); 3794 3795 if (cpus_updated) 3796 cpuset_update_tasks_cpumask(cs, new_cpus); 3797 if (mems_updated) 3798 cpuset_update_tasks_nodemask(cs); 3799 } 3800 3801 void cpuset_force_rebuild(void) 3802 { 3803 force_sd_rebuild = true; 3804 } 3805 3806 /** 3807 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug 3808 * @cs: cpuset in interest 3809 * @tmp: the tmpmasks structure pointer 3810 * 3811 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone 3812 * offline, update @cs accordingly. If @cs ends up with no CPU or memory, 3813 * all its tasks are moved to the nearest ancestor with both resources. 3814 */ 3815 static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) 3816 { 3817 static cpumask_t new_cpus; 3818 static nodemask_t new_mems; 3819 bool cpus_updated; 3820 bool mems_updated; 3821 bool remote; 3822 int partcmd = -1; 3823 struct cpuset *parent; 3824 retry: 3825 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); 3826 3827 mutex_lock(&cpuset_mutex); 3828 3829 /* 3830 * We have raced with task attaching. We wait until attaching 3831 * is finished, so we won't attach a task to an empty cpuset. 3832 */ 3833 if (cs->attach_in_progress) { 3834 mutex_unlock(&cpuset_mutex); 3835 goto retry; 3836 } 3837 3838 parent = parent_cs(cs); 3839 compute_effective_cpumask(&new_cpus, cs, parent); 3840 nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); 3841 3842 if (!tmp || !cs->partition_root_state) 3843 goto update_tasks; 3844 3845 /* 3846 * Compute effective_cpus for valid partition root, may invalidate 3847 * child partition roots if necessary. 3848 */ 3849 remote = is_remote_partition(cs); 3850 if (remote || (is_partition_valid(cs) && is_partition_valid(parent))) 3851 compute_partition_effective_cpumask(cs, &new_cpus); 3852 3853 if (remote && cpumask_empty(&new_cpus) && 3854 partition_is_populated(cs, NULL)) { 3855 cs->prs_err = PERR_HOTPLUG; 3856 remote_partition_disable(cs, tmp); 3857 compute_effective_cpumask(&new_cpus, cs, parent); 3858 remote = false; 3859 } 3860 3861 /* 3862 * Force the partition to become invalid if either one of 3863 * the following conditions hold: 3864 * 1) empty effective cpus but not valid empty partition. 3865 * 2) parent is invalid or doesn't grant any cpus to child 3866 * partitions. 3867 */ 3868 if (is_local_partition(cs) && (!is_partition_valid(parent) || 3869 tasks_nocpu_error(parent, cs, &new_cpus))) 3870 partcmd = partcmd_invalidate; 3871 /* 3872 * On the other hand, an invalid partition root may be transitioned 3873 * back to a regular one. 3874 */ 3875 else if (is_partition_valid(parent) && is_partition_invalid(cs)) 3876 partcmd = partcmd_update; 3877 3878 if (partcmd >= 0) { 3879 update_parent_effective_cpumask(cs, partcmd, NULL, tmp); 3880 if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { 3881 compute_partition_effective_cpumask(cs, &new_cpus); 3882 cpuset_force_rebuild(); 3883 } 3884 } 3885 3886 update_tasks: 3887 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); 3888 mems_updated = !nodes_equal(new_mems, cs->effective_mems); 3889 if (!cpus_updated && !mems_updated) 3890 goto unlock; /* Hotplug doesn't affect this cpuset */ 3891 3892 if (mems_updated) 3893 check_insane_mems_config(&new_mems); 3894 3895 if (is_in_v2_mode()) 3896 hotplug_update_tasks(cs, &new_cpus, &new_mems, 3897 cpus_updated, mems_updated); 3898 else 3899 cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems, 3900 cpus_updated, mems_updated); 3901 3902 unlock: 3903 mutex_unlock(&cpuset_mutex); 3904 } 3905 3906 /** 3907 * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset 3908 * 3909 * This function is called after either CPU or memory configuration has 3910 * changed and updates cpuset accordingly. The top_cpuset is always 3911 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in 3912 * order to make cpusets transparent (of no affect) on systems that are 3913 * actively using CPU hotplug but making no active use of cpusets. 3914 * 3915 * Non-root cpusets are only affected by offlining. If any CPUs or memory 3916 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on 3917 * all descendants. 3918 * 3919 * Note that CPU offlining during suspend is ignored. We don't modify 3920 * cpusets across suspend/resume cycles at all. 3921 * 3922 * CPU / memory hotplug is handled synchronously. 3923 */ 3924 static void cpuset_handle_hotplug(void) 3925 { 3926 static cpumask_t new_cpus; 3927 static nodemask_t new_mems; 3928 bool cpus_updated, mems_updated; 3929 bool on_dfl = is_in_v2_mode(); 3930 struct tmpmasks tmp, *ptmp = NULL; 3931 3932 if (on_dfl && !alloc_cpumasks(NULL, &tmp)) 3933 ptmp = &tmp; 3934 3935 lockdep_assert_cpus_held(); 3936 mutex_lock(&cpuset_mutex); 3937 3938 /* fetch the available cpus/mems and find out which changed how */ 3939 cpumask_copy(&new_cpus, cpu_active_mask); 3940 new_mems = node_states[N_MEMORY]; 3941 3942 /* 3943 * If subpartitions_cpus is populated, it is likely that the check 3944 * below will produce a false positive on cpus_updated when the cpu 3945 * list isn't changed. It is extra work, but it is better to be safe. 3946 */ 3947 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) || 3948 !cpumask_empty(subpartitions_cpus); 3949 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); 3950 3951 /* For v1, synchronize cpus_allowed to cpu_active_mask */ 3952 if (cpus_updated) { 3953 cpuset_force_rebuild(); 3954 spin_lock_irq(&callback_lock); 3955 if (!on_dfl) 3956 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); 3957 /* 3958 * Make sure that CPUs allocated to child partitions 3959 * do not show up in effective_cpus. If no CPU is left, 3960 * we clear the subpartitions_cpus & let the child partitions 3961 * fight for the CPUs again. 3962 */ 3963 if (!cpumask_empty(subpartitions_cpus)) { 3964 if (cpumask_subset(&new_cpus, subpartitions_cpus)) { 3965 top_cpuset.nr_subparts = 0; 3966 cpumask_clear(subpartitions_cpus); 3967 } else { 3968 cpumask_andnot(&new_cpus, &new_cpus, 3969 subpartitions_cpus); 3970 } 3971 } 3972 cpumask_copy(top_cpuset.effective_cpus, &new_cpus); 3973 spin_unlock_irq(&callback_lock); 3974 /* we don't mess with cpumasks of tasks in top_cpuset */ 3975 } 3976 3977 /* synchronize mems_allowed to N_MEMORY */ 3978 if (mems_updated) { 3979 spin_lock_irq(&callback_lock); 3980 if (!on_dfl) 3981 top_cpuset.mems_allowed = new_mems; 3982 top_cpuset.effective_mems = new_mems; 3983 spin_unlock_irq(&callback_lock); 3984 cpuset_update_tasks_nodemask(&top_cpuset); 3985 } 3986 3987 mutex_unlock(&cpuset_mutex); 3988 3989 /* if cpus or mems changed, we need to propagate to descendants */ 3990 if (cpus_updated || mems_updated) { 3991 struct cpuset *cs; 3992 struct cgroup_subsys_state *pos_css; 3993 3994 rcu_read_lock(); 3995 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { 3996 if (cs == &top_cpuset || !css_tryget_online(&cs->css)) 3997 continue; 3998 rcu_read_unlock(); 3999 4000 cpuset_hotplug_update_tasks(cs, ptmp); 4001 4002 rcu_read_lock(); 4003 css_put(&cs->css); 4004 } 4005 rcu_read_unlock(); 4006 } 4007 4008 /* rebuild sched domains if necessary */ 4009 if (force_sd_rebuild) 4010 rebuild_sched_domains_cpuslocked(); 4011 4012 free_cpumasks(NULL, ptmp); 4013 } 4014 4015 void cpuset_update_active_cpus(void) 4016 { 4017 /* 4018 * We're inside cpu hotplug critical region which usually nests 4019 * inside cgroup synchronization. Bounce actual hotplug processing 4020 * to a work item to avoid reverse locking order. 4021 */ 4022 cpuset_handle_hotplug(); 4023 } 4024 4025 /* 4026 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. 4027 * Call this routine anytime after node_states[N_MEMORY] changes. 4028 * See cpuset_update_active_cpus() for CPU hotplug handling. 4029 */ 4030 static int cpuset_track_online_nodes(struct notifier_block *self, 4031 unsigned long action, void *arg) 4032 { 4033 cpuset_handle_hotplug(); 4034 return NOTIFY_OK; 4035 } 4036 4037 /** 4038 * cpuset_init_smp - initialize cpus_allowed 4039 * 4040 * Description: Finish top cpuset after cpu, node maps are initialized 4041 */ 4042 void __init cpuset_init_smp(void) 4043 { 4044 /* 4045 * cpus_allowd/mems_allowed set to v2 values in the initial 4046 * cpuset_bind() call will be reset to v1 values in another 4047 * cpuset_bind() call when v1 cpuset is mounted. 4048 */ 4049 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; 4050 4051 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); 4052 top_cpuset.effective_mems = node_states[N_MEMORY]; 4053 4054 hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); 4055 4056 cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); 4057 BUG_ON(!cpuset_migrate_mm_wq); 4058 } 4059 4060 /** 4061 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 4062 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 4063 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. 4064 * 4065 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset 4066 * attached to the specified @tsk. Guaranteed to return some non-empty 4067 * subset of cpu_active_mask, even if this means going outside the 4068 * tasks cpuset, except when the task is in the top cpuset. 4069 **/ 4070 4071 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 4072 { 4073 unsigned long flags; 4074 struct cpuset *cs; 4075 4076 spin_lock_irqsave(&callback_lock, flags); 4077 rcu_read_lock(); 4078 4079 cs = task_cs(tsk); 4080 if (cs != &top_cpuset) 4081 guarantee_active_cpus(tsk, pmask); 4082 /* 4083 * Tasks in the top cpuset won't get update to their cpumasks 4084 * when a hotplug online/offline event happens. So we include all 4085 * offline cpus in the allowed cpu list. 4086 */ 4087 if ((cs == &top_cpuset) || cpumask_empty(pmask)) { 4088 const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); 4089 4090 /* 4091 * We first exclude cpus allocated to partitions. If there is no 4092 * allowable online cpu left, we fall back to all possible cpus. 4093 */ 4094 cpumask_andnot(pmask, possible_mask, subpartitions_cpus); 4095 if (!cpumask_intersects(pmask, cpu_active_mask)) 4096 cpumask_copy(pmask, possible_mask); 4097 } 4098 4099 rcu_read_unlock(); 4100 spin_unlock_irqrestore(&callback_lock, flags); 4101 } 4102 4103 /** 4104 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. 4105 * @tsk: pointer to task_struct with which the scheduler is struggling 4106 * 4107 * Description: In the case that the scheduler cannot find an allowed cpu in 4108 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy 4109 * mode however, this value is the same as task_cs(tsk)->effective_cpus, 4110 * which will not contain a sane cpumask during cases such as cpu hotplugging. 4111 * This is the absolute last resort for the scheduler and it is only used if 4112 * _every_ other avenue has been traveled. 4113 * 4114 * Returns true if the affinity of @tsk was changed, false otherwise. 4115 **/ 4116 4117 bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) 4118 { 4119 const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); 4120 const struct cpumask *cs_mask; 4121 bool changed = false; 4122 4123 rcu_read_lock(); 4124 cs_mask = task_cs(tsk)->cpus_allowed; 4125 if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { 4126 do_set_cpus_allowed(tsk, cs_mask); 4127 changed = true; 4128 } 4129 rcu_read_unlock(); 4130 4131 /* 4132 * We own tsk->cpus_allowed, nobody can change it under us. 4133 * 4134 * But we used cs && cs->cpus_allowed lockless and thus can 4135 * race with cgroup_attach_task() or update_cpumask() and get 4136 * the wrong tsk->cpus_allowed. However, both cases imply the 4137 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() 4138 * which takes task_rq_lock(). 4139 * 4140 * If we are called after it dropped the lock we must see all 4141 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary 4142 * set any mask even if it is not right from task_cs() pov, 4143 * the pending set_cpus_allowed_ptr() will fix things. 4144 * 4145 * select_fallback_rq() will fix things ups and set cpu_possible_mask 4146 * if required. 4147 */ 4148 return changed; 4149 } 4150 4151 void __init cpuset_init_current_mems_allowed(void) 4152 { 4153 nodes_setall(current->mems_allowed); 4154 } 4155 4156 /** 4157 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. 4158 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. 4159 * 4160 * Description: Returns the nodemask_t mems_allowed of the cpuset 4161 * attached to the specified @tsk. Guaranteed to return some non-empty 4162 * subset of node_states[N_MEMORY], even if this means going outside the 4163 * tasks cpuset. 4164 **/ 4165 4166 nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 4167 { 4168 nodemask_t mask; 4169 unsigned long flags; 4170 4171 spin_lock_irqsave(&callback_lock, flags); 4172 rcu_read_lock(); 4173 guarantee_online_mems(task_cs(tsk), &mask); 4174 rcu_read_unlock(); 4175 spin_unlock_irqrestore(&callback_lock, flags); 4176 4177 return mask; 4178 } 4179 4180 /** 4181 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed 4182 * @nodemask: the nodemask to be checked 4183 * 4184 * Are any of the nodes in the nodemask allowed in current->mems_allowed? 4185 */ 4186 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) 4187 { 4188 return nodes_intersects(*nodemask, current->mems_allowed); 4189 } 4190 4191 /* 4192 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or 4193 * mem_hardwall ancestor to the specified cpuset. Call holding 4194 * callback_lock. If no ancestor is mem_exclusive or mem_hardwall 4195 * (an unusual configuration), then returns the root cpuset. 4196 */ 4197 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) 4198 { 4199 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) 4200 cs = parent_cs(cs); 4201 return cs; 4202 } 4203 4204 /* 4205 * cpuset_current_node_allowed - Can current task allocate on a memory node? 4206 * @node: is this an allowed node? 4207 * @gfp_mask: memory allocation flags 4208 * 4209 * If we're in interrupt, yes, we can always allocate. If @node is set in 4210 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this 4211 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, 4212 * yes. If current has access to memory reserves as an oom victim, yes. 4213 * Otherwise, no. 4214 * 4215 * GFP_USER allocations are marked with the __GFP_HARDWALL bit, 4216 * and do not allow allocations outside the current tasks cpuset 4217 * unless the task has been OOM killed. 4218 * GFP_KERNEL allocations are not so marked, so can escape to the 4219 * nearest enclosing hardwalled ancestor cpuset. 4220 * 4221 * Scanning up parent cpusets requires callback_lock. The 4222 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit 4223 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the 4224 * current tasks mems_allowed came up empty on the first pass over 4225 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the 4226 * cpuset are short of memory, might require taking the callback_lock. 4227 * 4228 * The first call here from mm/page_alloc:get_page_from_freelist() 4229 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, 4230 * so no allocation on a node outside the cpuset is allowed (unless 4231 * in interrupt, of course). 4232 * 4233 * The second pass through get_page_from_freelist() doesn't even call 4234 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() 4235 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set 4236 * in alloc_flags. That logic and the checks below have the combined 4237 * affect that: 4238 * in_interrupt - any node ok (current task context irrelevant) 4239 * GFP_ATOMIC - any node ok 4240 * tsk_is_oom_victim - any node ok 4241 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok 4242 * GFP_USER - only nodes in current tasks mems allowed ok. 4243 */ 4244 bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) 4245 { 4246 struct cpuset *cs; /* current cpuset ancestors */ 4247 bool allowed; /* is allocation in zone z allowed? */ 4248 unsigned long flags; 4249 4250 if (in_interrupt()) 4251 return true; 4252 if (node_isset(node, current->mems_allowed)) 4253 return true; 4254 /* 4255 * Allow tasks that have access to memory reserves because they have 4256 * been OOM killed to get memory anywhere. 4257 */ 4258 if (unlikely(tsk_is_oom_victim(current))) 4259 return true; 4260 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ 4261 return false; 4262 4263 if (current->flags & PF_EXITING) /* Let dying task have memory */ 4264 return true; 4265 4266 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 4267 spin_lock_irqsave(&callback_lock, flags); 4268 4269 rcu_read_lock(); 4270 cs = nearest_hardwall_ancestor(task_cs(current)); 4271 allowed = node_isset(node, cs->mems_allowed); 4272 rcu_read_unlock(); 4273 4274 spin_unlock_irqrestore(&callback_lock, flags); 4275 return allowed; 4276 } 4277 4278 bool cpuset_node_allowed(struct cgroup *cgroup, int nid) 4279 { 4280 struct cgroup_subsys_state *css; 4281 struct cpuset *cs; 4282 bool allowed; 4283 4284 /* 4285 * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy 4286 * and mems_allowed is likely to be empty even if we could get to it, 4287 * so return true to avoid taking a global lock on the empty check. 4288 */ 4289 if (!cpuset_v2()) 4290 return true; 4291 4292 css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys); 4293 if (!css) 4294 return true; 4295 4296 /* 4297 * Normally, accessing effective_mems would require the cpuset_mutex 4298 * or callback_lock - but node_isset is atomic and the reference 4299 * taken via cgroup_get_e_css is sufficient to protect css. 4300 * 4301 * Since this interface is intended for use by migration paths, we 4302 * relax locking here to avoid taking global locks - while accepting 4303 * there may be rare scenarios where the result may be innaccurate. 4304 * 4305 * Reclaim and migration are subject to these same race conditions, and 4306 * cannot make strong isolation guarantees, so this is acceptable. 4307 */ 4308 cs = container_of(css, struct cpuset, css); 4309 allowed = node_isset(nid, cs->effective_mems); 4310 css_put(css); 4311 return allowed; 4312 } 4313 4314 /** 4315 * cpuset_spread_node() - On which node to begin search for a page 4316 * @rotor: round robin rotor 4317 * 4318 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for 4319 * tasks in a cpuset with is_spread_page or is_spread_slab set), 4320 * and if the memory allocation used cpuset_mem_spread_node() 4321 * to determine on which node to start looking, as it will for 4322 * certain page cache or slab cache pages such as used for file 4323 * system buffers and inode caches, then instead of starting on the 4324 * local node to look for a free page, rather spread the starting 4325 * node around the tasks mems_allowed nodes. 4326 * 4327 * We don't have to worry about the returned node being offline 4328 * because "it can't happen", and even if it did, it would be ok. 4329 * 4330 * The routines calling guarantee_online_mems() are careful to 4331 * only set nodes in task->mems_allowed that are online. So it 4332 * should not be possible for the following code to return an 4333 * offline node. But if it did, that would be ok, as this routine 4334 * is not returning the node where the allocation must be, only 4335 * the node where the search should start. The zonelist passed to 4336 * __alloc_pages() will include all nodes. If the slab allocator 4337 * is passed an offline node, it will fall back to the local node. 4338 * See kmem_cache_alloc_node(). 4339 */ 4340 static int cpuset_spread_node(int *rotor) 4341 { 4342 return *rotor = next_node_in(*rotor, current->mems_allowed); 4343 } 4344 4345 /** 4346 * cpuset_mem_spread_node() - On which node to begin search for a file page 4347 */ 4348 int cpuset_mem_spread_node(void) 4349 { 4350 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) 4351 current->cpuset_mem_spread_rotor = 4352 node_random(¤t->mems_allowed); 4353 4354 return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); 4355 } 4356 4357 /** 4358 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? 4359 * @tsk1: pointer to task_struct of some task. 4360 * @tsk2: pointer to task_struct of some other task. 4361 * 4362 * Description: Return true if @tsk1's mems_allowed intersects the 4363 * mems_allowed of @tsk2. Used by the OOM killer to determine if 4364 * one of the task's memory usage might impact the memory available 4365 * to the other. 4366 **/ 4367 4368 int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, 4369 const struct task_struct *tsk2) 4370 { 4371 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); 4372 } 4373 4374 /** 4375 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed 4376 * 4377 * Description: Prints current's name, cpuset name, and cached copy of its 4378 * mems_allowed to the kernel log. 4379 */ 4380 void cpuset_print_current_mems_allowed(void) 4381 { 4382 struct cgroup *cgrp; 4383 4384 rcu_read_lock(); 4385 4386 cgrp = task_cs(current)->css.cgroup; 4387 pr_cont(",cpuset="); 4388 pr_cont_cgroup_name(cgrp); 4389 pr_cont(",mems_allowed=%*pbl", 4390 nodemask_pr_args(¤t->mems_allowed)); 4391 4392 rcu_read_unlock(); 4393 } 4394 4395 /* Display task mems_allowed in /proc/<pid>/status file. */ 4396 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 4397 { 4398 seq_printf(m, "Mems_allowed:\t%*pb\n", 4399 nodemask_pr_args(&task->mems_allowed)); 4400 seq_printf(m, "Mems_allowed_list:\t%*pbl\n", 4401 nodemask_pr_args(&task->mems_allowed)); 4402 } 4403