1 /* 2 * kernel/cpuset.c 3 * 4 * Processor and Memory placement constraints for sets of tasks. 5 * 6 * Copyright (C) 2003 BULL SA. 7 * Copyright (C) 2004-2007 Silicon Graphics, Inc. 8 * Copyright (C) 2006 Google, Inc 9 * 10 * Portions derived from Patrick Mochel's sysfs code. 11 * sysfs is Copyright (c) 2001-3 Patrick Mochel 12 * 13 * 2003-10-10 Written by Simon Derr. 14 * 2003-10-22 Updates by Stephen Hemminger. 15 * 2004 May-July Rework by Paul Jackson. 16 * 2006 Rework by Paul Menage to use generic cgroups 17 * 2008 Rework of the scheduler domains and CPU hotplug handling 18 * by Max Krasnyansky 19 * 20 * This file is subject to the terms and conditions of the GNU General Public 21 * License. See the file COPYING in the main directory of the Linux 22 * distribution for more details. 23 */ 24 25 #include <linux/cpu.h> 26 #include <linux/cpumask.h> 27 #include <linux/cpuset.h> 28 #include <linux/delay.h> 29 #include <linux/init.h> 30 #include <linux/interrupt.h> 31 #include <linux/kernel.h> 32 #include <linux/mempolicy.h> 33 #include <linux/mm.h> 34 #include <linux/memory.h> 35 #include <linux/export.h> 36 #include <linux/rcupdate.h> 37 #include <linux/sched.h> 38 #include <linux/sched/deadline.h> 39 #include <linux/sched/mm.h> 40 #include <linux/sched/task.h> 41 #include <linux/security.h> 42 #include <linux/spinlock.h> 43 #include <linux/oom.h> 44 #include <linux/sched/isolation.h> 45 #include <linux/cgroup.h> 46 #include <linux/wait.h> 47 #include <linux/workqueue.h> 48 49 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); 50 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); 51 52 /* 53 * There could be abnormal cpuset configurations for cpu or memory 54 * node binding, add this key to provide a quick low-cost judgment 55 * of the situation. 56 */ 57 DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); 58 59 /* See "Frequency meter" comments, below. */ 60 61 struct fmeter { 62 int cnt; /* unprocessed events count */ 63 int val; /* most recent output value */ 64 time64_t time; /* clock (secs) when val computed */ 65 spinlock_t lock; /* guards read or write of above */ 66 }; 67 68 /* 69 * Invalid partition error code 70 */ 71 enum prs_errcode { 72 PERR_NONE = 0, 73 PERR_INVCPUS, 74 PERR_INVPARENT, 75 PERR_NOTPART, 76 PERR_NOTEXCL, 77 PERR_NOCPUS, 78 PERR_HOTPLUG, 79 PERR_CPUSEMPTY, 80 PERR_HKEEPING, 81 }; 82 83 static const char * const perr_strings[] = { 84 [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", 85 [PERR_INVPARENT] = "Parent is an invalid partition root", 86 [PERR_NOTPART] = "Parent is not a partition root", 87 [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", 88 [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", 89 [PERR_HOTPLUG] = "No cpu available due to hotplug", 90 [PERR_CPUSEMPTY] = "cpuset.cpus is empty", 91 [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", 92 }; 93 94 struct cpuset { 95 struct cgroup_subsys_state css; 96 97 unsigned long flags; /* "unsigned long" so bitops work */ 98 99 /* 100 * On default hierarchy: 101 * 102 * The user-configured masks can only be changed by writing to 103 * cpuset.cpus and cpuset.mems, and won't be limited by the 104 * parent masks. 105 * 106 * The effective masks is the real masks that apply to the tasks 107 * in the cpuset. They may be changed if the configured masks are 108 * changed or hotplug happens. 109 * 110 * effective_mask == configured_mask & parent's effective_mask, 111 * and if it ends up empty, it will inherit the parent's mask. 
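 *
 * An illustrative example of the rule above (hypothetical masks):
 *
 *	parent's effective_cpus  = 0-7
 *	configured cpuset.cpus   = 4-11
 *	resulting effective_cpus = 4-7
 *
 * and if the intersection were empty, effective_cpus would fall back to
 * the parent's 0-7.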
 *
 *
 * On legacy hierarchy:
 *
 * The user-configured masks are always the same as the effective masks.
 */

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
	 *
	 * These exclusive CPUs must be a subset of cpus_allowed. A parent
	 * cgroup can only grant exclusive CPUs to one of its children.
	 *
	 * When the cgroup becomes a valid partition root, effective_xcpus
	 * defaults to cpus_allowed if not set. The effective_cpus of a valid
	 * partition root comes solely from its effective_xcpus and some of the
	 * effective_xcpus may be distributed to sub-partitions below & hence
	 * excluded from its effective_cpus.
	 */
	cpumask_var_t effective_xcpus;

	/*
	 * Exclusive CPUs as requested by the user (default hierarchy only)
	 */
	cpumask_var_t exclusive_cpus;

	/*
	 * These are the old Memory Nodes that tasks took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have tasks' nodemask updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset. Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* number of valid sub-partitions */
	int nr_subparts;

	/* partition root state */
	int partition_root_state;

	/*
	 * Default hierarchy only:
	 * use_parent_ecpus - set if using parent's effective_cpus
	 * child_ecpus_count - # of children with use_parent_ecpus set
	 */
	int use_parent_ecpus;
	int child_ecpus_count;

	/*
	 * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
	 * know when to rebuild associated root domain bandwidth information.
 */
	int nr_deadline_tasks;
	int nr_migrate_dl_tasks;
	u64 sum_migrate_dl_bw;

	/* Invalid partition error code, not lock protected */
	enum prs_errcode prs_err;

	/* Handle for cpuset.cpus.partition */
	struct cgroup_file partition_file;

	/* Remote partition sibling list anchored at remote_children */
	struct list_head remote_sibling;
};

/*
 * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
 */
struct cpuset_remove_tasks_struct {
	struct work_struct work;
	struct cpuset *cs;
};

/*
 * Exclusive CPUs distributed out to sub-partitions of top_cpuset
 */
static cpumask_var_t	subpartitions_cpus;

/*
 * Exclusive CPUs in isolated partitions
 */
static cpumask_var_t	isolated_cpus;

/* List of remote partition root children */
static struct list_head	remote_children;

/*
 * Partition root states:
 *
 *   0 - member (not a partition root)
 *   1 - partition root
 *   2 - partition root without load balancing (isolated)
 *  -1 - invalid partition root
 *  -2 - invalid isolated partition root
 */
#define PRS_MEMBER		0
#define PRS_ROOT		1
#define PRS_ISOLATED		2
#define PRS_INVALID_ROOT	-1
#define PRS_INVALID_ISOLATED	-2

static inline bool is_prs_invalid(int prs_state)
{
	return prs_state < 0;
}

/*
 * Temporary cpumasks for working with partitions that are passed among
 * functions to avoid memory allocation in inner functions.
 */
struct tmpmasks {
	cpumask_var_t addmask, delmask;	/* For partition root */
	cpumask_var_t new_cpus;		/* For update_cpumasks_hier() */
};

static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
	return css ?
container_of(css, struct cpuset, css) : NULL; 257 } 258 259 /* Retrieve the cpuset for a task */ 260 static inline struct cpuset *task_cs(struct task_struct *task) 261 { 262 return css_cs(task_css(task, cpuset_cgrp_id)); 263 } 264 265 static inline struct cpuset *parent_cs(struct cpuset *cs) 266 { 267 return css_cs(cs->css.parent); 268 } 269 270 void inc_dl_tasks_cs(struct task_struct *p) 271 { 272 struct cpuset *cs = task_cs(p); 273 274 cs->nr_deadline_tasks++; 275 } 276 277 void dec_dl_tasks_cs(struct task_struct *p) 278 { 279 struct cpuset *cs = task_cs(p); 280 281 cs->nr_deadline_tasks--; 282 } 283 284 /* bits in struct cpuset flags field */ 285 typedef enum { 286 CS_ONLINE, 287 CS_CPU_EXCLUSIVE, 288 CS_MEM_EXCLUSIVE, 289 CS_MEM_HARDWALL, 290 CS_MEMORY_MIGRATE, 291 CS_SCHED_LOAD_BALANCE, 292 CS_SPREAD_PAGE, 293 CS_SPREAD_SLAB, 294 } cpuset_flagbits_t; 295 296 /* convenient tests for these bits */ 297 static inline bool is_cpuset_online(struct cpuset *cs) 298 { 299 return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); 300 } 301 302 static inline int is_cpu_exclusive(const struct cpuset *cs) 303 { 304 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); 305 } 306 307 static inline int is_mem_exclusive(const struct cpuset *cs) 308 { 309 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); 310 } 311 312 static inline int is_mem_hardwall(const struct cpuset *cs) 313 { 314 return test_bit(CS_MEM_HARDWALL, &cs->flags); 315 } 316 317 static inline int is_sched_load_balance(const struct cpuset *cs) 318 { 319 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 320 } 321 322 static inline int is_memory_migrate(const struct cpuset *cs) 323 { 324 return test_bit(CS_MEMORY_MIGRATE, &cs->flags); 325 } 326 327 static inline int is_spread_page(const struct cpuset *cs) 328 { 329 return test_bit(CS_SPREAD_PAGE, &cs->flags); 330 } 331 332 static inline int is_spread_slab(const struct cpuset *cs) 333 { 334 return test_bit(CS_SPREAD_SLAB, &cs->flags); 335 } 336 337 static inline int is_partition_valid(const struct cpuset *cs) 338 { 339 return cs->partition_root_state > 0; 340 } 341 342 static inline int is_partition_invalid(const struct cpuset *cs) 343 { 344 return cs->partition_root_state < 0; 345 } 346 347 /* 348 * Callers should hold callback_lock to modify partition_root_state. 349 */ 350 static inline void make_partition_invalid(struct cpuset *cs) 351 { 352 if (cs->partition_root_state > 0) 353 cs->partition_root_state = -cs->partition_root_state; 354 } 355 356 /* 357 * Send notification event of whenever partition_root_state changes. 358 */ 359 static inline void notify_partition_change(struct cpuset *cs, int old_prs) 360 { 361 if (old_prs == cs->partition_root_state) 362 return; 363 cgroup_file_notify(&cs->partition_file); 364 365 /* Reset prs_err if not invalid */ 366 if (is_partition_valid(cs)) 367 WRITE_ONCE(cs->prs_err, PERR_NONE); 368 } 369 370 static struct cpuset top_cpuset = { 371 .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | 372 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), 373 .partition_root_state = PRS_ROOT, 374 .relax_domain_level = -1, 375 .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), 376 }; 377 378 /** 379 * cpuset_for_each_child - traverse online children of a cpuset 380 * @child_cs: loop cursor pointing to the current child 381 * @pos_css: used for iteration 382 * @parent_cs: target cpuset to walk children of 383 * 384 * Walk @child_cs through the online children of @parent_cs. Must be used 385 * with RCU read locked. 
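 *
 * Minimal usage sketch (the names child, pos_css, parent and the callee
 * are placeholders, not part of this file):
 *
 *	rcu_read_lock();
 *	cpuset_for_each_child(child, pos_css, parent)
 *		inspect_one_child(child);
 *	rcu_read_unlock();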
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs. Must be used
 * with RCU read locked. The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip a subtree. @root_cs is included in the
 * iteration and is the first node to be visited.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)	\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock. We also require taking task_lock() when dereferencing a
 * task's cpuset pointer. See "The task_lock() exception", at the end of this
 * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems
 * can use cpuset_lock()/cpuset_unlock() to prevent changes to cpuset
 * structures. Note that cpuset_mutex needs to be a mutex as it is used in
 * paths that rely on priority inheritance (e.g. scheduler - on RT) for
 * correctness.
 *
 * A task must hold both locks to modify cpusets. If a task holds
 * cpuset_mutex, it blocks others, ensuring that it is the only task able to
 * also acquire callback_lock and be able to modify cpusets. It can perform
 * various checks on the cpuset structure first, knowing nothing will change.
 * It can also allocate memory while just holding cpuset_mutex. While it is
 * performing these checks, various callback routines can briefly acquire
 * callback_lock to query cpusets. Once it is ready to make the changes, it
 * takes callback_lock, blocking everyone else.
 *
 * Calls to the kernel memory allocator cannot be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * The task_struct fields mems_allowed and mempolicy may be changed by
 * another task; we use alloc_lock in the task_struct to protect them.
 *
 * The cpuset_common_file_read() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
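 *
 * A minimal sketch of the writer-side ordering described above
 * (illustrative only, not a complete update path):
 *
 *	mutex_lock(&cpuset_mutex);
 *	... validate the change and allocate any needed memory ...
 *	spin_lock_irq(&callback_lock);
 *	... publish the new cpus/mems masks ...
 *	spin_unlock_irq(&callback_lock);
 *	mutex_unlock(&cpuset_mutex);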
440 * 441 * Accessing a task's cpuset should be done in accordance with the 442 * guidelines for accessing subsystem state in kernel/cgroup.c 443 */ 444 445 static DEFINE_MUTEX(cpuset_mutex); 446 447 void cpuset_lock(void) 448 { 449 mutex_lock(&cpuset_mutex); 450 } 451 452 void cpuset_unlock(void) 453 { 454 mutex_unlock(&cpuset_mutex); 455 } 456 457 static DEFINE_SPINLOCK(callback_lock); 458 459 static struct workqueue_struct *cpuset_migrate_mm_wq; 460 461 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); 462 463 static inline void check_insane_mems_config(nodemask_t *nodes) 464 { 465 if (!cpusets_insane_config() && 466 movable_only_nodes(nodes)) { 467 static_branch_enable(&cpusets_insane_config_key); 468 pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" 469 "Cpuset allocations might fail even with a lot of memory available.\n", 470 nodemask_pr_args(nodes)); 471 } 472 } 473 474 /* 475 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when 476 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting 477 * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. 478 * With v2 behavior, "cpus" and "mems" are always what the users have 479 * requested and won't be changed by hotplug events. Only the effective 480 * cpus or mems will be affected. 481 */ 482 static inline bool is_in_v2_mode(void) 483 { 484 return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || 485 (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); 486 } 487 488 /** 489 * partition_is_populated - check if partition has tasks 490 * @cs: partition root to be checked 491 * @excluded_child: a child cpuset to be excluded in task checking 492 * Return: true if there are tasks, false otherwise 493 * 494 * It is assumed that @cs is a valid partition root. @excluded_child should 495 * be non-NULL when this cpuset is going to become a partition itself. 496 */ 497 static inline bool partition_is_populated(struct cpuset *cs, 498 struct cpuset *excluded_child) 499 { 500 struct cgroup_subsys_state *css; 501 struct cpuset *child; 502 503 if (cs->css.cgroup->nr_populated_csets) 504 return true; 505 if (!excluded_child && !cs->nr_subparts) 506 return cgroup_is_populated(cs->css.cgroup); 507 508 rcu_read_lock(); 509 cpuset_for_each_child(child, css, cs) { 510 if (child == excluded_child) 511 continue; 512 if (is_partition_valid(child)) 513 continue; 514 if (cgroup_is_populated(child->css.cgroup)) { 515 rcu_read_unlock(); 516 return true; 517 } 518 } 519 rcu_read_unlock(); 520 return false; 521 } 522 523 /* 524 * Return in pmask the portion of a task's cpusets's cpus_allowed that 525 * are online and are capable of running the task. If none are found, 526 * walk up the cpuset hierarchy until we find one that does have some 527 * appropriate cpus. 528 * 529 * One way or another, we guarantee to return some non-empty subset 530 * of cpu_online_mask. 531 * 532 * Call with callback_lock or cpuset_mutex held. 
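 *
 * For example (hypothetical masks): if the task's cpuset has
 * effective_cpus = 4-7 while only CPUs 0-3 are online, the walk moves up
 * to the first ancestor whose effective_cpus intersect the online mask,
 * e.g. one with effective_cpus = 0-7, and returns 0-3.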
 */
static void guarantee_online_cpus(struct task_struct *tsk,
				  struct cpumask *pmask)
{
	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
	struct cpuset *cs;

	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
		cpumask_copy(pmask, cpu_online_mask);

	rcu_read_lock();
	cs = task_cs(tsk);

	while (!cpumask_intersects(cs->effective_cpus, pmask))
		cs = parent_cs(cs);

	cpumask_and(pmask, pmask, cs->effective_cpus);
	rcu_read_unlock();
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory. If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems. The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/*
 * Update a task's spread flags if the cpuset's page/slab spread flags are set
 *
 * Call with callback_lock or cpuset_mutex held. The check can be skipped
 * if on default hierarchy.
 */
static void cpuset_update_task_spread_flags(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
		return;

	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set. Call holding cpuset_mutex.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_cpumasks - allocate the cpumasks of a cpuset or a tmpmasks structure
 * @cs: the cpuset that has cpumasks to be allocated
 * @tmp: the tmpmasks structure pointer
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Only one of the two input arguments should be non-NULL.
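 *
 * For example, alloc_cpumasks(cs, NULL) allocates the four cpumasks
 * embedded in @cs, while alloc_cpumasks(NULL, tmp) allocates the three
 * masks of a struct tmpmasks.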
617 */ 618 static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) 619 { 620 cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4; 621 622 if (cs) { 623 pmask1 = &cs->cpus_allowed; 624 pmask2 = &cs->effective_cpus; 625 pmask3 = &cs->effective_xcpus; 626 pmask4 = &cs->exclusive_cpus; 627 } else { 628 pmask1 = &tmp->new_cpus; 629 pmask2 = &tmp->addmask; 630 pmask3 = &tmp->delmask; 631 pmask4 = NULL; 632 } 633 634 if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) 635 return -ENOMEM; 636 637 if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) 638 goto free_one; 639 640 if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) 641 goto free_two; 642 643 if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL)) 644 goto free_three; 645 646 647 return 0; 648 649 free_three: 650 free_cpumask_var(*pmask3); 651 free_two: 652 free_cpumask_var(*pmask2); 653 free_one: 654 free_cpumask_var(*pmask1); 655 return -ENOMEM; 656 } 657 658 /** 659 * free_cpumasks - free cpumasks in a tmpmasks structure 660 * @cs: the cpuset that have cpumasks to be free. 661 * @tmp: the tmpmasks structure pointer 662 */ 663 static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) 664 { 665 if (cs) { 666 free_cpumask_var(cs->cpus_allowed); 667 free_cpumask_var(cs->effective_cpus); 668 free_cpumask_var(cs->effective_xcpus); 669 free_cpumask_var(cs->exclusive_cpus); 670 } 671 if (tmp) { 672 free_cpumask_var(tmp->new_cpus); 673 free_cpumask_var(tmp->addmask); 674 free_cpumask_var(tmp->delmask); 675 } 676 } 677 678 /** 679 * alloc_trial_cpuset - allocate a trial cpuset 680 * @cs: the cpuset that the trial cpuset duplicates 681 */ 682 static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) 683 { 684 struct cpuset *trial; 685 686 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); 687 if (!trial) 688 return NULL; 689 690 if (alloc_cpumasks(trial, NULL)) { 691 kfree(trial); 692 return NULL; 693 } 694 695 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); 696 cpumask_copy(trial->effective_cpus, cs->effective_cpus); 697 cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); 698 cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); 699 return trial; 700 } 701 702 /** 703 * free_cpuset - free the cpuset 704 * @cs: the cpuset to be freed 705 */ 706 static inline void free_cpuset(struct cpuset *cs) 707 { 708 free_cpumasks(cs, NULL); 709 kfree(cs); 710 } 711 712 static inline struct cpumask *fetch_xcpus(struct cpuset *cs) 713 { 714 return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus : 715 cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed 716 : cs->effective_xcpus; 717 } 718 719 /* 720 * cpusets_are_exclusive() - check if two cpusets are exclusive 721 * 722 * Return true if exclusive, false if not 723 */ 724 static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) 725 { 726 struct cpumask *xcpus1 = fetch_xcpus(cs1); 727 struct cpumask *xcpus2 = fetch_xcpus(cs2); 728 729 if (cpumask_intersects(xcpus1, xcpus2)) 730 return false; 731 return true; 732 } 733 734 /* 735 * validate_change_legacy() - Validate conditions specific to legacy (v1) 736 * behavior. 
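 *
 * For example (hypothetical masks): a child with cpus_allowed = 0-3 under
 * a parent with cpus_allowed = 0-7 is a valid subset; a trial value of
 * 4-11 for the child would fail with -EACCES because it is no longer a
 * subset of its parent.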
737 */ 738 static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) 739 { 740 struct cgroup_subsys_state *css; 741 struct cpuset *c, *par; 742 int ret; 743 744 WARN_ON_ONCE(!rcu_read_lock_held()); 745 746 /* Each of our child cpusets must be a subset of us */ 747 ret = -EBUSY; 748 cpuset_for_each_child(c, css, cur) 749 if (!is_cpuset_subset(c, trial)) 750 goto out; 751 752 /* On legacy hierarchy, we must be a subset of our parent cpuset. */ 753 ret = -EACCES; 754 par = parent_cs(cur); 755 if (par && !is_cpuset_subset(trial, par)) 756 goto out; 757 758 ret = 0; 759 out: 760 return ret; 761 } 762 763 /* 764 * validate_change() - Used to validate that any proposed cpuset change 765 * follows the structural rules for cpusets. 766 * 767 * If we replaced the flag and mask values of the current cpuset 768 * (cur) with those values in the trial cpuset (trial), would 769 * our various subset and exclusive rules still be valid? Presumes 770 * cpuset_mutex held. 771 * 772 * 'cur' is the address of an actual, in-use cpuset. Operations 773 * such as list traversal that depend on the actual address of the 774 * cpuset in the list must use cur below, not trial. 775 * 776 * 'trial' is the address of bulk structure copy of cur, with 777 * perhaps one or more of the fields cpus_allowed, mems_allowed, 778 * or flags changed to new, trial values. 779 * 780 * Return 0 if valid, -errno if not. 781 */ 782 783 static int validate_change(struct cpuset *cur, struct cpuset *trial) 784 { 785 struct cgroup_subsys_state *css; 786 struct cpuset *c, *par; 787 int ret = 0; 788 789 rcu_read_lock(); 790 791 if (!is_in_v2_mode()) 792 ret = validate_change_legacy(cur, trial); 793 if (ret) 794 goto out; 795 796 /* Remaining checks don't apply to root cpuset */ 797 if (cur == &top_cpuset) 798 goto out; 799 800 par = parent_cs(cur); 801 802 /* 803 * Cpusets with tasks - existing or newly being attached - can't 804 * be changed to have empty cpus_allowed or mems_allowed. 805 */ 806 ret = -ENOSPC; 807 if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { 808 if (!cpumask_empty(cur->cpus_allowed) && 809 cpumask_empty(trial->cpus_allowed)) 810 goto out; 811 if (!nodes_empty(cur->mems_allowed) && 812 nodes_empty(trial->mems_allowed)) 813 goto out; 814 } 815 816 /* 817 * We can't shrink if we won't have enough room for SCHED_DEADLINE 818 * tasks. 819 */ 820 ret = -EBUSY; 821 if (is_cpu_exclusive(cur) && 822 !cpuset_cpumask_can_shrink(cur->cpus_allowed, 823 trial->cpus_allowed)) 824 goto out; 825 826 /* 827 * If either I or some sibling (!= me) is exclusive, we can't 828 * overlap 829 */ 830 ret = -EINVAL; 831 cpuset_for_each_child(c, css, par) { 832 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 833 c != cur) { 834 if (!cpusets_are_exclusive(trial, c)) 835 goto out; 836 } 837 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && 838 c != cur && 839 nodes_intersects(trial->mems_allowed, c->mems_allowed)) 840 goto out; 841 } 842 843 ret = 0; 844 out: 845 rcu_read_unlock(); 846 return ret; 847 } 848 849 #ifdef CONFIG_SMP 850 /* 851 * Helper routine for generate_sched_domains(). 852 * Do cpusets a, b have overlapping effective cpus_allowed masks? 
853 */ 854 static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 855 { 856 return cpumask_intersects(a->effective_cpus, b->effective_cpus); 857 } 858 859 static void 860 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) 861 { 862 if (dattr->relax_domain_level < c->relax_domain_level) 863 dattr->relax_domain_level = c->relax_domain_level; 864 return; 865 } 866 867 static void update_domain_attr_tree(struct sched_domain_attr *dattr, 868 struct cpuset *root_cs) 869 { 870 struct cpuset *cp; 871 struct cgroup_subsys_state *pos_css; 872 873 rcu_read_lock(); 874 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { 875 /* skip the whole subtree if @cp doesn't have any CPU */ 876 if (cpumask_empty(cp->cpus_allowed)) { 877 pos_css = css_rightmost_descendant(pos_css); 878 continue; 879 } 880 881 if (is_sched_load_balance(cp)) 882 update_domain_attr(dattr, cp); 883 } 884 rcu_read_unlock(); 885 } 886 887 /* Must be called with cpuset_mutex held. */ 888 static inline int nr_cpusets(void) 889 { 890 /* jump label reference count + the top-level cpuset */ 891 return static_key_count(&cpusets_enabled_key.key) + 1; 892 } 893 894 /* 895 * generate_sched_domains() 896 * 897 * This function builds a partial partition of the systems CPUs 898 * A 'partial partition' is a set of non-overlapping subsets whose 899 * union is a subset of that set. 900 * The output of this function needs to be passed to kernel/sched/core.c 901 * partition_sched_domains() routine, which will rebuild the scheduler's 902 * load balancing domains (sched domains) as specified by that partial 903 * partition. 904 * 905 * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst 906 * for a background explanation of this. 907 * 908 * Does not return errors, on the theory that the callers of this 909 * routine would rather not worry about failures to rebuild sched 910 * domains when operating in the severe memory shortage situations 911 * that could cause allocation failures below. 912 * 913 * Must be called with cpuset_mutex held. 914 * 915 * The three key local variables below are: 916 * cp - cpuset pointer, used (together with pos_css) to perform a 917 * top-down scan of all cpusets. For our purposes, rebuilding 918 * the schedulers sched domains, we can ignore !is_sched_load_ 919 * balance cpusets. 920 * csa - (for CpuSet Array) Array of pointers to all the cpusets 921 * that need to be load balanced, for convenient iterative 922 * access by the subsequent code that finds the best partition, 923 * i.e the set of domains (subsets) of CPUs such that the 924 * cpus_allowed of every cpuset marked is_sched_load_balance 925 * is a subset of one of these domains, while there are as 926 * many such domains as possible, each as small as possible. 927 * doms - Conversion of 'csa' to an array of cpumasks, for passing to 928 * the kernel/sched/core.c routine partition_sched_domains() in a 929 * convenient format, that can be easily compared to the prior 930 * value to determine what partition elements (sched domains) 931 * were changed (added or removed.) 932 * 933 * Finding the best partition (set of domains): 934 * The triple nested loops below over i, j, k scan over the 935 * load balanced cpusets (using the array of cpuset pointers in 936 * csa[]) looking for pairs of cpusets that have overlapping 937 * cpus_allowed, but which don't have the same 'pn' partition 938 * number and gives them in the same partition number. 
It keeps 939 * looping on the 'restart' label until it can no longer find 940 * any such pairs. 941 * 942 * The union of the cpus_allowed masks from the set of 943 * all cpusets having the same 'pn' value then form the one 944 * element of the partition (one sched domain) to be passed to 945 * partition_sched_domains(). 946 */ 947 static int generate_sched_domains(cpumask_var_t **domains, 948 struct sched_domain_attr **attributes) 949 { 950 struct cpuset *cp; /* top-down scan of cpusets */ 951 struct cpuset **csa; /* array of all cpuset ptrs */ 952 int csn; /* how many cpuset ptrs in csa so far */ 953 int i, j, k; /* indices for partition finding loops */ 954 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ 955 struct sched_domain_attr *dattr; /* attributes for custom domains */ 956 int ndoms = 0; /* number of sched domains in result */ 957 int nslot; /* next empty doms[] struct cpumask slot */ 958 struct cgroup_subsys_state *pos_css; 959 bool root_load_balance = is_sched_load_balance(&top_cpuset); 960 961 doms = NULL; 962 dattr = NULL; 963 csa = NULL; 964 965 /* Special case for the 99% of systems with one, full, sched domain */ 966 if (root_load_balance && !top_cpuset.nr_subparts) { 967 ndoms = 1; 968 doms = alloc_sched_domains(ndoms); 969 if (!doms) 970 goto done; 971 972 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); 973 if (dattr) { 974 *dattr = SD_ATTR_INIT; 975 update_domain_attr_tree(dattr, &top_cpuset); 976 } 977 cpumask_and(doms[0], top_cpuset.effective_cpus, 978 housekeeping_cpumask(HK_TYPE_DOMAIN)); 979 980 goto done; 981 } 982 983 csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); 984 if (!csa) 985 goto done; 986 csn = 0; 987 988 rcu_read_lock(); 989 if (root_load_balance) 990 csa[csn++] = &top_cpuset; 991 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { 992 if (cp == &top_cpuset) 993 continue; 994 /* 995 * Continue traversing beyond @cp iff @cp has some CPUs and 996 * isn't load balancing. The former is obvious. The 997 * latter: All child cpusets contain a subset of the 998 * parent's cpus, so just skip them, and then we call 999 * update_domain_attr_tree() to calc relax_domain_level of 1000 * the corresponding sched domain. 1001 * 1002 * If root is load-balancing, we can skip @cp if it 1003 * is a subset of the root's effective_cpus. 
1004 */ 1005 if (!cpumask_empty(cp->cpus_allowed) && 1006 !(is_sched_load_balance(cp) && 1007 cpumask_intersects(cp->cpus_allowed, 1008 housekeeping_cpumask(HK_TYPE_DOMAIN)))) 1009 continue; 1010 1011 if (root_load_balance && 1012 cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus)) 1013 continue; 1014 1015 if (is_sched_load_balance(cp) && 1016 !cpumask_empty(cp->effective_cpus)) 1017 csa[csn++] = cp; 1018 1019 /* skip @cp's subtree if not a partition root */ 1020 if (!is_partition_valid(cp)) 1021 pos_css = css_rightmost_descendant(pos_css); 1022 } 1023 rcu_read_unlock(); 1024 1025 for (i = 0; i < csn; i++) 1026 csa[i]->pn = i; 1027 ndoms = csn; 1028 1029 restart: 1030 /* Find the best partition (set of sched domains) */ 1031 for (i = 0; i < csn; i++) { 1032 struct cpuset *a = csa[i]; 1033 int apn = a->pn; 1034 1035 for (j = 0; j < csn; j++) { 1036 struct cpuset *b = csa[j]; 1037 int bpn = b->pn; 1038 1039 if (apn != bpn && cpusets_overlap(a, b)) { 1040 for (k = 0; k < csn; k++) { 1041 struct cpuset *c = csa[k]; 1042 1043 if (c->pn == bpn) 1044 c->pn = apn; 1045 } 1046 ndoms--; /* one less element */ 1047 goto restart; 1048 } 1049 } 1050 } 1051 1052 /* 1053 * Now we know how many domains to create. 1054 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 1055 */ 1056 doms = alloc_sched_domains(ndoms); 1057 if (!doms) 1058 goto done; 1059 1060 /* 1061 * The rest of the code, including the scheduler, can deal with 1062 * dattr==NULL case. No need to abort if alloc fails. 1063 */ 1064 dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), 1065 GFP_KERNEL); 1066 1067 for (nslot = 0, i = 0; i < csn; i++) { 1068 struct cpuset *a = csa[i]; 1069 struct cpumask *dp; 1070 int apn = a->pn; 1071 1072 if (apn < 0) { 1073 /* Skip completed partitions */ 1074 continue; 1075 } 1076 1077 dp = doms[nslot]; 1078 1079 if (nslot == ndoms) { 1080 static int warnings = 10; 1081 if (warnings) { 1082 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", 1083 nslot, ndoms, csn, i, apn); 1084 warnings--; 1085 } 1086 continue; 1087 } 1088 1089 cpumask_clear(dp); 1090 if (dattr) 1091 *(dattr + nslot) = SD_ATTR_INIT; 1092 for (j = i; j < csn; j++) { 1093 struct cpuset *b = csa[j]; 1094 1095 if (apn == b->pn) { 1096 cpumask_or(dp, dp, b->effective_cpus); 1097 cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); 1098 if (dattr) 1099 update_domain_attr_tree(dattr + nslot, b); 1100 1101 /* Done with this partition */ 1102 b->pn = -1; 1103 } 1104 } 1105 nslot++; 1106 } 1107 BUG_ON(nslot != ndoms); 1108 1109 done: 1110 kfree(csa); 1111 1112 /* 1113 * Fallback to the default domain if kmalloc() failed. 1114 * See comments in partition_sched_domains(). 
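 *
 * Worked example of the partitioning performed above (hypothetical
 * cpusets): with three load-balanced cpusets whose effective CPUs are
 * 0-3, 2-5 and 8-11, the first two overlap and are merged into the same
 * pn, yielding two sched domains, 0-5 and 8-11, so ndoms == 2.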
1115 */ 1116 if (doms == NULL) 1117 ndoms = 1; 1118 1119 *domains = doms; 1120 *attributes = dattr; 1121 return ndoms; 1122 } 1123 1124 static void dl_update_tasks_root_domain(struct cpuset *cs) 1125 { 1126 struct css_task_iter it; 1127 struct task_struct *task; 1128 1129 if (cs->nr_deadline_tasks == 0) 1130 return; 1131 1132 css_task_iter_start(&cs->css, 0, &it); 1133 1134 while ((task = css_task_iter_next(&it))) 1135 dl_add_task_root_domain(task); 1136 1137 css_task_iter_end(&it); 1138 } 1139 1140 static void dl_rebuild_rd_accounting(void) 1141 { 1142 struct cpuset *cs = NULL; 1143 struct cgroup_subsys_state *pos_css; 1144 1145 lockdep_assert_held(&cpuset_mutex); 1146 lockdep_assert_cpus_held(); 1147 lockdep_assert_held(&sched_domains_mutex); 1148 1149 rcu_read_lock(); 1150 1151 /* 1152 * Clear default root domain DL accounting, it will be computed again 1153 * if a task belongs to it. 1154 */ 1155 dl_clear_root_domain(&def_root_domain); 1156 1157 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { 1158 1159 if (cpumask_empty(cs->effective_cpus)) { 1160 pos_css = css_rightmost_descendant(pos_css); 1161 continue; 1162 } 1163 1164 css_get(&cs->css); 1165 1166 rcu_read_unlock(); 1167 1168 dl_update_tasks_root_domain(cs); 1169 1170 rcu_read_lock(); 1171 css_put(&cs->css); 1172 } 1173 rcu_read_unlock(); 1174 } 1175 1176 static void 1177 partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 1178 struct sched_domain_attr *dattr_new) 1179 { 1180 mutex_lock(&sched_domains_mutex); 1181 partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); 1182 dl_rebuild_rd_accounting(); 1183 mutex_unlock(&sched_domains_mutex); 1184 } 1185 1186 /* 1187 * Rebuild scheduler domains. 1188 * 1189 * If the flag 'sched_load_balance' of any cpuset with non-empty 1190 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset 1191 * which has that flag enabled, or if any cpuset with a non-empty 1192 * 'cpus' is removed, then call this routine to rebuild the 1193 * scheduler's dynamic sched domains. 1194 * 1195 * Call with cpuset_mutex held. Takes cpus_read_lock(). 1196 */ 1197 static void rebuild_sched_domains_locked(void) 1198 { 1199 struct cgroup_subsys_state *pos_css; 1200 struct sched_domain_attr *attr; 1201 cpumask_var_t *doms; 1202 struct cpuset *cs; 1203 int ndoms; 1204 1205 lockdep_assert_cpus_held(); 1206 lockdep_assert_held(&cpuset_mutex); 1207 1208 /* 1209 * If we have raced with CPU hotplug, return early to avoid 1210 * passing doms with offlined cpu to partition_sched_domains(). 1211 * Anyways, cpuset_handle_hotplug() will rebuild sched domains. 1212 * 1213 * With no CPUs in any subpartitions, top_cpuset's effective CPUs 1214 * should be the same as the active CPUs, so checking only top_cpuset 1215 * is enough to detect racing CPU offlines. 1216 */ 1217 if (cpumask_empty(subpartitions_cpus) && 1218 !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) 1219 return; 1220 1221 /* 1222 * With subpartition CPUs, however, the effective CPUs of a partition 1223 * root should be only a subset of the active CPUs. Since a CPU in any 1224 * partition root could be offlined, all must be checked. 
1225 */ 1226 if (top_cpuset.nr_subparts) { 1227 rcu_read_lock(); 1228 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { 1229 if (!is_partition_valid(cs)) { 1230 pos_css = css_rightmost_descendant(pos_css); 1231 continue; 1232 } 1233 if (!cpumask_subset(cs->effective_cpus, 1234 cpu_active_mask)) { 1235 rcu_read_unlock(); 1236 return; 1237 } 1238 } 1239 rcu_read_unlock(); 1240 } 1241 1242 /* Generate domain masks and attrs */ 1243 ndoms = generate_sched_domains(&doms, &attr); 1244 1245 /* Have scheduler rebuild the domains */ 1246 partition_and_rebuild_sched_domains(ndoms, doms, attr); 1247 } 1248 #else /* !CONFIG_SMP */ 1249 static void rebuild_sched_domains_locked(void) 1250 { 1251 } 1252 #endif /* CONFIG_SMP */ 1253 1254 static void rebuild_sched_domains_cpuslocked(void) 1255 { 1256 mutex_lock(&cpuset_mutex); 1257 rebuild_sched_domains_locked(); 1258 mutex_unlock(&cpuset_mutex); 1259 } 1260 1261 void rebuild_sched_domains(void) 1262 { 1263 cpus_read_lock(); 1264 rebuild_sched_domains_cpuslocked(); 1265 cpus_read_unlock(); 1266 } 1267 1268 /** 1269 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 1270 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 1271 * @new_cpus: the temp variable for the new effective_cpus mask 1272 * 1273 * Iterate through each task of @cs updating its cpus_allowed to the 1274 * effective cpuset's. As this function is called with cpuset_mutex held, 1275 * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask() 1276 * is used instead of effective_cpus to make sure all offline CPUs are also 1277 * included as hotplug code won't update cpumasks for tasks in top_cpuset. 1278 */ 1279 static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) 1280 { 1281 struct css_task_iter it; 1282 struct task_struct *task; 1283 bool top_cs = cs == &top_cpuset; 1284 1285 css_task_iter_start(&cs->css, 0, &it); 1286 while ((task = css_task_iter_next(&it))) { 1287 const struct cpumask *possible_mask = task_cpu_possible_mask(task); 1288 1289 if (top_cs) { 1290 /* 1291 * Percpu kthreads in top_cpuset are ignored 1292 */ 1293 if (kthread_is_per_cpu(task)) 1294 continue; 1295 cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); 1296 } else { 1297 cpumask_and(new_cpus, possible_mask, cs->effective_cpus); 1298 } 1299 set_cpus_allowed_ptr(task, new_cpus); 1300 } 1301 css_task_iter_end(&it); 1302 } 1303 1304 /** 1305 * compute_effective_cpumask - Compute the effective cpumask of the cpuset 1306 * @new_cpus: the temp variable for the new effective_cpus mask 1307 * @cs: the cpuset the need to recompute the new effective_cpus mask 1308 * @parent: the parent cpuset 1309 * 1310 * The result is valid only if the given cpuset isn't a partition root. 
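 *
 * For example (hypothetical masks): with cs->cpus_allowed = 2-5 and
 * parent->effective_cpus = 0-3, the computed new_cpus is 2-3.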
1311 */ 1312 static void compute_effective_cpumask(struct cpumask *new_cpus, 1313 struct cpuset *cs, struct cpuset *parent) 1314 { 1315 cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); 1316 } 1317 1318 /* 1319 * Commands for update_parent_effective_cpumask 1320 */ 1321 enum partition_cmd { 1322 partcmd_enable, /* Enable partition root */ 1323 partcmd_enablei, /* Enable isolated partition root */ 1324 partcmd_disable, /* Disable partition root */ 1325 partcmd_update, /* Update parent's effective_cpus */ 1326 partcmd_invalidate, /* Make partition invalid */ 1327 }; 1328 1329 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 1330 int turning_on); 1331 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, 1332 struct tmpmasks *tmp); 1333 1334 /* 1335 * Update partition exclusive flag 1336 * 1337 * Return: 0 if successful, an error code otherwise 1338 */ 1339 static int update_partition_exclusive(struct cpuset *cs, int new_prs) 1340 { 1341 bool exclusive = (new_prs > 0); 1342 1343 if (exclusive && !is_cpu_exclusive(cs)) { 1344 if (update_flag(CS_CPU_EXCLUSIVE, cs, 1)) 1345 return PERR_NOTEXCL; 1346 } else if (!exclusive && is_cpu_exclusive(cs)) { 1347 /* Turning off CS_CPU_EXCLUSIVE will not return error */ 1348 update_flag(CS_CPU_EXCLUSIVE, cs, 0); 1349 } 1350 return 0; 1351 } 1352 1353 /* 1354 * Update partition load balance flag and/or rebuild sched domain 1355 * 1356 * Changing load balance flag will automatically call 1357 * rebuild_sched_domains_locked(). 1358 * This function is for cgroup v2 only. 1359 */ 1360 static void update_partition_sd_lb(struct cpuset *cs, int old_prs) 1361 { 1362 int new_prs = cs->partition_root_state; 1363 bool rebuild_domains = (new_prs > 0) || (old_prs > 0); 1364 bool new_lb; 1365 1366 /* 1367 * If cs is not a valid partition root, the load balance state 1368 * will follow its parent. 
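 *
 * For example, switching a valid partition to isolated (PRS_ISOLATED)
 * turns load balancing off, while dropping back to an ordinary member
 * makes the flag track the parent's CS_SCHED_LOAD_BALANCE setting.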
1369 */ 1370 if (new_prs > 0) { 1371 new_lb = (new_prs != PRS_ISOLATED); 1372 } else { 1373 new_lb = is_sched_load_balance(parent_cs(cs)); 1374 } 1375 if (new_lb != !!is_sched_load_balance(cs)) { 1376 rebuild_domains = true; 1377 if (new_lb) 1378 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1379 else 1380 clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1381 } 1382 1383 if (rebuild_domains) 1384 rebuild_sched_domains_locked(); 1385 } 1386 1387 /* 1388 * tasks_nocpu_error - Return true if tasks will have no effective_cpus 1389 */ 1390 static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs, 1391 struct cpumask *xcpus) 1392 { 1393 /* 1394 * A populated partition (cs or parent) can't have empty effective_cpus 1395 */ 1396 return (cpumask_subset(parent->effective_cpus, xcpus) && 1397 partition_is_populated(parent, cs)) || 1398 (!cpumask_intersects(xcpus, cpu_active_mask) && 1399 partition_is_populated(cs, NULL)); 1400 } 1401 1402 static void reset_partition_data(struct cpuset *cs) 1403 { 1404 struct cpuset *parent = parent_cs(cs); 1405 1406 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) 1407 return; 1408 1409 lockdep_assert_held(&callback_lock); 1410 1411 cs->nr_subparts = 0; 1412 if (cpumask_empty(cs->exclusive_cpus)) { 1413 cpumask_clear(cs->effective_xcpus); 1414 if (is_cpu_exclusive(cs)) 1415 clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); 1416 } 1417 if (!cpumask_and(cs->effective_cpus, 1418 parent->effective_cpus, cs->cpus_allowed)) { 1419 cs->use_parent_ecpus = true; 1420 parent->child_ecpus_count++; 1421 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 1422 } 1423 } 1424 1425 /* 1426 * partition_xcpus_newstate - Exclusive CPUs state change 1427 * @old_prs: old partition_root_state 1428 * @new_prs: new partition_root_state 1429 * @xcpus: exclusive CPUs with state change 1430 */ 1431 static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus) 1432 { 1433 WARN_ON_ONCE(old_prs == new_prs); 1434 if (new_prs == PRS_ISOLATED) 1435 cpumask_or(isolated_cpus, isolated_cpus, xcpus); 1436 else 1437 cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); 1438 } 1439 1440 /* 1441 * partition_xcpus_add - Add new exclusive CPUs to partition 1442 * @new_prs: new partition_root_state 1443 * @parent: parent cpuset 1444 * @xcpus: exclusive CPUs to be added 1445 * Return: true if isolated_cpus modified, false otherwise 1446 * 1447 * Remote partition if parent == NULL 1448 */ 1449 static bool partition_xcpus_add(int new_prs, struct cpuset *parent, 1450 struct cpumask *xcpus) 1451 { 1452 bool isolcpus_updated; 1453 1454 WARN_ON_ONCE(new_prs < 0); 1455 lockdep_assert_held(&callback_lock); 1456 if (!parent) 1457 parent = &top_cpuset; 1458 1459 1460 if (parent == &top_cpuset) 1461 cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); 1462 1463 isolcpus_updated = (new_prs != parent->partition_root_state); 1464 if (isolcpus_updated) 1465 partition_xcpus_newstate(parent->partition_root_state, new_prs, 1466 xcpus); 1467 1468 cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); 1469 return isolcpus_updated; 1470 } 1471 1472 /* 1473 * partition_xcpus_del - Remove exclusive CPUs from partition 1474 * @old_prs: old partition_root_state 1475 * @parent: parent cpuset 1476 * @xcpus: exclusive CPUs to be removed 1477 * Return: true if isolated_cpus modified, false otherwise 1478 * 1479 * Remote partition if parent == NULL 1480 */ 1481 static bool partition_xcpus_del(int old_prs, struct cpuset *parent, 1482 struct cpumask *xcpus) 1483 { 1484 bool isolcpus_updated; 
1485 1486 WARN_ON_ONCE(old_prs < 0); 1487 lockdep_assert_held(&callback_lock); 1488 if (!parent) 1489 parent = &top_cpuset; 1490 1491 if (parent == &top_cpuset) 1492 cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); 1493 1494 isolcpus_updated = (old_prs != parent->partition_root_state); 1495 if (isolcpus_updated) 1496 partition_xcpus_newstate(old_prs, parent->partition_root_state, 1497 xcpus); 1498 1499 cpumask_and(xcpus, xcpus, cpu_active_mask); 1500 cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); 1501 return isolcpus_updated; 1502 } 1503 1504 static void update_unbound_workqueue_cpumask(bool isolcpus_updated) 1505 { 1506 int ret; 1507 1508 lockdep_assert_cpus_held(); 1509 1510 if (!isolcpus_updated) 1511 return; 1512 1513 ret = workqueue_unbound_exclude_cpumask(isolated_cpus); 1514 WARN_ON_ONCE(ret < 0); 1515 } 1516 1517 /** 1518 * cpuset_cpu_is_isolated - Check if the given CPU is isolated 1519 * @cpu: the CPU number to be checked 1520 * Return: true if CPU is used in an isolated partition, false otherwise 1521 */ 1522 bool cpuset_cpu_is_isolated(int cpu) 1523 { 1524 return cpumask_test_cpu(cpu, isolated_cpus); 1525 } 1526 EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); 1527 1528 /* 1529 * compute_effective_exclusive_cpumask - compute effective exclusive CPUs 1530 * @cs: cpuset 1531 * @xcpus: effective exclusive CPUs value to be set 1532 * Return: true if xcpus is not empty, false otherwise. 1533 * 1534 * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set), 1535 * it must be a subset of cpus_allowed and parent's effective_xcpus. 1536 */ 1537 static bool compute_effective_exclusive_cpumask(struct cpuset *cs, 1538 struct cpumask *xcpus) 1539 { 1540 struct cpuset *parent = parent_cs(cs); 1541 1542 if (!xcpus) 1543 xcpus = cs->effective_xcpus; 1544 1545 if (!cpumask_empty(cs->exclusive_cpus)) 1546 cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed); 1547 else 1548 cpumask_copy(xcpus, cs->cpus_allowed); 1549 1550 return cpumask_and(xcpus, xcpus, parent->effective_xcpus); 1551 } 1552 1553 static inline bool is_remote_partition(struct cpuset *cs) 1554 { 1555 return !list_empty(&cs->remote_sibling); 1556 } 1557 1558 static inline bool is_local_partition(struct cpuset *cs) 1559 { 1560 return is_partition_valid(cs) && !is_remote_partition(cs); 1561 } 1562 1563 /* 1564 * remote_partition_enable - Enable current cpuset as a remote partition root 1565 * @cs: the cpuset to update 1566 * @new_prs: new partition_root_state 1567 * @tmp: temparary masks 1568 * Return: 1 if successful, 0 if error 1569 * 1570 * Enable the current cpuset to become a remote partition root taking CPUs 1571 * directly from the top cpuset. cpuset_mutex must be held by the caller. 1572 */ 1573 static int remote_partition_enable(struct cpuset *cs, int new_prs, 1574 struct tmpmasks *tmp) 1575 { 1576 bool isolcpus_updated; 1577 1578 /* 1579 * The user must have sysadmin privilege. 1580 */ 1581 if (!capable(CAP_SYS_ADMIN)) 1582 return 0; 1583 1584 /* 1585 * The requested exclusive_cpus must not be allocated to other 1586 * partitions and it can't use up all the root's effective_cpus. 1587 * 1588 * Note that if there is any local partition root above it or 1589 * remote partition root underneath it, its exclusive_cpus must 1590 * have overlapped with subpartitions_cpus. 
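 *
 * An illustrative example (hypothetical masks): with
 * top_cpuset.effective_cpus = 0-15 and cs->exclusive_cpus = 8-11, a
 * successful enable moves 8-11 into subpartitions_cpus, shrinks
 * top_cpuset.effective_cpus to 0-7,12-15 and, for an isolated partition,
 * also adds 8-11 to isolated_cpus.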
 */
	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
	if (cpumask_empty(tmp->new_cpus) ||
	    cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
	    cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
		return 0;

	spin_lock_irq(&callback_lock);
	isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
	list_add(&cs->remote_sibling, &remote_children);
	if (cs->use_parent_ecpus) {
		struct cpuset *parent = parent_cs(cs);

		cs->use_parent_ecpus = false;
		parent->child_ecpus_count--;
	}
	spin_unlock_irq(&callback_lock);
	update_unbound_workqueue_cpumask(isolcpus_updated);

	/*
	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
	 */
	update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
	return 1;
}

/*
 * remote_partition_disable - Remove current cpuset from remote partition list
 * @cs: the cpuset to update
 * @tmp: temporary masks
 *
 * The effective_cpus mask is also updated.
 *
 * cpuset_mutex must be held by the caller.
 */
static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
{
	bool isolcpus_updated;

	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
	WARN_ON_ONCE(!is_remote_partition(cs));
	WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));

	spin_lock_irq(&callback_lock);
	list_del_init(&cs->remote_sibling);
	isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
					       NULL, tmp->new_cpus);
	cs->partition_root_state = -cs->partition_root_state;
	if (!cs->prs_err)
		cs->prs_err = PERR_INVCPUS;
	reset_partition_data(cs);
	spin_unlock_irq(&callback_lock);
	update_unbound_workqueue_cpumask(isolcpus_updated);

	/*
	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
	 */
	update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
}

/*
 * remote_cpus_update - cpus_exclusive change of remote partition
 * @cs: the cpuset to be updated
 * @newmask: the new effective_xcpus mask
 * @tmp: temporary masks
 *
 * top_cpuset and subpartitions_cpus will be updated or the partition can be
 * invalidated.
 */
static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
			       struct tmpmasks *tmp)
{
	bool adding, deleting;
	int prs = cs->partition_root_state;
	int isolcpus_updated = 0;

	if (WARN_ON_ONCE(!is_remote_partition(cs)))
		return;

	WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));

	if (cpumask_empty(newmask))
		goto invalidate;

	adding	 = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
	deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);

	/*
	 * Additions of remote CPUs are only allowed if those CPUs are
	 * not allocated to other partitions and there are effective_cpus
	 * left in the top cpuset.
1684 */ 1685 if (adding && (!capable(CAP_SYS_ADMIN) || 1686 cpumask_intersects(tmp->addmask, subpartitions_cpus) || 1687 cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))) 1688 goto invalidate; 1689 1690 spin_lock_irq(&callback_lock); 1691 if (adding) 1692 isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); 1693 if (deleting) 1694 isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); 1695 spin_unlock_irq(&callback_lock); 1696 update_unbound_workqueue_cpumask(isolcpus_updated); 1697 1698 /* 1699 * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. 1700 */ 1701 update_tasks_cpumask(&top_cpuset, tmp->new_cpus); 1702 update_sibling_cpumasks(&top_cpuset, NULL, tmp); 1703 return; 1704 1705 invalidate: 1706 remote_partition_disable(cs, tmp); 1707 } 1708 1709 /* 1710 * remote_partition_check - check if a child remote partition needs update 1711 * @cs: the cpuset to be updated 1712 * @newmask: the new effective_xcpus mask 1713 * @delmask: temporary mask for deletion (not in tmp) 1714 * @tmp: temparary masks 1715 * 1716 * This should be called before the given cs has updated its cpus_allowed 1717 * and/or effective_xcpus. 1718 */ 1719 static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, 1720 struct cpumask *delmask, struct tmpmasks *tmp) 1721 { 1722 struct cpuset *child, *next; 1723 int disable_cnt = 0; 1724 1725 /* 1726 * Compute the effective exclusive CPUs that will be deleted. 1727 */ 1728 if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) || 1729 !cpumask_intersects(delmask, subpartitions_cpus)) 1730 return; /* No deletion of exclusive CPUs in partitions */ 1731 1732 /* 1733 * Searching the remote children list to look for those that will 1734 * be impacted by the deletion of exclusive CPUs. 1735 * 1736 * Since a cpuset must be removed from the remote children list 1737 * before it can go offline and holding cpuset_mutex will prevent 1738 * any change in cpuset status. RCU read lock isn't needed. 1739 */ 1740 lockdep_assert_held(&cpuset_mutex); 1741 list_for_each_entry_safe(child, next, &remote_children, remote_sibling) 1742 if (cpumask_intersects(child->effective_cpus, delmask)) { 1743 remote_partition_disable(child, tmp); 1744 disable_cnt++; 1745 } 1746 if (disable_cnt) 1747 rebuild_sched_domains_locked(); 1748 } 1749 1750 /* 1751 * prstate_housekeeping_conflict - check for partition & housekeeping conflicts 1752 * @prstate: partition root state to be checked 1753 * @new_cpus: cpu mask 1754 * Return: true if there is conflict, false otherwise 1755 * 1756 * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in 1757 * an isolated partition. 1758 */ 1759 static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) 1760 { 1761 const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN); 1762 bool all_in_hk = cpumask_subset(new_cpus, hk_domain); 1763 1764 if (!all_in_hk && (prstate != PRS_ISOLATED)) 1765 return true; 1766 1767 return false; 1768 } 1769 1770 /** 1771 * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset 1772 * @cs: The cpuset that requests change in partition root state 1773 * @cmd: Partition root state change command 1774 * @newmask: Optional new cpumask for partcmd_update 1775 * @tmp: Temporary addmask and delmask 1776 * Return: 0 or a partition root state error code 1777 * 1778 * For partcmd_enable*, the cpuset is being transformed from a non-partition 1779 * root to a partition root. 
The effective_xcpus (cpus_allowed if 1780 * effective_xcpus not set) mask of the given cpuset will be taken away from 1781 * parent's effective_cpus. The function will return 0 if all the CPUs listed 1782 * in effective_xcpus can be granted or an error code will be returned. 1783 * 1784 * For partcmd_disable, the cpuset is being transformed from a partition 1785 * root back to a non-partition root. Any CPUs in effective_xcpus will be 1786 * given back to parent's effective_cpus. 0 will always be returned. 1787 * 1788 * For partcmd_update, if the optional newmask is specified, the cpu list is 1789 * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is 1790 * assumed to remain the same. The cpuset should either be a valid or invalid 1791 * partition root. The partition root state may change from valid to invalid 1792 * or vice versa. An error code will be returned if transitioning from 1793 * invalid to valid violates the exclusivity rule. 1794 * 1795 * For partcmd_invalidate, the current partition will be made invalid. 1796 * 1797 * The partcmd_enable* and partcmd_disable commands are used by 1798 * update_prstate(). An error code may be returned and the caller will check 1799 * for error. 1800 * 1801 * The partcmd_update command is used by update_cpumasks_hier() with newmask 1802 * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used 1803 * by update_cpumask() with NULL newmask. In both cases, the callers won't 1804 * check for error and so partition_root_state and prs_error will be updated 1805 * directly. 1806 */ 1807 static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, 1808 struct cpumask *newmask, 1809 struct tmpmasks *tmp) 1810 { 1811 struct cpuset *parent = parent_cs(cs); 1812 int adding; /* Adding cpus to parent's effective_cpus */ 1813 int deleting; /* Deleting cpus from parent's effective_cpus */ 1814 int old_prs, new_prs; 1815 int part_error = PERR_NONE; /* Partition error? */ 1816 int subparts_delta = 0; 1817 struct cpumask *xcpus; /* cs effective_xcpus */ 1818 int isolcpus_updated = 0; 1819 bool nocpu; 1820 1821 lockdep_assert_held(&cpuset_mutex); 1822 1823 /* 1824 * new_prs will only be changed for the partcmd_update and 1825 * partcmd_invalidate commands. 1826 */ 1827 adding = deleting = false; 1828 old_prs = new_prs = cs->partition_root_state; 1829 xcpus = !cpumask_empty(cs->exclusive_cpus) 1830 ? cs->effective_xcpus : cs->cpus_allowed; 1831 1832 if (cmd == partcmd_invalidate) { 1833 if (is_prs_invalid(old_prs)) 1834 return 0; 1835 1836 /* 1837 * Make the current partition invalid. 1838 */ 1839 if (is_partition_valid(parent)) 1840 adding = cpumask_and(tmp->addmask, 1841 xcpus, parent->effective_xcpus); 1842 if (old_prs > 0) { 1843 new_prs = -old_prs; 1844 subparts_delta--; 1845 } 1846 goto write_error; 1847 } 1848 1849 /* 1850 * The parent must be a partition root. 1851 * The new cpumask, if present, or the current cpus_allowed must 1852 * not be empty. 1853 */ 1854 if (!is_partition_valid(parent)) { 1855 return is_partition_invalid(parent) 1856 ? PERR_INVPARENT : PERR_NOTPART; 1857 } 1858 if (!newmask && cpumask_empty(cs->cpus_allowed)) 1859 return PERR_CPUSEMPTY; 1860 1861 nocpu = tasks_nocpu_error(parent, cs, xcpus); 1862 1863 if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { 1864 /* 1865 * Enabling partition root is not allowed if its 1866 * effective_xcpus is empty or doesn't overlap with 1867 * parent's effective_xcpus. 
1868 */ 1869 if (cpumask_empty(xcpus) || 1870 !cpumask_intersects(xcpus, parent->effective_xcpus)) 1871 return PERR_INVCPUS; 1872 1873 if (prstate_housekeeping_conflict(new_prs, xcpus)) 1874 return PERR_HKEEPING; 1875 1876 /* 1877 * A parent can be left with no CPU as long as there is no 1878 * task directly associated with the parent partition. 1879 */ 1880 if (nocpu) 1881 return PERR_NOCPUS; 1882 1883 cpumask_copy(tmp->delmask, xcpus); 1884 deleting = true; 1885 subparts_delta++; 1886 new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; 1887 } else if (cmd == partcmd_disable) { 1888 /* 1889 * May need to add cpus to parent's effective_cpus for 1890 * valid partition root. 1891 */ 1892 adding = !is_prs_invalid(old_prs) && 1893 cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); 1894 if (adding) 1895 subparts_delta--; 1896 new_prs = PRS_MEMBER; 1897 } else if (newmask) { 1898 /* 1899 * Empty cpumask is not allowed 1900 */ 1901 if (cpumask_empty(newmask)) { 1902 part_error = PERR_CPUSEMPTY; 1903 goto write_error; 1904 } 1905 1906 /* 1907 * partcmd_update with newmask: 1908 * 1909 * Compute add/delete mask to/from effective_cpus 1910 * 1911 * For valid partition: 1912 * addmask = exclusive_cpus & ~newmask 1913 * & parent->effective_xcpus 1914 * delmask = newmask & ~exclusive_cpus 1915 * & parent->effective_xcpus 1916 * 1917 * For invalid partition: 1918 * delmask = newmask & parent->effective_xcpus 1919 */ 1920 if (is_prs_invalid(old_prs)) { 1921 adding = false; 1922 deleting = cpumask_and(tmp->delmask, 1923 newmask, parent->effective_xcpus); 1924 } else { 1925 cpumask_andnot(tmp->addmask, xcpus, newmask); 1926 adding = cpumask_and(tmp->addmask, tmp->addmask, 1927 parent->effective_xcpus); 1928 1929 cpumask_andnot(tmp->delmask, newmask, xcpus); 1930 deleting = cpumask_and(tmp->delmask, tmp->delmask, 1931 parent->effective_xcpus); 1932 } 1933 /* 1934 * Make partition invalid if parent's effective_cpus could 1935 * become empty and there are tasks in the parent. 1936 */ 1937 if (nocpu && (!adding || 1938 !cpumask_intersects(tmp->addmask, cpu_active_mask))) { 1939 part_error = PERR_NOCPUS; 1940 deleting = false; 1941 adding = cpumask_and(tmp->addmask, 1942 xcpus, parent->effective_xcpus); 1943 } 1944 } else { 1945 /* 1946 * partcmd_update w/o newmask 1947 * 1948 * delmask = effective_xcpus & parent->effective_cpus 1949 * 1950 * This can be called from: 1951 * 1) update_cpumasks_hier() 1952 * 2) cpuset_hotplug_update_tasks() 1953 * 1954 * Check to see if it can be transitioned from valid to 1955 * invalid partition or vice versa. 1956 * 1957 * A partition error happens when parent has tasks and all 1958 * its effective CPUs will have to be distributed out. 1959 */ 1960 WARN_ON_ONCE(!is_partition_valid(parent)); 1961 if (nocpu) { 1962 part_error = PERR_NOCPUS; 1963 if (is_partition_valid(cs)) 1964 adding = cpumask_and(tmp->addmask, 1965 xcpus, parent->effective_xcpus); 1966 } else if (is_partition_invalid(cs) && 1967 cpumask_subset(xcpus, parent->effective_xcpus)) { 1968 struct cgroup_subsys_state *css; 1969 struct cpuset *child; 1970 bool exclusive = true; 1971 1972 /* 1973 * Convert invalid partition to valid has to 1974 * pass the cpu exclusivity test. 
1975 */ 1976 rcu_read_lock(); 1977 cpuset_for_each_child(child, css, parent) { 1978 if (child == cs) 1979 continue; 1980 if (!cpusets_are_exclusive(cs, child)) { 1981 exclusive = false; 1982 break; 1983 } 1984 } 1985 rcu_read_unlock(); 1986 if (exclusive) 1987 deleting = cpumask_and(tmp->delmask, 1988 xcpus, parent->effective_cpus); 1989 else 1990 part_error = PERR_NOTEXCL; 1991 } 1992 } 1993 1994 write_error: 1995 if (part_error) 1996 WRITE_ONCE(cs->prs_err, part_error); 1997 1998 if (cmd == partcmd_update) { 1999 /* 2000 * Check for possible transition between valid and invalid 2001 * partition root. 2002 */ 2003 switch (cs->partition_root_state) { 2004 case PRS_ROOT: 2005 case PRS_ISOLATED: 2006 if (part_error) { 2007 new_prs = -old_prs; 2008 subparts_delta--; 2009 } 2010 break; 2011 case PRS_INVALID_ROOT: 2012 case PRS_INVALID_ISOLATED: 2013 if (!part_error) { 2014 new_prs = -old_prs; 2015 subparts_delta++; 2016 } 2017 break; 2018 } 2019 } 2020 2021 if (!adding && !deleting && (new_prs == old_prs)) 2022 return 0; 2023 2024 /* 2025 * Transitioning from invalid to valid or vice versa may require 2026 * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update, 2027 * validate_change() has already been successfully called and 2028 * CPU lists in cs haven't been updated yet. So defer it to later. 2029 */ 2030 if ((old_prs != new_prs) && (cmd != partcmd_update)) { 2031 int err = update_partition_exclusive(cs, new_prs); 2032 2033 if (err) 2034 return err; 2035 } 2036 2037 /* 2038 * Change the parent's effective_cpus & effective_xcpus (top cpuset 2039 * only). 2040 * 2041 * Newly added CPUs will be removed from effective_cpus and 2042 * newly deleted ones will be added back to effective_cpus. 2043 */ 2044 spin_lock_irq(&callback_lock); 2045 if (old_prs != new_prs) { 2046 cs->partition_root_state = new_prs; 2047 if (new_prs <= 0) 2048 cs->nr_subparts = 0; 2049 } 2050 /* 2051 * Adding to parent's effective_cpus means deleting CPUs from cs 2052 * and vice versa. 2053 */ 2054 if (adding) 2055 isolcpus_updated += partition_xcpus_del(old_prs, parent, 2056 tmp->addmask); 2057 if (deleting) 2058 isolcpus_updated += partition_xcpus_add(new_prs, parent, 2059 tmp->delmask); 2060 2061 if (is_partition_valid(parent)) { 2062 parent->nr_subparts += subparts_delta; 2063 WARN_ON_ONCE(parent->nr_subparts < 0); 2064 } 2065 spin_unlock_irq(&callback_lock); 2066 update_unbound_workqueue_cpumask(isolcpus_updated); 2067 2068 if ((old_prs != new_prs) && (cmd == partcmd_update)) 2069 update_partition_exclusive(cs, new_prs); 2070 2071 if (adding || deleting) { 2072 update_tasks_cpumask(parent, tmp->addmask); 2073 update_sibling_cpumasks(parent, cs, tmp); 2074 } 2075 2076 /* 2077 * For partcmd_update without newmask, it is being called from 2078 * cpuset_handle_hotplug(). Update the load balance flag and 2079 * scheduling domain accordingly. 2080 */ 2081 if ((cmd == partcmd_update) && !newmask) 2082 update_partition_sd_lb(cs, old_prs); 2083 2084 notify_partition_change(cs, old_prs); 2085 return 0; 2086 } 2087 2088 /** 2089 * compute_partition_effective_cpumask - compute effective_cpus for partition 2090 * @cs: partition root cpuset 2091 * @new_ecpus: previously computed effective_cpus to be updated 2092 * 2093 * Compute the effective_cpus of a partition root by scanning effective_xcpus 2094 * of child partition roots and excluding their effective_xcpus. 2095 * 2096 * This has the side effect of invalidating valid child partition roots, 2097 * if necessary.
Since it is called from either cpuset_hotplug_update_tasks() 2098 * or update_cpumasks_hier() where parent and children are modified 2099 * successively, we don't need to call update_parent_effective_cpumask() 2100 * and the child's effective_cpus will be updated in later iterations. 2101 * 2102 * Note that rcu_read_lock() is assumed to be held. 2103 */ 2104 static void compute_partition_effective_cpumask(struct cpuset *cs, 2105 struct cpumask *new_ecpus) 2106 { 2107 struct cgroup_subsys_state *css; 2108 struct cpuset *child; 2109 bool populated = partition_is_populated(cs, NULL); 2110 2111 /* 2112 * Check child partition roots to see if they should be 2113 * invalidated when 2114 * 1) child effective_xcpus is not a subset of the new 2115 * exclusive_cpus, or 2116 * 2) all the effective_cpus will be used up and cp 2117 * has tasks 2118 */ 2119 compute_effective_exclusive_cpumask(cs, new_ecpus); 2120 cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); 2121 2122 rcu_read_lock(); 2123 cpuset_for_each_child(child, css, cs) { 2124 if (!is_partition_valid(child)) 2125 continue; 2126 2127 child->prs_err = 0; 2128 if (!cpumask_subset(child->effective_xcpus, 2129 cs->effective_xcpus)) 2130 child->prs_err = PERR_INVCPUS; 2131 else if (populated && 2132 cpumask_subset(new_ecpus, child->effective_xcpus)) 2133 child->prs_err = PERR_NOCPUS; 2134 2135 if (child->prs_err) { 2136 int old_prs = child->partition_root_state; 2137 2138 /* 2139 * Invalidate child partition 2140 */ 2141 spin_lock_irq(&callback_lock); 2142 make_partition_invalid(child); 2143 cs->nr_subparts--; 2144 child->nr_subparts = 0; 2145 spin_unlock_irq(&callback_lock); 2146 notify_partition_change(child, old_prs); 2147 continue; 2148 } 2149 cpumask_andnot(new_ecpus, new_ecpus, 2150 child->effective_xcpus); 2151 } 2152 rcu_read_unlock(); 2153 } 2154 2155 /* 2156 * update_cpumasks_hier() flags 2157 */ 2158 #define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */ 2159 #define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */ 2160 2161 /* 2162 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree 2163 * @cs: the cpuset to consider 2164 * @tmp: temp variables for calculating effective_cpus & partition setup 2165 * @flags: HIER_* flags (see above) that modify how the update is done 2166 * 2167 * When the configured cpumask is changed, the effective cpumasks of this cpuset 2168 * and all its descendants need to be updated. 2169 * 2170 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed. 2171 * 2172 * Called with cpuset_mutex held 2173 */ 2174 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, 2175 int flags) 2176 { 2177 struct cpuset *cp; 2178 struct cgroup_subsys_state *pos_css; 2179 bool need_rebuild_sched_domains = false; 2180 int old_prs, new_prs; 2181 2182 rcu_read_lock(); 2183 cpuset_for_each_descendant_pre(cp, pos_css, cs) { 2184 struct cpuset *parent = parent_cs(cp); 2185 bool remote = is_remote_partition(cp); 2186 bool update_parent = false; 2187 2188 /* 2189 * Skip a descendant remote partition that acquires CPUs 2190 * directly from the top cpuset unless it is cs. 2191 */ 2192 if (remote && (cp != cs)) { 2193 pos_css = css_rightmost_descendant(pos_css); 2194 continue; 2195 } 2196 2197 /* 2198 * Update effective_xcpus if exclusive_cpus is set. 2199 * The case when exclusive_cpus isn't set is handled later.
2200 */ 2201 if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) { 2202 spin_lock_irq(&callback_lock); 2203 compute_effective_exclusive_cpumask(cp, NULL); 2204 spin_unlock_irq(&callback_lock); 2205 } 2206 2207 old_prs = new_prs = cp->partition_root_state; 2208 if (remote || (is_partition_valid(parent) && 2209 is_partition_valid(cp))) 2210 compute_partition_effective_cpumask(cp, tmp->new_cpus); 2211 else 2212 compute_effective_cpumask(tmp->new_cpus, cp, parent); 2213 2214 /* 2215 * A partition with no effective_cpus is allowed as long as 2216 * there is no task associated with it. Call 2217 * update_parent_effective_cpumask() to check it. 2218 */ 2219 if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) { 2220 update_parent = true; 2221 goto update_parent_effective; 2222 } 2223 2224 /* 2225 * If it becomes empty, inherit the effective mask of the 2226 * parent, which is guaranteed to have some CPUs unless 2227 * it is a partition root that has explicitly distributed 2228 * out all its CPUs. 2229 */ 2230 if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) { 2231 cpumask_copy(tmp->new_cpus, parent->effective_cpus); 2232 if (!cp->use_parent_ecpus) { 2233 cp->use_parent_ecpus = true; 2234 parent->child_ecpus_count++; 2235 } 2236 } else if (cp->use_parent_ecpus) { 2237 cp->use_parent_ecpus = false; 2238 WARN_ON_ONCE(!parent->child_ecpus_count); 2239 parent->child_ecpus_count--; 2240 } 2241 2242 if (remote) 2243 goto get_css; 2244 2245 /* 2246 * Skip the whole subtree if 2247 * 1) the cpumask remains the same, 2248 * 2) has no partition root state, 2249 * 3) HIER_CHECKALL flag not set, and 2250 * 4) for v2 load balance state same as its parent. 2251 */ 2252 if (!cp->partition_root_state && !(flags & HIER_CHECKALL) && 2253 cpumask_equal(tmp->new_cpus, cp->effective_cpus) && 2254 (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || 2255 (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { 2256 pos_css = css_rightmost_descendant(pos_css); 2257 continue; 2258 } 2259 2260 update_parent_effective: 2261 /* 2262 * update_parent_effective_cpumask() should have been called 2263 * for cs already in update_cpumask(). We should also call 2264 * update_tasks_cpumask() again for tasks in the parent 2265 * cpuset if the parent's effective_cpus changes. 2266 */ 2267 if ((cp != cs) && old_prs) { 2268 switch (parent->partition_root_state) { 2269 case PRS_ROOT: 2270 case PRS_ISOLATED: 2271 update_parent = true; 2272 break; 2273 2274 default: 2275 /* 2276 * When parent is not a partition root or is 2277 * invalid, child partition roots become 2278 * invalid too. 2279 */ 2280 if (is_partition_valid(cp)) 2281 new_prs = -cp->partition_root_state; 2282 WRITE_ONCE(cp->prs_err, 2283 is_partition_invalid(parent) 2284 ? PERR_INVPARENT : PERR_NOTPART); 2285 break; 2286 } 2287 } 2288 get_css: 2289 if (!css_tryget_online(&cp->css)) 2290 continue; 2291 rcu_read_unlock(); 2292 2293 if (update_parent) { 2294 update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp); 2295 /* 2296 * The cpuset partition_root_state may become 2297 * invalid. Capture it. 2298 */ 2299 new_prs = cp->partition_root_state; 2300 } 2301 2302 spin_lock_irq(&callback_lock); 2303 cpumask_copy(cp->effective_cpus, tmp->new_cpus); 2304 cp->partition_root_state = new_prs; 2305 /* 2306 * Make sure effective_xcpus is properly set for a valid 2307 * partition root. 
2308 */ 2309 if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) 2310 cpumask_and(cp->effective_xcpus, 2311 cp->cpus_allowed, parent->effective_xcpus); 2312 else if (new_prs < 0) 2313 reset_partition_data(cp); 2314 spin_unlock_irq(&callback_lock); 2315 2316 notify_partition_change(cp, old_prs); 2317 2318 WARN_ON(!is_in_v2_mode() && 2319 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); 2320 2321 update_tasks_cpumask(cp, cp->effective_cpus); 2322 2323 /* 2324 * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE 2325 * from parent if current cpuset isn't a valid partition root 2326 * and their load balance states differ. 2327 */ 2328 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && 2329 !is_partition_valid(cp) && 2330 (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { 2331 if (is_sched_load_balance(parent)) 2332 set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); 2333 else 2334 clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); 2335 } 2336 2337 /* 2338 * On legacy hierarchy, if the effective cpumask of any non- 2339 * empty cpuset is changed, we need to rebuild sched domains. 2340 * On default hierarchy, the cpuset needs to be a partition 2341 * root as well. 2342 */ 2343 if (!cpumask_empty(cp->cpus_allowed) && 2344 is_sched_load_balance(cp) && 2345 (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || 2346 is_partition_valid(cp))) 2347 need_rebuild_sched_domains = true; 2348 2349 rcu_read_lock(); 2350 css_put(&cp->css); 2351 } 2352 rcu_read_unlock(); 2353 2354 if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD)) 2355 rebuild_sched_domains_locked(); 2356 } 2357 2358 /** 2359 * update_sibling_cpumasks - Update siblings' cpumasks 2360 * @parent: Parent cpuset 2361 * @cs: Current cpuset 2362 * @tmp: Temp variables 2363 */ 2364 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, 2365 struct tmpmasks *tmp) 2366 { 2367 struct cpuset *sibling; 2368 struct cgroup_subsys_state *pos_css; 2369 2370 lockdep_assert_held(&cpuset_mutex); 2371 2372 /* 2373 * Check all its siblings and call update_cpumasks_hier() 2374 * if their effective_cpus will need to be changed. 2375 * 2376 * With the addition of effective_xcpus, which is a subset of 2377 * cpus_allowed, it is possible that a change in parent's effective_cpus 2378 * due to a change in a child partition's effective_xcpus will impact 2379 * its siblings even if they do not inherit parent's effective_cpus 2380 * directly. 2381 * 2382 * The update_cpumasks_hier() function may sleep. So we have to 2383 * release the RCU read lock before calling it. The HIER_NO_SD_REBUILD 2384 * flag is used to suppress rebuilding of sched domains as the callers 2385 * will take care of that.
2386 */ 2387 rcu_read_lock(); 2388 cpuset_for_each_child(sibling, pos_css, parent) { 2389 if (sibling == cs) 2390 continue; 2391 if (!sibling->use_parent_ecpus && 2392 !is_partition_valid(sibling)) { 2393 compute_effective_cpumask(tmp->new_cpus, sibling, 2394 parent); 2395 if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) 2396 continue; 2397 } 2398 if (!css_tryget_online(&sibling->css)) 2399 continue; 2400 2401 rcu_read_unlock(); 2402 update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD); 2403 rcu_read_lock(); 2404 css_put(&sibling->css); 2405 } 2406 rcu_read_unlock(); 2407 } 2408 2409 /** 2410 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 2411 * @cs: the cpuset to consider 2412 * @trialcs: trial cpuset 2413 * @buf: buffer of cpu numbers written to this cpuset 2414 */ 2415 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 2416 const char *buf) 2417 { 2418 int retval; 2419 struct tmpmasks tmp; 2420 struct cpuset *parent = parent_cs(cs); 2421 bool invalidate = false; 2422 int hier_flags = 0; 2423 int old_prs = cs->partition_root_state; 2424 2425 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ 2426 if (cs == &top_cpuset) 2427 return -EACCES; 2428 2429 /* 2430 * An empty cpus_allowed is ok only if the cpuset has no tasks. 2431 * Since cpulist_parse() fails on an empty mask, we special case 2432 * that parsing. The validate_change() call ensures that cpusets 2433 * with tasks have cpus. 2434 */ 2435 if (!*buf) { 2436 cpumask_clear(trialcs->cpus_allowed); 2437 cpumask_clear(trialcs->effective_xcpus); 2438 } else { 2439 retval = cpulist_parse(buf, trialcs->cpus_allowed); 2440 if (retval < 0) 2441 return retval; 2442 2443 if (!cpumask_subset(trialcs->cpus_allowed, 2444 top_cpuset.cpus_allowed)) 2445 return -EINVAL; 2446 2447 /* 2448 * When exclusive_cpus isn't explicitly set, it is constrained 2449 * by cpus_allowed and parent's effective_xcpus. Otherwise, 2450 * trialcs->effective_xcpus is used as a temporary cpumask 2451 * for checking validity of the partition root. 2452 */ 2453 if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) 2454 compute_effective_exclusive_cpumask(trialcs, NULL); 2455 } 2456 2457 /* Nothing to do if the cpus didn't change */ 2458 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) 2459 return 0; 2460 2461 if (alloc_cpumasks(NULL, &tmp)) 2462 return -ENOMEM; 2463 2464 if (old_prs) { 2465 if (is_partition_valid(cs) && 2466 cpumask_empty(trialcs->effective_xcpus)) { 2467 invalidate = true; 2468 cs->prs_err = PERR_INVCPUS; 2469 } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { 2470 invalidate = true; 2471 cs->prs_err = PERR_HKEEPING; 2472 } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { 2473 invalidate = true; 2474 cs->prs_err = PERR_NOCPUS; 2475 } 2476 } 2477 2478 /* 2479 * Check all the descendants in update_cpumasks_hier() if 2480 * effective_xcpus is to be changed. 2481 */ 2482 if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) 2483 hier_flags = HIER_CHECKALL; 2484 2485 retval = validate_change(cs, trialcs); 2486 2487 if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { 2488 struct cgroup_subsys_state *css; 2489 struct cpuset *cp; 2490 2491 /* 2492 * The -EINVAL error code indicates that the partition sibling 2493 * CPU exclusivity rule has been violated. We still allow 2494 * the cpumask change to proceed while invalidating the 2495 * partition.
However, any conflicting sibling partitions 2496 * have to be marked as invalid too. 2497 */ 2498 invalidate = true; 2499 rcu_read_lock(); 2500 cpuset_for_each_child(cp, css, parent) { 2501 struct cpumask *xcpus = fetch_xcpus(trialcs); 2502 2503 if (is_partition_valid(cp) && 2504 cpumask_intersects(xcpus, cp->effective_xcpus)) { 2505 rcu_read_unlock(); 2506 update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp); 2507 rcu_read_lock(); 2508 } 2509 } 2510 rcu_read_unlock(); 2511 retval = 0; 2512 } 2513 2514 if (retval < 0) 2515 goto out_free; 2516 2517 if (is_partition_valid(cs) || 2518 (is_partition_invalid(cs) && !invalidate)) { 2519 struct cpumask *xcpus = trialcs->effective_xcpus; 2520 2521 if (cpumask_empty(xcpus) && is_partition_invalid(cs)) 2522 xcpus = trialcs->cpus_allowed; 2523 2524 /* 2525 * Call remote_cpus_update() to handle valid remote partition 2526 */ 2527 if (is_remote_partition(cs)) 2528 remote_cpus_update(cs, xcpus, &tmp); 2529 else if (invalidate) 2530 update_parent_effective_cpumask(cs, partcmd_invalidate, 2531 NULL, &tmp); 2532 else 2533 update_parent_effective_cpumask(cs, partcmd_update, 2534 xcpus, &tmp); 2535 } else if (!cpumask_empty(cs->exclusive_cpus)) { 2536 /* 2537 * Use trialcs->effective_cpus as a temp cpumask 2538 */ 2539 remote_partition_check(cs, trialcs->effective_xcpus, 2540 trialcs->effective_cpus, &tmp); 2541 } 2542 2543 spin_lock_irq(&callback_lock); 2544 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 2545 cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); 2546 if ((old_prs > 0) && !is_partition_valid(cs)) 2547 reset_partition_data(cs); 2548 spin_unlock_irq(&callback_lock); 2549 2550 /* effective_cpus/effective_xcpus will be updated here */ 2551 update_cpumasks_hier(cs, &tmp, hier_flags); 2552 2553 /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ 2554 if (cs->partition_root_state) 2555 update_partition_sd_lb(cs, old_prs); 2556 out_free: 2557 free_cpumasks(NULL, &tmp); 2558 return retval; 2559 } 2560 2561 /** 2562 * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset 2563 * @cs: the cpuset to consider 2564 * @trialcs: trial cpuset 2565 * @buf: buffer of cpu numbers written to this cpuset 2566 * 2567 * The tasks' cpumask will be updated if cs is a valid partition root. 2568 */ 2569 static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, 2570 const char *buf) 2571 { 2572 int retval; 2573 struct tmpmasks tmp; 2574 struct cpuset *parent = parent_cs(cs); 2575 bool invalidate = false; 2576 int hier_flags = 0; 2577 int old_prs = cs->partition_root_state; 2578 2579 if (!*buf) { 2580 cpumask_clear(trialcs->exclusive_cpus); 2581 cpumask_clear(trialcs->effective_xcpus); 2582 } else { 2583 retval = cpulist_parse(buf, trialcs->exclusive_cpus); 2584 if (retval < 0) 2585 return retval; 2586 if (!is_cpu_exclusive(cs)) 2587 set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags); 2588 } 2589 2590 /* Nothing to do if the CPUs didn't change */ 2591 if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) 2592 return 0; 2593 2594 if (*buf) 2595 compute_effective_exclusive_cpumask(trialcs, NULL); 2596 2597 /* 2598 * Check all the descendants in update_cpumasks_hier() if 2599 * effective_xcpus is to be changed. 
2600 */ 2601 if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) 2602 hier_flags = HIER_CHECKALL; 2603 2604 retval = validate_change(cs, trialcs); 2605 if (retval) 2606 return retval; 2607 2608 if (alloc_cpumasks(NULL, &tmp)) 2609 return -ENOMEM; 2610 2611 if (old_prs) { 2612 if (cpumask_empty(trialcs->effective_xcpus)) { 2613 invalidate = true; 2614 cs->prs_err = PERR_INVCPUS; 2615 } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { 2616 invalidate = true; 2617 cs->prs_err = PERR_HKEEPING; 2618 } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { 2619 invalidate = true; 2620 cs->prs_err = PERR_NOCPUS; 2621 } 2622 2623 if (is_remote_partition(cs)) { 2624 if (invalidate) 2625 remote_partition_disable(cs, &tmp); 2626 else 2627 remote_cpus_update(cs, trialcs->effective_xcpus, 2628 &tmp); 2629 } else if (invalidate) { 2630 update_parent_effective_cpumask(cs, partcmd_invalidate, 2631 NULL, &tmp); 2632 } else { 2633 update_parent_effective_cpumask(cs, partcmd_update, 2634 trialcs->effective_xcpus, &tmp); 2635 } 2636 } else if (!cpumask_empty(trialcs->exclusive_cpus)) { 2637 /* 2638 * Use trialcs->effective_cpus as a temp cpumask 2639 */ 2640 remote_partition_check(cs, trialcs->effective_xcpus, 2641 trialcs->effective_cpus, &tmp); 2642 } 2643 spin_lock_irq(&callback_lock); 2644 cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); 2645 cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); 2646 if ((old_prs > 0) && !is_partition_valid(cs)) 2647 reset_partition_data(cs); 2648 spin_unlock_irq(&callback_lock); 2649 2650 /* 2651 * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus 2652 * of the subtree when it is a valid partition root or effective_xcpus 2653 * is updated. 2654 */ 2655 if (is_partition_valid(cs) || hier_flags) 2656 update_cpumasks_hier(cs, &tmp, hier_flags); 2657 2658 /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ 2659 if (cs->partition_root_state) 2660 update_partition_sd_lb(cs, old_prs); 2661 2662 free_cpumasks(NULL, &tmp); 2663 return 0; 2664 } 2665 2666 /* 2667 * Migrate memory region from one set of nodes to another. This is 2668 * performed asynchronously as it can be called from process migration path 2669 * holding locks involved in process management. All mm migrations are 2670 * performed in the queued order and can be waited for by flushing 2671 * cpuset_migrate_mm_wq. 
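 *
 * Typical caller pattern (a sketch based on update_tasks_nodemask()
 * below, with error handling elided):
 *
 *	mm = get_task_mm(task);
 *	if (mm) {
 *		mpol_rebind_mm(mm, &cs->mems_allowed);
 *		if (is_memory_migrate(cs))
 *			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
 *		else
 *			mmput(mm);
 *	}
 *
 * cpuset_migrate_mm() always consumes the mm reference: it drops it
 * right away when the nodemasks are equal or the work item cannot be
 * allocated, and otherwise from the queued worker once
 * do_migrate_pages() has finished.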
2672 */ 2673 2674 struct cpuset_migrate_mm_work { 2675 struct work_struct work; 2676 struct mm_struct *mm; 2677 nodemask_t from; 2678 nodemask_t to; 2679 }; 2680 2681 static void cpuset_migrate_mm_workfn(struct work_struct *work) 2682 { 2683 struct cpuset_migrate_mm_work *mwork = 2684 container_of(work, struct cpuset_migrate_mm_work, work); 2685 2686 /* on a wq worker, no need to worry about %current's mems_allowed */ 2687 do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); 2688 mmput(mwork->mm); 2689 kfree(mwork); 2690 } 2691 2692 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, 2693 const nodemask_t *to) 2694 { 2695 struct cpuset_migrate_mm_work *mwork; 2696 2697 if (nodes_equal(*from, *to)) { 2698 mmput(mm); 2699 return; 2700 } 2701 2702 mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); 2703 if (mwork) { 2704 mwork->mm = mm; 2705 mwork->from = *from; 2706 mwork->to = *to; 2707 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); 2708 queue_work(cpuset_migrate_mm_wq, &mwork->work); 2709 } else { 2710 mmput(mm); 2711 } 2712 } 2713 2714 static void cpuset_post_attach(void) 2715 { 2716 flush_workqueue(cpuset_migrate_mm_wq); 2717 } 2718 2719 /* 2720 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy 2721 * @tsk: the task to change 2722 * @newmems: new nodes that the task will be set 2723 * 2724 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed 2725 * and rebind an eventual tasks' mempolicy. If the task is allocating in 2726 * parallel, it might temporarily see an empty intersection, which results in 2727 * a seqlock check and retry before OOM or allocation failure. 2728 */ 2729 static void cpuset_change_task_nodemask(struct task_struct *tsk, 2730 nodemask_t *newmems) 2731 { 2732 task_lock(tsk); 2733 2734 local_irq_disable(); 2735 write_seqcount_begin(&tsk->mems_allowed_seq); 2736 2737 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 2738 mpol_rebind_task(tsk, newmems); 2739 tsk->mems_allowed = *newmems; 2740 2741 write_seqcount_end(&tsk->mems_allowed_seq); 2742 local_irq_enable(); 2743 2744 task_unlock(tsk); 2745 } 2746 2747 static void *cpuset_being_rebound; 2748 2749 /** 2750 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 2751 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 2752 * 2753 * Iterate through each task of @cs updating its mems_allowed to the 2754 * effective cpuset's. As this function is called with cpuset_mutex held, 2755 * cpuset membership stays stable. 2756 */ 2757 static void update_tasks_nodemask(struct cpuset *cs) 2758 { 2759 static nodemask_t newmems; /* protected by cpuset_mutex */ 2760 struct css_task_iter it; 2761 struct task_struct *task; 2762 2763 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 2764 2765 guarantee_online_mems(cs, &newmems); 2766 2767 /* 2768 * The mpol_rebind_mm() call takes mmap_lock, which we couldn't 2769 * take while holding tasklist_lock. Forks can happen - the 2770 * mpol_dup() cpuset_being_rebound check will catch such forks, 2771 * and rebind their vma mempolicies too. Because we still hold 2772 * the global cpuset_mutex, we know that no other rebind effort 2773 * will be contending for the global variable cpuset_being_rebound. 2774 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 2775 * is idempotent. Also migrate pages in each mm to new nodes. 
2776 */ 2777 css_task_iter_start(&cs->css, 0, &it); 2778 while ((task = css_task_iter_next(&it))) { 2779 struct mm_struct *mm; 2780 bool migrate; 2781 2782 cpuset_change_task_nodemask(task, &newmems); 2783 2784 mm = get_task_mm(task); 2785 if (!mm) 2786 continue; 2787 2788 migrate = is_memory_migrate(cs); 2789 2790 mpol_rebind_mm(mm, &cs->mems_allowed); 2791 if (migrate) 2792 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); 2793 else 2794 mmput(mm); 2795 } 2796 css_task_iter_end(&it); 2797 2798 /* 2799 * All the tasks' nodemasks have been updated, update 2800 * cs->old_mems_allowed. 2801 */ 2802 cs->old_mems_allowed = newmems; 2803 2804 /* We're done rebinding vmas to this cpuset's new mems_allowed. */ 2805 cpuset_being_rebound = NULL; 2806 } 2807 2808 /* 2809 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree 2810 * @cs: the cpuset to consider 2811 * @new_mems: a temp variable for calculating new effective_mems 2812 * 2813 * When the configured nodemask is changed, the effective nodemasks of this 2814 * cpuset and all its descendants need to be updated. 2815 * 2816 * On legacy hierarchy, effective_mems will be the same as mems_allowed. 2817 * 2818 * Called with cpuset_mutex held 2819 */ 2820 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) 2821 { 2822 struct cpuset *cp; 2823 struct cgroup_subsys_state *pos_css; 2824 2825 rcu_read_lock(); 2826 cpuset_for_each_descendant_pre(cp, pos_css, cs) { 2827 struct cpuset *parent = parent_cs(cp); 2828 2829 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); 2830 2831 /* 2832 * If it becomes empty, inherit the effective mask of the 2833 * parent, which is guaranteed to have some MEMs. 2834 */ 2835 if (is_in_v2_mode() && nodes_empty(*new_mems)) 2836 *new_mems = parent->effective_mems; 2837 2838 /* Skip the whole subtree if the nodemask remains the same. */ 2839 if (nodes_equal(*new_mems, cp->effective_mems)) { 2840 pos_css = css_rightmost_descendant(pos_css); 2841 continue; 2842 } 2843 2844 if (!css_tryget_online(&cp->css)) 2845 continue; 2846 rcu_read_unlock(); 2847 2848 spin_lock_irq(&callback_lock); 2849 cp->effective_mems = *new_mems; 2850 spin_unlock_irq(&callback_lock); 2851 2852 WARN_ON(!is_in_v2_mode() && 2853 !nodes_equal(cp->mems_allowed, cp->effective_mems)); 2854 2855 update_tasks_nodemask(cp); 2856 2857 rcu_read_lock(); 2858 css_put(&cp->css); 2859 } 2860 rcu_read_unlock(); 2861 } 2862 2863 /* 2864 * Handle user request to change the 'mems' memory placement 2865 * of a cpuset. Needs to validate the request, update the 2866 * cpuset's mems_allowed, and for each task in the cpuset, 2867 * update mems_allowed and rebind the task's mempolicy and any vma 2868 * mempolicies, and if the cpuset is marked 'memory_migrate', 2869 * migrate the task's pages to the new memory. 2870 * 2871 * Call with cpuset_mutex held. May take callback_lock during call. 2872 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 2873 * lock each such task's mm->mmap_lock, scan its vmas and rebind 2874 * their mempolicies to the cpuset's new mems_allowed. 2875 */ 2876 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 2877 const char *buf) 2878 { 2879 int retval; 2880 2881 /* 2882 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 2883 * it's read-only 2884 */ 2885 if (cs == &top_cpuset) { 2886 retval = -EACCES; 2887 goto done; 2888 } 2889 2890 /* 2891 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
2892 * Since nodelist_parse() fails on an empty mask, we special case 2893 * that parsing. The validate_change() call ensures that cpusets 2894 * with tasks have memory. 2895 */ 2896 if (!*buf) { 2897 nodes_clear(trialcs->mems_allowed); 2898 } else { 2899 retval = nodelist_parse(buf, trialcs->mems_allowed); 2900 if (retval < 0) 2901 goto done; 2902 2903 if (!nodes_subset(trialcs->mems_allowed, 2904 top_cpuset.mems_allowed)) { 2905 retval = -EINVAL; 2906 goto done; 2907 } 2908 } 2909 2910 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { 2911 retval = 0; /* Too easy - nothing to do */ 2912 goto done; 2913 } 2914 retval = validate_change(cs, trialcs); 2915 if (retval < 0) 2916 goto done; 2917 2918 check_insane_mems_config(&trialcs->mems_allowed); 2919 2920 spin_lock_irq(&callback_lock); 2921 cs->mems_allowed = trialcs->mems_allowed; 2922 spin_unlock_irq(&callback_lock); 2923 2924 /* use trialcs->mems_allowed as a temp variable */ 2925 update_nodemasks_hier(cs, &trialcs->mems_allowed); 2926 done: 2927 return retval; 2928 } 2929 2930 bool current_cpuset_is_being_rebound(void) 2931 { 2932 bool ret; 2933 2934 rcu_read_lock(); 2935 ret = task_cs(current) == cpuset_being_rebound; 2936 rcu_read_unlock(); 2937 2938 return ret; 2939 } 2940 2941 static int update_relax_domain_level(struct cpuset *cs, s64 val) 2942 { 2943 #ifdef CONFIG_SMP 2944 if (val < -1 || val > sched_domain_level_max + 1) 2945 return -EINVAL; 2946 #endif 2947 2948 if (val != cs->relax_domain_level) { 2949 cs->relax_domain_level = val; 2950 if (!cpumask_empty(cs->cpus_allowed) && 2951 is_sched_load_balance(cs)) 2952 rebuild_sched_domains_locked(); 2953 } 2954 2955 return 0; 2956 } 2957 2958 /** 2959 * update_tasks_flags - update the spread flags of tasks in the cpuset. 2960 * @cs: the cpuset in which each task's spread flags needs to be changed 2961 * 2962 * Iterate through each task of @cs updating its spread flags. As this 2963 * function is called with cpuset_mutex held, cpuset membership stays 2964 * stable. 2965 */ 2966 static void update_tasks_flags(struct cpuset *cs) 2967 { 2968 struct css_task_iter it; 2969 struct task_struct *task; 2970 2971 css_task_iter_start(&cs->css, 0, &it); 2972 while ((task = css_task_iter_next(&it))) 2973 cpuset_update_task_spread_flags(cs, task); 2974 css_task_iter_end(&it); 2975 } 2976 2977 /* 2978 * update_flag - read a 0 or a 1 in a file and update associated flag 2979 * bit: the bit to update (see cpuset_flagbits_t) 2980 * cs: the cpuset to update 2981 * turning_on: whether the flag is being set or cleared 2982 * 2983 * Call with cpuset_mutex held. 
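 *
 * For example, writing "1" to the legacy sched_load_balance file ends
 * up here via cpuset_write_u64() as
 *
 *	update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
 *
 * which, if the load balance state actually changes and cpus_allowed
 * is not empty, also triggers rebuild_sched_domains_locked().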
2984 */ 2985 2986 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 2987 int turning_on) 2988 { 2989 struct cpuset *trialcs; 2990 int balance_flag_changed; 2991 int spread_flag_changed; 2992 int err; 2993 2994 trialcs = alloc_trial_cpuset(cs); 2995 if (!trialcs) 2996 return -ENOMEM; 2997 2998 if (turning_on) 2999 set_bit(bit, &trialcs->flags); 3000 else 3001 clear_bit(bit, &trialcs->flags); 3002 3003 err = validate_change(cs, trialcs); 3004 if (err < 0) 3005 goto out; 3006 3007 balance_flag_changed = (is_sched_load_balance(cs) != 3008 is_sched_load_balance(trialcs)); 3009 3010 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) 3011 || (is_spread_page(cs) != is_spread_page(trialcs))); 3012 3013 spin_lock_irq(&callback_lock); 3014 cs->flags = trialcs->flags; 3015 spin_unlock_irq(&callback_lock); 3016 3017 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 3018 rebuild_sched_domains_locked(); 3019 3020 if (spread_flag_changed) 3021 update_tasks_flags(cs); 3022 out: 3023 free_cpuset(trialcs); 3024 return err; 3025 } 3026 3027 /** 3028 * update_prstate - update partition_root_state 3029 * @cs: the cpuset to update 3030 * @new_prs: new partition root state 3031 * Return: 0 if successful, != 0 if error 3032 * 3033 * Call with cpuset_mutex held. 3034 */ 3035 static int update_prstate(struct cpuset *cs, int new_prs) 3036 { 3037 int err = PERR_NONE, old_prs = cs->partition_root_state; 3038 struct cpuset *parent = parent_cs(cs); 3039 struct tmpmasks tmpmask; 3040 bool new_xcpus_state = false; 3041 3042 if (old_prs == new_prs) 3043 return 0; 3044 3045 /* 3046 * Treat a previously invalid partition root as if it is a "member". 3047 */ 3048 if (new_prs && is_prs_invalid(old_prs)) 3049 old_prs = PRS_MEMBER; 3050 3051 if (alloc_cpumasks(NULL, &tmpmask)) 3052 return -ENOMEM; 3053 3054 /* 3055 * Setup effective_xcpus if not properly set yet, it will be cleared 3056 * later if partition becomes invalid. 3057 */ 3058 if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) { 3059 spin_lock_irq(&callback_lock); 3060 cpumask_and(cs->effective_xcpus, 3061 cs->cpus_allowed, parent->effective_xcpus); 3062 spin_unlock_irq(&callback_lock); 3063 } 3064 3065 err = update_partition_exclusive(cs, new_prs); 3066 if (err) 3067 goto out; 3068 3069 if (!old_prs) { 3070 enum partition_cmd cmd = (new_prs == PRS_ROOT) 3071 ? partcmd_enable : partcmd_enablei; 3072 3073 /* 3074 * cpus_allowed cannot be empty. 3075 */ 3076 if (cpumask_empty(cs->cpus_allowed)) { 3077 err = PERR_CPUSEMPTY; 3078 goto out; 3079 } 3080 3081 err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); 3082 /* 3083 * If an attempt to become local partition root fails, 3084 * try to become a remote partition root instead. 3085 */ 3086 if (err && remote_partition_enable(cs, new_prs, &tmpmask)) 3087 err = 0; 3088 } else if (old_prs && new_prs) { 3089 /* 3090 * A change in load balance state only, no change in cpumasks. 3091 */ 3092 new_xcpus_state = true; 3093 } else { 3094 /* 3095 * Switching back to member is always allowed even if it 3096 * disables child partitions. 3097 */ 3098 if (is_remote_partition(cs)) 3099 remote_partition_disable(cs, &tmpmask); 3100 else 3101 update_parent_effective_cpumask(cs, partcmd_disable, 3102 NULL, &tmpmask); 3103 3104 /* 3105 * Invalidation of child partitions will be done in 3106 * update_cpumasks_hier(). 3107 */ 3108 } 3109 out: 3110 /* 3111 * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error 3112 * happens. 
3113 */ 3114 if (err) { 3115 new_prs = -new_prs; 3116 update_partition_exclusive(cs, new_prs); 3117 } 3118 3119 spin_lock_irq(&callback_lock); 3120 cs->partition_root_state = new_prs; 3121 WRITE_ONCE(cs->prs_err, err); 3122 if (!is_partition_valid(cs)) 3123 reset_partition_data(cs); 3124 else if (new_xcpus_state) 3125 partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus); 3126 spin_unlock_irq(&callback_lock); 3127 update_unbound_workqueue_cpumask(new_xcpus_state); 3128 3129 /* Force update if switching back to member */ 3130 update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); 3131 3132 /* Update sched domains and load balance flag */ 3133 update_partition_sd_lb(cs, old_prs); 3134 3135 notify_partition_change(cs, old_prs); 3136 free_cpumasks(NULL, &tmpmask); 3137 return 0; 3138 } 3139 3140 /* 3141 * Frequency meter - How fast is some event occurring? 3142 * 3143 * These routines manage a digitally filtered, constant time based, 3144 * event frequency meter. There are four routines: 3145 * fmeter_init() - initialize a frequency meter. 3146 * fmeter_markevent() - called each time the event happens. 3147 * fmeter_getrate() - returns the recent rate of such events. 3148 * fmeter_update() - internal routine used to update fmeter. 3149 * 3150 * A common data structure is passed to each of these routines, 3151 * which is used to keep track of the state required to manage the 3152 * frequency meter and its digital filter. 3153 * 3154 * The filter works on the number of events marked per unit time. 3155 * The filter is single-pole low-pass recursive (IIR). The time unit 3156 * is 1 second. Arithmetic is done using 32-bit integers scaled to 3157 * simulate 3 decimal digits of precision (multiplied by 1000). 3158 * 3159 * With an FM_COEF of 933, and a time base of 1 second, the filter 3160 * has a half-life of 10 seconds, meaning that if the events quit 3161 * happening, then the rate returned from the fmeter_getrate() 3162 * will be cut in half each 10 seconds, until it converges to zero. 3163 * 3164 * It is not worth doing a real infinitely recursive filter. If more 3165 * than FM_MAXTICKS ticks have elapsed since the last filter event, 3166 * just compute FM_MAXTICKS ticks worth, by which point the level 3167 * will be stable. 3168 * 3169 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid 3170 * arithmetic overflow in the fmeter_update() routine. 3171 * 3172 * Given the simple 32 bit integer arithmetic used, this meter works 3173 * best for reporting rates between one per millisecond (msec) and 3174 * one per 32 (approx) seconds. At constant rates faster than one 3175 * per msec it maxes out at values just under 1,000,000. At constant 3176 * rates between one per msec, and one per second it will stabilize 3177 * to a value N*1000, where N is the rate of events per second. 3178 * At constant rates between one per second and one per 32 seconds, 3179 * it will be choppy, moving up on the seconds that have an event, 3180 * and then decaying until the next event. At rates slower than 3181 * about one in 32 seconds, it decays all the way back to zero between 3182 * each event. 
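 *
 * Worked example (the numbers follow from the constants defined below):
 * fmeter_update() first decays the value once per elapsed second,
 *
 *	val = (FM_COEF * val) / FM_SCALE;	(i.e. multiply by 933/1000)
 *
 * and then folds in the pending events,
 *
 *	val += ((FM_SCALE - FM_COEF) * cnt) / FM_SCALE;
 *
 * At a steady one event per second, cnt is FM_SCALE (1000) at each
 * one-second update, so val converges to about 1000, matching the
 * N * 1000 behaviour described above, and with no further events it
 * halves roughly every 10 seconds (0.933^10 is about 0.5).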
3183 */ 3184 3185 #define FM_COEF 933 /* coefficient for half-life of 10 secs */ 3186 #define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ 3187 #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ 3188 #define FM_SCALE 1000 /* faux fixed point scale */ 3189 3190 /* Initialize a frequency meter */ 3191 static void fmeter_init(struct fmeter *fmp) 3192 { 3193 fmp->cnt = 0; 3194 fmp->val = 0; 3195 fmp->time = 0; 3196 spin_lock_init(&fmp->lock); 3197 } 3198 3199 /* Internal meter update - process cnt events and update value */ 3200 static void fmeter_update(struct fmeter *fmp) 3201 { 3202 time64_t now; 3203 u32 ticks; 3204 3205 now = ktime_get_seconds(); 3206 ticks = now - fmp->time; 3207 3208 if (ticks == 0) 3209 return; 3210 3211 ticks = min(FM_MAXTICKS, ticks); 3212 while (ticks-- > 0) 3213 fmp->val = (FM_COEF * fmp->val) / FM_SCALE; 3214 fmp->time = now; 3215 3216 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; 3217 fmp->cnt = 0; 3218 } 3219 3220 /* Process any previous ticks, then bump cnt by one (times scale). */ 3221 static void fmeter_markevent(struct fmeter *fmp) 3222 { 3223 spin_lock(&fmp->lock); 3224 fmeter_update(fmp); 3225 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); 3226 spin_unlock(&fmp->lock); 3227 } 3228 3229 /* Process any previous ticks, then return current value. */ 3230 static int fmeter_getrate(struct fmeter *fmp) 3231 { 3232 int val; 3233 3234 spin_lock(&fmp->lock); 3235 fmeter_update(fmp); 3236 val = fmp->val; 3237 spin_unlock(&fmp->lock); 3238 return val; 3239 } 3240 3241 static struct cpuset *cpuset_attach_old_cs; 3242 3243 /* 3244 * Check to see if a cpuset can accept a new task 3245 * For v1, cpus_allowed and mems_allowed can't be empty. 3246 * For v2, effective_cpus can't be empty. 3247 * Note that in v1, effective_cpus = cpus_allowed. 3248 */ 3249 static int cpuset_can_attach_check(struct cpuset *cs) 3250 { 3251 if (cpumask_empty(cs->effective_cpus) || 3252 (!is_in_v2_mode() && nodes_empty(cs->mems_allowed))) 3253 return -ENOSPC; 3254 return 0; 3255 } 3256 3257 static void reset_migrate_dl_data(struct cpuset *cs) 3258 { 3259 cs->nr_migrate_dl_tasks = 0; 3260 cs->sum_migrate_dl_bw = 0; 3261 } 3262 3263 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 3264 static int cpuset_can_attach(struct cgroup_taskset *tset) 3265 { 3266 struct cgroup_subsys_state *css; 3267 struct cpuset *cs, *oldcs; 3268 struct task_struct *task; 3269 bool cpus_updated, mems_updated; 3270 int ret; 3271 3272 /* used later by cpuset_attach() */ 3273 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); 3274 oldcs = cpuset_attach_old_cs; 3275 cs = css_cs(css); 3276 3277 mutex_lock(&cpuset_mutex); 3278 3279 /* Check to see if task is allowed in the cpuset */ 3280 ret = cpuset_can_attach_check(cs); 3281 if (ret) 3282 goto out_unlock; 3283 3284 cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); 3285 mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); 3286 3287 cgroup_taskset_for_each(task, css, tset) { 3288 ret = task_can_attach(task); 3289 if (ret) 3290 goto out_unlock; 3291 3292 /* 3293 * Skip rights over task check in v2 when nothing changes, 3294 * migration permission derives from hierarchy ownership in 3295 * cgroup_procs_write_permission()). 
3296 */ 3297 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || 3298 (cpus_updated || mems_updated)) { 3299 ret = security_task_setscheduler(task); 3300 if (ret) 3301 goto out_unlock; 3302 } 3303 3304 if (dl_task(task)) { 3305 cs->nr_migrate_dl_tasks++; 3306 cs->sum_migrate_dl_bw += task->dl.dl_bw; 3307 } 3308 } 3309 3310 if (!cs->nr_migrate_dl_tasks) 3311 goto out_success; 3312 3313 if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { 3314 int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); 3315 3316 if (unlikely(cpu >= nr_cpu_ids)) { 3317 reset_migrate_dl_data(cs); 3318 ret = -EINVAL; 3319 goto out_unlock; 3320 } 3321 3322 ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); 3323 if (ret) { 3324 reset_migrate_dl_data(cs); 3325 goto out_unlock; 3326 } 3327 } 3328 3329 out_success: 3330 /* 3331 * Mark attach is in progress. This makes validate_change() fail 3332 * changes which zero cpus/mems_allowed. 3333 */ 3334 cs->attach_in_progress++; 3335 out_unlock: 3336 mutex_unlock(&cpuset_mutex); 3337 return ret; 3338 } 3339 3340 static void cpuset_cancel_attach(struct cgroup_taskset *tset) 3341 { 3342 struct cgroup_subsys_state *css; 3343 struct cpuset *cs; 3344 3345 cgroup_taskset_first(tset, &css); 3346 cs = css_cs(css); 3347 3348 mutex_lock(&cpuset_mutex); 3349 cs->attach_in_progress--; 3350 if (!cs->attach_in_progress) 3351 wake_up(&cpuset_attach_wq); 3352 3353 if (cs->nr_migrate_dl_tasks) { 3354 int cpu = cpumask_any(cs->effective_cpus); 3355 3356 dl_bw_free(cpu, cs->sum_migrate_dl_bw); 3357 reset_migrate_dl_data(cs); 3358 } 3359 3360 mutex_unlock(&cpuset_mutex); 3361 } 3362 3363 /* 3364 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task() 3365 * but we can't allocate it dynamically there. Define it global and 3366 * allocate from cpuset_init(). 3367 */ 3368 static cpumask_var_t cpus_attach; 3369 static nodemask_t cpuset_attach_nodemask_to; 3370 3371 static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) 3372 { 3373 lockdep_assert_held(&cpuset_mutex); 3374 3375 if (cs != &top_cpuset) 3376 guarantee_online_cpus(task, cpus_attach); 3377 else 3378 cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), 3379 subpartitions_cpus); 3380 /* 3381 * can_attach beforehand should guarantee that this doesn't 3382 * fail. TODO: have a better way to handle failure here 3383 */ 3384 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); 3385 3386 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); 3387 cpuset_update_task_spread_flags(cs, task); 3388 } 3389 3390 static void cpuset_attach(struct cgroup_taskset *tset) 3391 { 3392 struct task_struct *task; 3393 struct task_struct *leader; 3394 struct cgroup_subsys_state *css; 3395 struct cpuset *cs; 3396 struct cpuset *oldcs = cpuset_attach_old_cs; 3397 bool cpus_updated, mems_updated; 3398 3399 cgroup_taskset_first(tset, &css); 3400 cs = css_cs(css); 3401 3402 lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ 3403 mutex_lock(&cpuset_mutex); 3404 cpus_updated = !cpumask_equal(cs->effective_cpus, 3405 oldcs->effective_cpus); 3406 mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); 3407 3408 /* 3409 * In the default hierarchy, enabling cpuset in the child cgroups 3410 * will trigger a number of cpuset_attach() calls with no change 3411 * in effective cpus and mems. In that case, we can optimize out 3412 * by skipping the task iteration and update. 
3413 */ 3414 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && 3415 !cpus_updated && !mems_updated) { 3416 cpuset_attach_nodemask_to = cs->effective_mems; 3417 goto out; 3418 } 3419 3420 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 3421 3422 cgroup_taskset_for_each(task, css, tset) 3423 cpuset_attach_task(cs, task); 3424 3425 /* 3426 * Change mm for all threadgroup leaders. This is expensive and may 3427 * sleep and should be moved outside migration path proper. Skip it 3428 * if there is no change in effective_mems and CS_MEMORY_MIGRATE is 3429 * not set. 3430 */ 3431 cpuset_attach_nodemask_to = cs->effective_mems; 3432 if (!is_memory_migrate(cs) && !mems_updated) 3433 goto out; 3434 3435 cgroup_taskset_for_each_leader(leader, css, tset) { 3436 struct mm_struct *mm = get_task_mm(leader); 3437 3438 if (mm) { 3439 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 3440 3441 /* 3442 * old_mems_allowed is the same with mems_allowed 3443 * here, except if this task is being moved 3444 * automatically due to hotplug. In that case 3445 * @mems_allowed has been updated and is empty, so 3446 * @old_mems_allowed is the right nodesets that we 3447 * migrate mm from. 3448 */ 3449 if (is_memory_migrate(cs)) 3450 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, 3451 &cpuset_attach_nodemask_to); 3452 else 3453 mmput(mm); 3454 } 3455 } 3456 3457 out: 3458 cs->old_mems_allowed = cpuset_attach_nodemask_to; 3459 3460 if (cs->nr_migrate_dl_tasks) { 3461 cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; 3462 oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; 3463 reset_migrate_dl_data(cs); 3464 } 3465 3466 cs->attach_in_progress--; 3467 if (!cs->attach_in_progress) 3468 wake_up(&cpuset_attach_wq); 3469 3470 mutex_unlock(&cpuset_mutex); 3471 } 3472 3473 /* The various types of files and directories in a cpuset file system */ 3474 3475 typedef enum { 3476 FILE_MEMORY_MIGRATE, 3477 FILE_CPULIST, 3478 FILE_MEMLIST, 3479 FILE_EFFECTIVE_CPULIST, 3480 FILE_EFFECTIVE_MEMLIST, 3481 FILE_SUBPARTS_CPULIST, 3482 FILE_EXCLUSIVE_CPULIST, 3483 FILE_EFFECTIVE_XCPULIST, 3484 FILE_ISOLATED_CPULIST, 3485 FILE_CPU_EXCLUSIVE, 3486 FILE_MEM_EXCLUSIVE, 3487 FILE_MEM_HARDWALL, 3488 FILE_SCHED_LOAD_BALANCE, 3489 FILE_PARTITION_ROOT, 3490 FILE_SCHED_RELAX_DOMAIN_LEVEL, 3491 FILE_MEMORY_PRESSURE_ENABLED, 3492 FILE_MEMORY_PRESSURE, 3493 FILE_SPREAD_PAGE, 3494 FILE_SPREAD_SLAB, 3495 } cpuset_filetype_t; 3496 3497 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, 3498 u64 val) 3499 { 3500 struct cpuset *cs = css_cs(css); 3501 cpuset_filetype_t type = cft->private; 3502 int retval = 0; 3503 3504 cpus_read_lock(); 3505 mutex_lock(&cpuset_mutex); 3506 if (!is_cpuset_online(cs)) { 3507 retval = -ENODEV; 3508 goto out_unlock; 3509 } 3510 3511 switch (type) { 3512 case FILE_CPU_EXCLUSIVE: 3513 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); 3514 break; 3515 case FILE_MEM_EXCLUSIVE: 3516 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); 3517 break; 3518 case FILE_MEM_HARDWALL: 3519 retval = update_flag(CS_MEM_HARDWALL, cs, val); 3520 break; 3521 case FILE_SCHED_LOAD_BALANCE: 3522 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); 3523 break; 3524 case FILE_MEMORY_MIGRATE: 3525 retval = update_flag(CS_MEMORY_MIGRATE, cs, val); 3526 break; 3527 case FILE_MEMORY_PRESSURE_ENABLED: 3528 cpuset_memory_pressure_enabled = !!val; 3529 break; 3530 case FILE_SPREAD_PAGE: 3531 retval = update_flag(CS_SPREAD_PAGE, cs, val); 3532 break; 3533 case FILE_SPREAD_SLAB: 3534 retval = update_flag(CS_SPREAD_SLAB, cs, val); 3535 
break; 3536 default: 3537 retval = -EINVAL; 3538 break; 3539 } 3540 out_unlock: 3541 mutex_unlock(&cpuset_mutex); 3542 cpus_read_unlock(); 3543 return retval; 3544 } 3545 3546 static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, 3547 s64 val) 3548 { 3549 struct cpuset *cs = css_cs(css); 3550 cpuset_filetype_t type = cft->private; 3551 int retval = -ENODEV; 3552 3553 cpus_read_lock(); 3554 mutex_lock(&cpuset_mutex); 3555 if (!is_cpuset_online(cs)) 3556 goto out_unlock; 3557 3558 switch (type) { 3559 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 3560 retval = update_relax_domain_level(cs, val); 3561 break; 3562 default: 3563 retval = -EINVAL; 3564 break; 3565 } 3566 out_unlock: 3567 mutex_unlock(&cpuset_mutex); 3568 cpus_read_unlock(); 3569 return retval; 3570 } 3571 3572 /* 3573 * Common handling for a write to a "cpus" or "mems" file. 3574 */ 3575 static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, 3576 char *buf, size_t nbytes, loff_t off) 3577 { 3578 struct cpuset *cs = css_cs(of_css(of)); 3579 struct cpuset *trialcs; 3580 int retval = -ENODEV; 3581 3582 buf = strstrip(buf); 3583 3584 /* 3585 * CPU or memory hotunplug may leave @cs w/o any execution 3586 * resources, in which case the hotplug code asynchronously updates 3587 * configuration and transfers all tasks to the nearest ancestor 3588 * which can execute. 3589 * 3590 * As writes to "cpus" or "mems" may restore @cs's execution 3591 * resources, wait for the previously scheduled operations before 3592 * proceeding, so that we don't end up keep removing tasks added 3593 * after execution capability is restored. 3594 * 3595 * cpuset_handle_hotplug may call back into cgroup core asynchronously 3596 * via cgroup_transfer_tasks() and waiting for it from a cgroupfs 3597 * operation like this one can lead to a deadlock through kernfs 3598 * active_ref protection. Let's break the protection. Losing the 3599 * protection is okay as we check whether @cs is online after 3600 * grabbing cpuset_mutex anyway. This only happens on the legacy 3601 * hierarchies. 3602 */ 3603 css_get(&cs->css); 3604 kernfs_break_active_protection(of->kn); 3605 3606 cpus_read_lock(); 3607 mutex_lock(&cpuset_mutex); 3608 if (!is_cpuset_online(cs)) 3609 goto out_unlock; 3610 3611 trialcs = alloc_trial_cpuset(cs); 3612 if (!trialcs) { 3613 retval = -ENOMEM; 3614 goto out_unlock; 3615 } 3616 3617 switch (of_cft(of)->private) { 3618 case FILE_CPULIST: 3619 retval = update_cpumask(cs, trialcs, buf); 3620 break; 3621 case FILE_EXCLUSIVE_CPULIST: 3622 retval = update_exclusive_cpumask(cs, trialcs, buf); 3623 break; 3624 case FILE_MEMLIST: 3625 retval = update_nodemask(cs, trialcs, buf); 3626 break; 3627 default: 3628 retval = -EINVAL; 3629 break; 3630 } 3631 3632 free_cpuset(trialcs); 3633 out_unlock: 3634 mutex_unlock(&cpuset_mutex); 3635 cpus_read_unlock(); 3636 kernfs_unbreak_active_protection(of->kn); 3637 css_put(&cs->css); 3638 flush_workqueue(cpuset_migrate_mm_wq); 3639 return retval ?: nbytes; 3640 } 3641 3642 /* 3643 * These ascii lists should be read in a single call, by using a user 3644 * buffer large enough to hold the entire map. If read in smaller 3645 * chunks, there is no guarantee of atomicity. Since the display format 3646 * used, list of ranges of sequential numbers, is variable length, 3647 * and since these maps can change value dynamically, one could read 3648 * gibberish by doing partial reads while a list was changing. 
3649 */ 3650 static int cpuset_common_seq_show(struct seq_file *sf, void *v) 3651 { 3652 struct cpuset *cs = css_cs(seq_css(sf)); 3653 cpuset_filetype_t type = seq_cft(sf)->private; 3654 int ret = 0; 3655 3656 spin_lock_irq(&callback_lock); 3657 3658 switch (type) { 3659 case FILE_CPULIST: 3660 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); 3661 break; 3662 case FILE_MEMLIST: 3663 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); 3664 break; 3665 case FILE_EFFECTIVE_CPULIST: 3666 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus)); 3667 break; 3668 case FILE_EFFECTIVE_MEMLIST: 3669 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); 3670 break; 3671 case FILE_EXCLUSIVE_CPULIST: 3672 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus)); 3673 break; 3674 case FILE_EFFECTIVE_XCPULIST: 3675 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus)); 3676 break; 3677 case FILE_SUBPARTS_CPULIST: 3678 seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus)); 3679 break; 3680 case FILE_ISOLATED_CPULIST: 3681 seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus)); 3682 break; 3683 default: 3684 ret = -EINVAL; 3685 } 3686 3687 spin_unlock_irq(&callback_lock); 3688 return ret; 3689 } 3690 3691 static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) 3692 { 3693 struct cpuset *cs = css_cs(css); 3694 cpuset_filetype_t type = cft->private; 3695 switch (type) { 3696 case FILE_CPU_EXCLUSIVE: 3697 return is_cpu_exclusive(cs); 3698 case FILE_MEM_EXCLUSIVE: 3699 return is_mem_exclusive(cs); 3700 case FILE_MEM_HARDWALL: 3701 return is_mem_hardwall(cs); 3702 case FILE_SCHED_LOAD_BALANCE: 3703 return is_sched_load_balance(cs); 3704 case FILE_MEMORY_MIGRATE: 3705 return is_memory_migrate(cs); 3706 case FILE_MEMORY_PRESSURE_ENABLED: 3707 return cpuset_memory_pressure_enabled; 3708 case FILE_MEMORY_PRESSURE: 3709 return fmeter_getrate(&cs->fmeter); 3710 case FILE_SPREAD_PAGE: 3711 return is_spread_page(cs); 3712 case FILE_SPREAD_SLAB: 3713 return is_spread_slab(cs); 3714 default: 3715 BUG(); 3716 } 3717 3718 /* Unreachable but makes gcc happy */ 3719 return 0; 3720 } 3721 3722 static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) 3723 { 3724 struct cpuset *cs = css_cs(css); 3725 cpuset_filetype_t type = cft->private; 3726 switch (type) { 3727 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 3728 return cs->relax_domain_level; 3729 default: 3730 BUG(); 3731 } 3732 3733 /* Unreachable but makes gcc happy */ 3734 return 0; 3735 } 3736 3737 static int sched_partition_show(struct seq_file *seq, void *v) 3738 { 3739 struct cpuset *cs = css_cs(seq_css(seq)); 3740 const char *err, *type = NULL; 3741 3742 switch (cs->partition_root_state) { 3743 case PRS_ROOT: 3744 seq_puts(seq, "root\n"); 3745 break; 3746 case PRS_ISOLATED: 3747 seq_puts(seq, "isolated\n"); 3748 break; 3749 case PRS_MEMBER: 3750 seq_puts(seq, "member\n"); 3751 break; 3752 case PRS_INVALID_ROOT: 3753 type = "root"; 3754 fallthrough; 3755 case PRS_INVALID_ISOLATED: 3756 if (!type) 3757 type = "isolated"; 3758 err = perr_strings[READ_ONCE(cs->prs_err)]; 3759 if (err) 3760 seq_printf(seq, "%s invalid (%s)\n", type, err); 3761 else 3762 seq_printf(seq, "%s invalid\n", type); 3763 break; 3764 } 3765 return 0; 3766 } 3767 3768 static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, 3769 size_t nbytes, loff_t off) 3770 { 3771 struct cpuset *cs = css_cs(of_css(of)); 3772 int val; 3773 int retval = -ENODEV; 3774 3775 buf = 
strstrip(buf); 3776 3777 if (!strcmp(buf, "root")) 3778 val = PRS_ROOT; 3779 else if (!strcmp(buf, "member")) 3780 val = PRS_MEMBER; 3781 else if (!strcmp(buf, "isolated")) 3782 val = PRS_ISOLATED; 3783 else 3784 return -EINVAL; 3785 3786 css_get(&cs->css); 3787 cpus_read_lock(); 3788 mutex_lock(&cpuset_mutex); 3789 if (!is_cpuset_online(cs)) 3790 goto out_unlock; 3791 3792 retval = update_prstate(cs, val); 3793 out_unlock: 3794 mutex_unlock(&cpuset_mutex); 3795 cpus_read_unlock(); 3796 css_put(&cs->css); 3797 return retval ?: nbytes; 3798 } 3799 3800 /* 3801 * for the common functions, 'private' gives the type of file 3802 */ 3803 3804 static struct cftype legacy_files[] = { 3805 { 3806 .name = "cpus", 3807 .seq_show = cpuset_common_seq_show, 3808 .write = cpuset_write_resmask, 3809 .max_write_len = (100U + 6 * NR_CPUS), 3810 .private = FILE_CPULIST, 3811 }, 3812 3813 { 3814 .name = "mems", 3815 .seq_show = cpuset_common_seq_show, 3816 .write = cpuset_write_resmask, 3817 .max_write_len = (100U + 6 * MAX_NUMNODES), 3818 .private = FILE_MEMLIST, 3819 }, 3820 3821 { 3822 .name = "effective_cpus", 3823 .seq_show = cpuset_common_seq_show, 3824 .private = FILE_EFFECTIVE_CPULIST, 3825 }, 3826 3827 { 3828 .name = "effective_mems", 3829 .seq_show = cpuset_common_seq_show, 3830 .private = FILE_EFFECTIVE_MEMLIST, 3831 }, 3832 3833 { 3834 .name = "cpu_exclusive", 3835 .read_u64 = cpuset_read_u64, 3836 .write_u64 = cpuset_write_u64, 3837 .private = FILE_CPU_EXCLUSIVE, 3838 }, 3839 3840 { 3841 .name = "mem_exclusive", 3842 .read_u64 = cpuset_read_u64, 3843 .write_u64 = cpuset_write_u64, 3844 .private = FILE_MEM_EXCLUSIVE, 3845 }, 3846 3847 { 3848 .name = "mem_hardwall", 3849 .read_u64 = cpuset_read_u64, 3850 .write_u64 = cpuset_write_u64, 3851 .private = FILE_MEM_HARDWALL, 3852 }, 3853 3854 { 3855 .name = "sched_load_balance", 3856 .read_u64 = cpuset_read_u64, 3857 .write_u64 = cpuset_write_u64, 3858 .private = FILE_SCHED_LOAD_BALANCE, 3859 }, 3860 3861 { 3862 .name = "sched_relax_domain_level", 3863 .read_s64 = cpuset_read_s64, 3864 .write_s64 = cpuset_write_s64, 3865 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, 3866 }, 3867 3868 { 3869 .name = "memory_migrate", 3870 .read_u64 = cpuset_read_u64, 3871 .write_u64 = cpuset_write_u64, 3872 .private = FILE_MEMORY_MIGRATE, 3873 }, 3874 3875 { 3876 .name = "memory_pressure", 3877 .read_u64 = cpuset_read_u64, 3878 .private = FILE_MEMORY_PRESSURE, 3879 }, 3880 3881 { 3882 .name = "memory_spread_page", 3883 .read_u64 = cpuset_read_u64, 3884 .write_u64 = cpuset_write_u64, 3885 .private = FILE_SPREAD_PAGE, 3886 }, 3887 3888 { 3889 /* obsolete, may be removed in the future */ 3890 .name = "memory_spread_slab", 3891 .read_u64 = cpuset_read_u64, 3892 .write_u64 = cpuset_write_u64, 3893 .private = FILE_SPREAD_SLAB, 3894 }, 3895 3896 { 3897 .name = "memory_pressure_enabled", 3898 .flags = CFTYPE_ONLY_ON_ROOT, 3899 .read_u64 = cpuset_read_u64, 3900 .write_u64 = cpuset_write_u64, 3901 .private = FILE_MEMORY_PRESSURE_ENABLED, 3902 }, 3903 3904 { } /* terminate */ 3905 }; 3906 3907 /* 3908 * This is currently a minimal set for the default hierarchy. It can be 3909 * expanded later on by migrating more features and control files from v1. 
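 *
 * As a hedged illustration only (the group name "mygrp" is hypothetical),
 * the entries below surface on a cgroup v2 mount as:
 *
 *	/sys/fs/cgroup/mygrp/cpuset.cpus
 *	/sys/fs/cgroup/mygrp/cpuset.mems
 *	/sys/fs/cgroup/mygrp/cpuset.cpus.effective
 *	/sys/fs/cgroup/mygrp/cpuset.mems.effective
 *	/sys/fs/cgroup/mygrp/cpuset.cpus.partition
 *	/sys/fs/cgroup/mygrp/cpuset.cpus.exclusive
 *	/sys/fs/cgroup/mygrp/cpuset.cpus.exclusive.effective
 *
 * The root-only entries (cpuset.cpus.isolated, and cpuset.cpus.subpartitions
 * when cgroup debugging is enabled) are omitted from the list above.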
3910 */ 3911 static struct cftype dfl_files[] = { 3912 { 3913 .name = "cpus", 3914 .seq_show = cpuset_common_seq_show, 3915 .write = cpuset_write_resmask, 3916 .max_write_len = (100U + 6 * NR_CPUS), 3917 .private = FILE_CPULIST, 3918 .flags = CFTYPE_NOT_ON_ROOT, 3919 }, 3920 3921 { 3922 .name = "mems", 3923 .seq_show = cpuset_common_seq_show, 3924 .write = cpuset_write_resmask, 3925 .max_write_len = (100U + 6 * MAX_NUMNODES), 3926 .private = FILE_MEMLIST, 3927 .flags = CFTYPE_NOT_ON_ROOT, 3928 }, 3929 3930 { 3931 .name = "cpus.effective", 3932 .seq_show = cpuset_common_seq_show, 3933 .private = FILE_EFFECTIVE_CPULIST, 3934 }, 3935 3936 { 3937 .name = "mems.effective", 3938 .seq_show = cpuset_common_seq_show, 3939 .private = FILE_EFFECTIVE_MEMLIST, 3940 }, 3941 3942 { 3943 .name = "cpus.partition", 3944 .seq_show = sched_partition_show, 3945 .write = sched_partition_write, 3946 .private = FILE_PARTITION_ROOT, 3947 .flags = CFTYPE_NOT_ON_ROOT, 3948 .file_offset = offsetof(struct cpuset, partition_file), 3949 }, 3950 3951 { 3952 .name = "cpus.exclusive", 3953 .seq_show = cpuset_common_seq_show, 3954 .write = cpuset_write_resmask, 3955 .max_write_len = (100U + 6 * NR_CPUS), 3956 .private = FILE_EXCLUSIVE_CPULIST, 3957 .flags = CFTYPE_NOT_ON_ROOT, 3958 }, 3959 3960 { 3961 .name = "cpus.exclusive.effective", 3962 .seq_show = cpuset_common_seq_show, 3963 .private = FILE_EFFECTIVE_XCPULIST, 3964 .flags = CFTYPE_NOT_ON_ROOT, 3965 }, 3966 3967 { 3968 .name = "cpus.subpartitions", 3969 .seq_show = cpuset_common_seq_show, 3970 .private = FILE_SUBPARTS_CPULIST, 3971 .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, 3972 }, 3973 3974 { 3975 .name = "cpus.isolated", 3976 .seq_show = cpuset_common_seq_show, 3977 .private = FILE_ISOLATED_CPULIST, 3978 .flags = CFTYPE_ONLY_ON_ROOT, 3979 }, 3980 3981 { } /* terminate */ 3982 }; 3983 3984 3985 /** 3986 * cpuset_css_alloc - Allocate a cpuset css 3987 * @parent_css: Parent css of the control group that the new cpuset will be 3988 * part of 3989 * Return: cpuset css on success, -ENOMEM on failure. 3990 * 3991 * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return 3992 * top cpuset css otherwise. 
3993 */ 3994 static struct cgroup_subsys_state * 3995 cpuset_css_alloc(struct cgroup_subsys_state *parent_css) 3996 { 3997 struct cpuset *cs; 3998 3999 if (!parent_css) 4000 return &top_cpuset.css; 4001 4002 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 4003 if (!cs) 4004 return ERR_PTR(-ENOMEM); 4005 4006 if (alloc_cpumasks(cs, NULL)) { 4007 kfree(cs); 4008 return ERR_PTR(-ENOMEM); 4009 } 4010 4011 __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 4012 nodes_clear(cs->mems_allowed); 4013 nodes_clear(cs->effective_mems); 4014 fmeter_init(&cs->fmeter); 4015 cs->relax_domain_level = -1; 4016 INIT_LIST_HEAD(&cs->remote_sibling); 4017 4018 /* Set CS_MEMORY_MIGRATE for default hierarchy */ 4019 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) 4020 __set_bit(CS_MEMORY_MIGRATE, &cs->flags); 4021 4022 return &cs->css; 4023 } 4024 4025 static int cpuset_css_online(struct cgroup_subsys_state *css) 4026 { 4027 struct cpuset *cs = css_cs(css); 4028 struct cpuset *parent = parent_cs(cs); 4029 struct cpuset *tmp_cs; 4030 struct cgroup_subsys_state *pos_css; 4031 4032 if (!parent) 4033 return 0; 4034 4035 cpus_read_lock(); 4036 mutex_lock(&cpuset_mutex); 4037 4038 set_bit(CS_ONLINE, &cs->flags); 4039 if (is_spread_page(parent)) 4040 set_bit(CS_SPREAD_PAGE, &cs->flags); 4041 if (is_spread_slab(parent)) 4042 set_bit(CS_SPREAD_SLAB, &cs->flags); 4043 4044 cpuset_inc(); 4045 4046 spin_lock_irq(&callback_lock); 4047 if (is_in_v2_mode()) { 4048 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 4049 cs->effective_mems = parent->effective_mems; 4050 cs->use_parent_ecpus = true; 4051 parent->child_ecpus_count++; 4052 } 4053 4054 /* 4055 * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated 4056 */ 4057 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && 4058 !is_sched_load_balance(parent)) 4059 clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 4060 4061 spin_unlock_irq(&callback_lock); 4062 4063 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 4064 goto out_unlock; 4065 4066 /* 4067 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is 4068 * set. This flag handling is implemented in cgroup core for 4069 * historical reasons - the flag may be specified during mount. 4070 * 4071 * Currently, if any sibling cpusets have exclusive cpus or mem, we 4072 * refuse to clone the configuration - thereby refusing the task to 4073 * be entered, and as a result refusing the sys_unshare() or 4074 * clone() which initiated it. If this becomes a problem for some 4075 * users who wish to allow that scenario, then this could be 4076 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive 4077 * (and likewise for mems) to the new cgroup. 4078 */ 4079 rcu_read_lock(); 4080 cpuset_for_each_child(tmp_cs, pos_css, parent) { 4081 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { 4082 rcu_read_unlock(); 4083 goto out_unlock; 4084 } 4085 } 4086 rcu_read_unlock(); 4087 4088 spin_lock_irq(&callback_lock); 4089 cs->mems_allowed = parent->mems_allowed; 4090 cs->effective_mems = parent->mems_allowed; 4091 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); 4092 cpumask_copy(cs->effective_cpus, parent->cpus_allowed); 4093 spin_unlock_irq(&callback_lock); 4094 out_unlock: 4095 mutex_unlock(&cpuset_mutex); 4096 cpus_read_unlock(); 4097 return 0; 4098 } 4099 4100 /* 4101 * If the cpuset being removed has its flag 'sched_load_balance' 4102 * enabled, then simulate turning sched_load_balance off, which 4103 * will call rebuild_sched_domains_locked(). 
That is not needed 4104 * in the default hierarchy where only changes in partition 4105 * will cause repartitioning. 4106 * 4107 * If the cpuset has the 'sched.partition' flag enabled, simulate 4108 * turning 'sched.partition" off. 4109 */ 4110 4111 static void cpuset_css_offline(struct cgroup_subsys_state *css) 4112 { 4113 struct cpuset *cs = css_cs(css); 4114 4115 cpus_read_lock(); 4116 mutex_lock(&cpuset_mutex); 4117 4118 if (is_partition_valid(cs)) 4119 update_prstate(cs, 0); 4120 4121 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && 4122 is_sched_load_balance(cs)) 4123 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 4124 4125 if (cs->use_parent_ecpus) { 4126 struct cpuset *parent = parent_cs(cs); 4127 4128 cs->use_parent_ecpus = false; 4129 parent->child_ecpus_count--; 4130 } 4131 4132 cpuset_dec(); 4133 clear_bit(CS_ONLINE, &cs->flags); 4134 4135 mutex_unlock(&cpuset_mutex); 4136 cpus_read_unlock(); 4137 } 4138 4139 static void cpuset_css_free(struct cgroup_subsys_state *css) 4140 { 4141 struct cpuset *cs = css_cs(css); 4142 4143 free_cpuset(cs); 4144 } 4145 4146 static void cpuset_bind(struct cgroup_subsys_state *root_css) 4147 { 4148 mutex_lock(&cpuset_mutex); 4149 spin_lock_irq(&callback_lock); 4150 4151 if (is_in_v2_mode()) { 4152 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); 4153 cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask); 4154 top_cpuset.mems_allowed = node_possible_map; 4155 } else { 4156 cpumask_copy(top_cpuset.cpus_allowed, 4157 top_cpuset.effective_cpus); 4158 top_cpuset.mems_allowed = top_cpuset.effective_mems; 4159 } 4160 4161 spin_unlock_irq(&callback_lock); 4162 mutex_unlock(&cpuset_mutex); 4163 } 4164 4165 /* 4166 * In case the child is cloned into a cpuset different from its parent, 4167 * additional checks are done to see if the move is allowed. 4168 */ 4169 static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) 4170 { 4171 struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); 4172 bool same_cs; 4173 int ret; 4174 4175 rcu_read_lock(); 4176 same_cs = (cs == task_cs(current)); 4177 rcu_read_unlock(); 4178 4179 if (same_cs) 4180 return 0; 4181 4182 lockdep_assert_held(&cgroup_mutex); 4183 mutex_lock(&cpuset_mutex); 4184 4185 /* Check to see if task is allowed in the cpuset */ 4186 ret = cpuset_can_attach_check(cs); 4187 if (ret) 4188 goto out_unlock; 4189 4190 ret = task_can_attach(task); 4191 if (ret) 4192 goto out_unlock; 4193 4194 ret = security_task_setscheduler(task); 4195 if (ret) 4196 goto out_unlock; 4197 4198 /* 4199 * Mark attach is in progress. This makes validate_change() fail 4200 * changes which zero cpus/mems_allowed. 4201 */ 4202 cs->attach_in_progress++; 4203 out_unlock: 4204 mutex_unlock(&cpuset_mutex); 4205 return ret; 4206 } 4207 4208 static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) 4209 { 4210 struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); 4211 bool same_cs; 4212 4213 rcu_read_lock(); 4214 same_cs = (cs == task_cs(current)); 4215 rcu_read_unlock(); 4216 4217 if (same_cs) 4218 return; 4219 4220 mutex_lock(&cpuset_mutex); 4221 cs->attach_in_progress--; 4222 if (!cs->attach_in_progress) 4223 wake_up(&cpuset_attach_wq); 4224 mutex_unlock(&cpuset_mutex); 4225 } 4226 4227 /* 4228 * Make sure the new task conform to the current state of its parent, 4229 * which could have been changed by cpuset just after it inherits the 4230 * state from the parent and before it sits on the cgroup's task list. 
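 *
 * Hedged userspace sketch of the CLONE_INTO_CGROUP path that leads here
 * (error handling trimmed, the cgroup directory name is hypothetical):
 *
 *	int cgfd = open("/sys/fs/cgroup/mygrp", O_RDONLY | O_DIRECTORY);
 *	struct clone_args args = {
 *		.flags		= CLONE_INTO_CGROUP,
 *		.exit_signal	= SIGCHLD,
 *		.cgroup		= (__u64)cgfd,
 *	};
 *	pid_t child = syscall(__NR_clone3, &args, sizeof(args));
 *
 * The child starts life in "mygrp", so cpuset_can_fork()/cpuset_fork()
 * rather than the attach path apply that cpuset's cpus and mems to it.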
4231 */ 4232 static void cpuset_fork(struct task_struct *task) 4233 { 4234 struct cpuset *cs; 4235 bool same_cs; 4236 4237 rcu_read_lock(); 4238 cs = task_cs(task); 4239 same_cs = (cs == task_cs(current)); 4240 rcu_read_unlock(); 4241 4242 if (same_cs) { 4243 if (cs == &top_cpuset) 4244 return; 4245 4246 set_cpus_allowed_ptr(task, current->cpus_ptr); 4247 task->mems_allowed = current->mems_allowed; 4248 return; 4249 } 4250 4251 /* CLONE_INTO_CGROUP */ 4252 mutex_lock(&cpuset_mutex); 4253 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 4254 cpuset_attach_task(cs, task); 4255 4256 cs->attach_in_progress--; 4257 if (!cs->attach_in_progress) 4258 wake_up(&cpuset_attach_wq); 4259 4260 mutex_unlock(&cpuset_mutex); 4261 } 4262 4263 struct cgroup_subsys cpuset_cgrp_subsys = { 4264 .css_alloc = cpuset_css_alloc, 4265 .css_online = cpuset_css_online, 4266 .css_offline = cpuset_css_offline, 4267 .css_free = cpuset_css_free, 4268 .can_attach = cpuset_can_attach, 4269 .cancel_attach = cpuset_cancel_attach, 4270 .attach = cpuset_attach, 4271 .post_attach = cpuset_post_attach, 4272 .bind = cpuset_bind, 4273 .can_fork = cpuset_can_fork, 4274 .cancel_fork = cpuset_cancel_fork, 4275 .fork = cpuset_fork, 4276 .legacy_cftypes = legacy_files, 4277 .dfl_cftypes = dfl_files, 4278 .early_init = true, 4279 .threaded = true, 4280 }; 4281 4282 /** 4283 * cpuset_init - initialize cpusets at system boot 4284 * 4285 * Description: Initialize top_cpuset 4286 **/ 4287 4288 int __init cpuset_init(void) 4289 { 4290 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); 4291 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); 4292 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); 4293 BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); 4294 BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); 4295 BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); 4296 4297 cpumask_setall(top_cpuset.cpus_allowed); 4298 nodes_setall(top_cpuset.mems_allowed); 4299 cpumask_setall(top_cpuset.effective_cpus); 4300 cpumask_setall(top_cpuset.effective_xcpus); 4301 cpumask_setall(top_cpuset.exclusive_cpus); 4302 nodes_setall(top_cpuset.effective_mems); 4303 4304 fmeter_init(&top_cpuset.fmeter); 4305 INIT_LIST_HEAD(&remote_children); 4306 4307 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); 4308 4309 return 0; 4310 } 4311 4312 /* 4313 * If CPU and/or memory hotplug handlers, below, unplug any CPUs 4314 * or memory nodes, we need to walk over the cpuset hierarchy, 4315 * removing that CPU or node from all cpusets. If this removes the 4316 * last CPU or node from a cpuset, then move the tasks in the empty 4317 * cpuset to its next-highest non-empty parent. 4318 */ 4319 static void remove_tasks_in_empty_cpuset(struct cpuset *cs) 4320 { 4321 struct cpuset *parent; 4322 4323 /* 4324 * Find its next-highest non-empty parent, (top cpuset 4325 * has online cpus, so can't be empty). 
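 *
 * Illustrative scenario (hypothetical hierarchy): given top/A/B, if a
 * hotunplug leaves both A and B without cpus or mems, the walk below
 * skips A, which is also empty, and B's tasks are transferred to top.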
4326 */ 4327 parent = parent_cs(cs); 4328 while (cpumask_empty(parent->cpus_allowed) || 4329 nodes_empty(parent->mems_allowed)) 4330 parent = parent_cs(parent); 4331 4332 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 4333 pr_err("cpuset: failed to transfer tasks out of empty cpuset "); 4334 pr_cont_cgroup_name(cs->css.cgroup); 4335 pr_cont("\n"); 4336 } 4337 } 4338 4339 static void cpuset_migrate_tasks_workfn(struct work_struct *work) 4340 { 4341 struct cpuset_remove_tasks_struct *s; 4342 4343 s = container_of(work, struct cpuset_remove_tasks_struct, work); 4344 remove_tasks_in_empty_cpuset(s->cs); 4345 css_put(&s->cs->css); 4346 kfree(s); 4347 } 4348 4349 static void 4350 hotplug_update_tasks_legacy(struct cpuset *cs, 4351 struct cpumask *new_cpus, nodemask_t *new_mems, 4352 bool cpus_updated, bool mems_updated) 4353 { 4354 bool is_empty; 4355 4356 spin_lock_irq(&callback_lock); 4357 cpumask_copy(cs->cpus_allowed, new_cpus); 4358 cpumask_copy(cs->effective_cpus, new_cpus); 4359 cs->mems_allowed = *new_mems; 4360 cs->effective_mems = *new_mems; 4361 spin_unlock_irq(&callback_lock); 4362 4363 /* 4364 * Don't call update_tasks_cpumask() if the cpuset becomes empty, 4365 * as the tasks will be migrated to an ancestor. 4366 */ 4367 if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) 4368 update_tasks_cpumask(cs, new_cpus); 4369 if (mems_updated && !nodes_empty(cs->mems_allowed)) 4370 update_tasks_nodemask(cs); 4371 4372 is_empty = cpumask_empty(cs->cpus_allowed) || 4373 nodes_empty(cs->mems_allowed); 4374 4375 /* 4376 * Move tasks to the nearest ancestor with execution resources, 4377 * This is full cgroup operation which will also call back into 4378 * cpuset. Execute it asynchronously using workqueue. 4379 */ 4380 if (is_empty && cs->css.cgroup->nr_populated_csets && 4381 css_tryget_online(&cs->css)) { 4382 struct cpuset_remove_tasks_struct *s; 4383 4384 s = kzalloc(sizeof(*s), GFP_KERNEL); 4385 if (WARN_ON_ONCE(!s)) { 4386 css_put(&cs->css); 4387 return; 4388 } 4389 4390 s->cs = cs; 4391 INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); 4392 schedule_work(&s->work); 4393 } 4394 } 4395 4396 static void 4397 hotplug_update_tasks(struct cpuset *cs, 4398 struct cpumask *new_cpus, nodemask_t *new_mems, 4399 bool cpus_updated, bool mems_updated) 4400 { 4401 /* A partition root is allowed to have empty effective cpus */ 4402 if (cpumask_empty(new_cpus) && !is_partition_valid(cs)) 4403 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); 4404 if (nodes_empty(*new_mems)) 4405 *new_mems = parent_cs(cs)->effective_mems; 4406 4407 spin_lock_irq(&callback_lock); 4408 cpumask_copy(cs->effective_cpus, new_cpus); 4409 cs->effective_mems = *new_mems; 4410 spin_unlock_irq(&callback_lock); 4411 4412 if (cpus_updated) 4413 update_tasks_cpumask(cs, new_cpus); 4414 if (mems_updated) 4415 update_tasks_nodemask(cs); 4416 } 4417 4418 static bool force_rebuild; 4419 4420 void cpuset_force_rebuild(void) 4421 { 4422 force_rebuild = true; 4423 } 4424 4425 /** 4426 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug 4427 * @cs: cpuset in interest 4428 * @tmp: the tmpmasks structure pointer 4429 * 4430 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone 4431 * offline, update @cs accordingly. If @cs ends up with no CPU or memory, 4432 * all its tasks are moved to the nearest ancestor with both resources. 
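 *
 * Hedged example: if @cs had cpus 2-3 and cpu 3 goes offline, its tasks
 * are left with cpu 2. Should cpu 2 disappear as well, the legacy
 * hierarchy punts the tasks to the nearest non-empty ancestor via the
 * work item above, while the default hierarchy instead lets @cs fall
 * back to its parent's effective cpus.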
4433 */ 4434 static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) 4435 { 4436 static cpumask_t new_cpus; 4437 static nodemask_t new_mems; 4438 bool cpus_updated; 4439 bool mems_updated; 4440 bool remote; 4441 int partcmd = -1; 4442 struct cpuset *parent; 4443 retry: 4444 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); 4445 4446 mutex_lock(&cpuset_mutex); 4447 4448 /* 4449 * We have raced with task attaching. We wait until attaching 4450 * is finished, so we won't attach a task to an empty cpuset. 4451 */ 4452 if (cs->attach_in_progress) { 4453 mutex_unlock(&cpuset_mutex); 4454 goto retry; 4455 } 4456 4457 parent = parent_cs(cs); 4458 compute_effective_cpumask(&new_cpus, cs, parent); 4459 nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); 4460 4461 if (!tmp || !cs->partition_root_state) 4462 goto update_tasks; 4463 4464 /* 4465 * Compute effective_cpus for valid partition root, may invalidate 4466 * child partition roots if necessary. 4467 */ 4468 remote = is_remote_partition(cs); 4469 if (remote || (is_partition_valid(cs) && is_partition_valid(parent))) 4470 compute_partition_effective_cpumask(cs, &new_cpus); 4471 4472 if (remote && cpumask_empty(&new_cpus) && 4473 partition_is_populated(cs, NULL)) { 4474 remote_partition_disable(cs, tmp); 4475 compute_effective_cpumask(&new_cpus, cs, parent); 4476 remote = false; 4477 cpuset_force_rebuild(); 4478 } 4479 4480 /* 4481 * Force the partition to become invalid if either one of 4482 * the following conditions hold: 4483 * 1) empty effective cpus but not valid empty partition. 4484 * 2) parent is invalid or doesn't grant any cpus to child 4485 * partitions. 4486 */ 4487 if (is_local_partition(cs) && (!is_partition_valid(parent) || 4488 tasks_nocpu_error(parent, cs, &new_cpus))) 4489 partcmd = partcmd_invalidate; 4490 /* 4491 * On the other hand, an invalid partition root may be transitioned 4492 * back to a regular one. 4493 */ 4494 else if (is_partition_valid(parent) && is_partition_invalid(cs)) 4495 partcmd = partcmd_update; 4496 4497 if (partcmd >= 0) { 4498 update_parent_effective_cpumask(cs, partcmd, NULL, tmp); 4499 if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { 4500 compute_partition_effective_cpumask(cs, &new_cpus); 4501 cpuset_force_rebuild(); 4502 } 4503 } 4504 4505 update_tasks: 4506 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); 4507 mems_updated = !nodes_equal(new_mems, cs->effective_mems); 4508 if (!cpus_updated && !mems_updated) 4509 goto unlock; /* Hotplug doesn't affect this cpuset */ 4510 4511 if (mems_updated) 4512 check_insane_mems_config(&new_mems); 4513 4514 if (is_in_v2_mode()) 4515 hotplug_update_tasks(cs, &new_cpus, &new_mems, 4516 cpus_updated, mems_updated); 4517 else 4518 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, 4519 cpus_updated, mems_updated); 4520 4521 unlock: 4522 mutex_unlock(&cpuset_mutex); 4523 } 4524 4525 /** 4526 * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset 4527 * 4528 * This function is called after either CPU or memory configuration has 4529 * changed and updates cpuset accordingly. The top_cpuset is always 4530 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in 4531 * order to make cpusets transparent (of no affect) on systems that are 4532 * actively using CPU hotplug but making no active use of cpusets. 4533 * 4534 * Non-root cpusets are only affected by offlining. 
If any CPUs or memory 4535 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on 4536 * all descendants. 4537 * 4538 * Note that CPU offlining during suspend is ignored. We don't modify 4539 * cpusets across suspend/resume cycles at all. 4540 * 4541 * CPU / memory hotplug is handled synchronously. 4542 */ 4543 static void cpuset_handle_hotplug(void) 4544 { 4545 static cpumask_t new_cpus; 4546 static nodemask_t new_mems; 4547 bool cpus_updated, mems_updated; 4548 bool on_dfl = is_in_v2_mode(); 4549 struct tmpmasks tmp, *ptmp = NULL; 4550 4551 if (on_dfl && !alloc_cpumasks(NULL, &tmp)) 4552 ptmp = &tmp; 4553 4554 lockdep_assert_cpus_held(); 4555 mutex_lock(&cpuset_mutex); 4556 4557 /* fetch the available cpus/mems and find out which changed how */ 4558 cpumask_copy(&new_cpus, cpu_active_mask); 4559 new_mems = node_states[N_MEMORY]; 4560 4561 /* 4562 * If subpartitions_cpus is populated, it is likely that the check 4563 * below will produce a false positive on cpus_updated when the cpu 4564 * list isn't changed. It is extra work, but it is better to be safe. 4565 */ 4566 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) || 4567 !cpumask_empty(subpartitions_cpus); 4568 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); 4569 4570 /* 4571 * In the rare case that hotplug removes all the cpus in 4572 * subpartitions_cpus, we assume that cpus are updated. 4573 */ 4574 if (!cpus_updated && top_cpuset.nr_subparts) 4575 cpus_updated = true; 4576 4577 /* For v1, synchronize cpus_allowed to cpu_active_mask */ 4578 if (cpus_updated) { 4579 spin_lock_irq(&callback_lock); 4580 if (!on_dfl) 4581 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); 4582 /* 4583 * Make sure that CPUs allocated to child partitions 4584 * do not show up in effective_cpus. If no CPU is left, 4585 * we clear the subpartitions_cpus & let the child partitions 4586 * fight for the CPUs again.
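 *
 * Worked example with hypothetical masks: new_cpus = 0-7 and
 * subpartitions_cpus = 6-7 leave top_cpuset.effective_cpus = 0-5.
 * Should hotplug shrink new_cpus to 6-7, a subset of
 * subpartitions_cpus, the subpartition state is dropped instead and
 * effective_cpus becomes 6-7 until the child partitions are
 * re-evaluated.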
4587 */ 4588 if (!cpumask_empty(subpartitions_cpus)) { 4589 if (cpumask_subset(&new_cpus, subpartitions_cpus)) { 4590 top_cpuset.nr_subparts = 0; 4591 cpumask_clear(subpartitions_cpus); 4592 } else { 4593 cpumask_andnot(&new_cpus, &new_cpus, 4594 subpartitions_cpus); 4595 } 4596 } 4597 cpumask_copy(top_cpuset.effective_cpus, &new_cpus); 4598 spin_unlock_irq(&callback_lock); 4599 /* we don't mess with cpumasks of tasks in top_cpuset */ 4600 } 4601 4602 /* synchronize mems_allowed to N_MEMORY */ 4603 if (mems_updated) { 4604 spin_lock_irq(&callback_lock); 4605 if (!on_dfl) 4606 top_cpuset.mems_allowed = new_mems; 4607 top_cpuset.effective_mems = new_mems; 4608 spin_unlock_irq(&callback_lock); 4609 update_tasks_nodemask(&top_cpuset); 4610 } 4611 4612 mutex_unlock(&cpuset_mutex); 4613 4614 /* if cpus or mems changed, we need to propagate to descendants */ 4615 if (cpus_updated || mems_updated) { 4616 struct cpuset *cs; 4617 struct cgroup_subsys_state *pos_css; 4618 4619 rcu_read_lock(); 4620 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { 4621 if (cs == &top_cpuset || !css_tryget_online(&cs->css)) 4622 continue; 4623 rcu_read_unlock(); 4624 4625 cpuset_hotplug_update_tasks(cs, ptmp); 4626 4627 rcu_read_lock(); 4628 css_put(&cs->css); 4629 } 4630 rcu_read_unlock(); 4631 } 4632 4633 /* rebuild sched domains if cpus_allowed has changed */ 4634 if (cpus_updated || force_rebuild) { 4635 force_rebuild = false; 4636 rebuild_sched_domains_cpuslocked(); 4637 } 4638 4639 free_cpumasks(NULL, ptmp); 4640 } 4641 4642 void cpuset_update_active_cpus(void) 4643 { 4644 /* 4645 * We're inside cpu hotplug critical region which usually nests 4646 * inside cgroup synchronization. Bounce actual hotplug processing 4647 * to a work item to avoid reverse locking order. 4648 */ 4649 cpuset_handle_hotplug(); 4650 } 4651 4652 /* 4653 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. 4654 * Call this routine anytime after node_states[N_MEMORY] changes. 4655 * See cpuset_update_active_cpus() for CPU hotplug handling. 4656 */ 4657 static int cpuset_track_online_nodes(struct notifier_block *self, 4658 unsigned long action, void *arg) 4659 { 4660 cpuset_handle_hotplug(); 4661 return NOTIFY_OK; 4662 } 4663 4664 /** 4665 * cpuset_init_smp - initialize cpus_allowed 4666 * 4667 * Description: Finish top cpuset after cpu, node maps are initialized 4668 */ 4669 void __init cpuset_init_smp(void) 4670 { 4671 /* 4672 * cpus_allowd/mems_allowed set to v2 values in the initial 4673 * cpuset_bind() call will be reset to v1 values in another 4674 * cpuset_bind() call when v1 cpuset is mounted. 4675 */ 4676 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; 4677 4678 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); 4679 top_cpuset.effective_mems = node_states[N_MEMORY]; 4680 4681 hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); 4682 4683 cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); 4684 BUG_ON(!cpuset_migrate_mm_wq); 4685 } 4686 4687 /** 4688 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 4689 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 4690 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. 4691 * 4692 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset 4693 * attached to the specified @tsk. Guaranteed to return some non-empty 4694 * subset of cpu_online_mask, even if this means going outside the 4695 * tasks cpuset, except when the task is in the top cpuset. 
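 *
 * Hedged caller-side sketch (no new API implied; error handling kept
 * minimal):
 *
 *	cpumask_var_t mask;
 *
 *	if (alloc_cpumask_var(&mask, GFP_KERNEL)) {
 *		cpuset_cpus_allowed(tsk, mask);
 *		pr_debug("allowed: %*pbl\n", cpumask_pr_args(mask));
 *		free_cpumask_var(mask);
 *	}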
4696 **/ 4697 4698 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 4699 { 4700 unsigned long flags; 4701 struct cpuset *cs; 4702 4703 spin_lock_irqsave(&callback_lock, flags); 4704 rcu_read_lock(); 4705 4706 cs = task_cs(tsk); 4707 if (cs != &top_cpuset) 4708 guarantee_online_cpus(tsk, pmask); 4709 /* 4710 * Tasks in the top cpuset won't get updates to their cpumasks 4711 * when a hotplug online/offline event happens. So we include all 4712 * offline cpus in the allowed cpu list. 4713 */ 4714 if ((cs == &top_cpuset) || cpumask_empty(pmask)) { 4715 const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); 4716 4717 /* 4718 * We first exclude cpus allocated to partitions. If there is no 4719 * allowable online cpu left, we fall back to all possible cpus. 4720 */ 4721 cpumask_andnot(pmask, possible_mask, subpartitions_cpus); 4722 if (!cpumask_intersects(pmask, cpu_online_mask)) 4723 cpumask_copy(pmask, possible_mask); 4724 } 4725 4726 rcu_read_unlock(); 4727 spin_unlock_irqrestore(&callback_lock, flags); 4728 } 4729 4730 /** 4731 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. 4732 * @tsk: pointer to task_struct with which the scheduler is struggling 4733 * 4734 * Description: In the case that the scheduler cannot find an allowed cpu in 4735 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy 4736 * mode however, this value is the same as task_cs(tsk)->effective_cpus, 4737 * which will not contain a sane cpumask during cases such as cpu hotplugging. 4738 * This is the absolute last resort for the scheduler and it is only used if 4739 * _every_ other avenue has been traveled. 4740 * 4741 * Returns true if the affinity of @tsk was changed, false otherwise. 4742 **/ 4743 4744 bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) 4745 { 4746 const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); 4747 const struct cpumask *cs_mask; 4748 bool changed = false; 4749 4750 rcu_read_lock(); 4751 cs_mask = task_cs(tsk)->cpus_allowed; 4752 if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { 4753 do_set_cpus_allowed(tsk, cs_mask); 4754 changed = true; 4755 } 4756 rcu_read_unlock(); 4757 4758 /* 4759 * We own tsk->cpus_allowed, nobody can change it under us. 4760 * 4761 * But we used cs && cs->cpus_allowed locklessly and thus can 4762 * race with cgroup_attach_task() or update_cpumask() and get 4763 * the wrong tsk->cpus_allowed. However, both cases imply the 4764 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() 4765 * which takes task_rq_lock(). 4766 * 4767 * If we are called after it dropped the lock we must see all 4768 * changes in task_cs()->cpus_allowed. Otherwise we can temporarily 4769 * set any mask even if it is not right from task_cs() pov; 4770 * the pending set_cpus_allowed_ptr() will fix things. 4771 * 4772 * select_fallback_rq() will fix things up and set cpu_possible_mask 4773 * if required. 4774 */ 4775 return changed; 4776 } 4777 4778 void __init cpuset_init_current_mems_allowed(void) 4779 { 4780 nodes_setall(current->mems_allowed); 4781 } 4782 4783 /** 4784 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. 4785 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. 4786 * 4787 * Description: Returns the nodemask_t mems_allowed of the cpuset 4788 * attached to the specified @tsk. Guaranteed to return some non-empty 4789 * subset of node_states[N_MEMORY], even if this means going outside the 4790 * tasks cpuset.
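 *
 * Hedged caller-side sketch (not a new interface):
 *
 *	nodemask_t allowed = cpuset_mems_allowed(tsk);
 *	int nid = first_node(allowed);
 *
 * Because the result is guaranteed to be non-empty, first_node() yields
 * a valid node id here.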
4791 **/ 4792 4793 nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 4794 { 4795 nodemask_t mask; 4796 unsigned long flags; 4797 4798 spin_lock_irqsave(&callback_lock, flags); 4799 rcu_read_lock(); 4800 guarantee_online_mems(task_cs(tsk), &mask); 4801 rcu_read_unlock(); 4802 spin_unlock_irqrestore(&callback_lock, flags); 4803 4804 return mask; 4805 } 4806 4807 /** 4808 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed 4809 * @nodemask: the nodemask to be checked 4810 * 4811 * Are any of the nodes in the nodemask allowed in current->mems_allowed? 4812 */ 4813 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) 4814 { 4815 return nodes_intersects(*nodemask, current->mems_allowed); 4816 } 4817 4818 /* 4819 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or 4820 * mem_hardwall ancestor to the specified cpuset. Call holding 4821 * callback_lock. If no ancestor is mem_exclusive or mem_hardwall 4822 * (an unusual configuration), then returns the root cpuset. 4823 */ 4824 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) 4825 { 4826 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) 4827 cs = parent_cs(cs); 4828 return cs; 4829 } 4830 4831 /* 4832 * cpuset_node_allowed - Can we allocate on a memory node? 4833 * @node: is this an allowed node? 4834 * @gfp_mask: memory allocation flags 4835 * 4836 * If we're in interrupt, yes, we can always allocate. If @node is set in 4837 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this 4838 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, 4839 * yes. If current has access to memory reserves as an oom victim, yes. 4840 * Otherwise, no. 4841 * 4842 * GFP_USER allocations are marked with the __GFP_HARDWALL bit, 4843 * and do not allow allocations outside the current tasks cpuset 4844 * unless the task has been OOM killed. 4845 * GFP_KERNEL allocations are not so marked, so can escape to the 4846 * nearest enclosing hardwalled ancestor cpuset. 4847 * 4848 * Scanning up parent cpusets requires callback_lock. The 4849 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit 4850 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the 4851 * current tasks mems_allowed came up empty on the first pass over 4852 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the 4853 * cpuset are short of memory, might require taking the callback_lock. 4854 * 4855 * The first call here from mm/page_alloc:get_page_from_freelist() 4856 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, 4857 * so no allocation on a node outside the cpuset is allowed (unless 4858 * in interrupt, of course). 4859 * 4860 * The second pass through get_page_from_freelist() doesn't even call 4861 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() 4862 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set 4863 * in alloc_flags. That logic and the checks below have the combined 4864 * affect that: 4865 * in_interrupt - any node ok (current task context irrelevant) 4866 * GFP_ATOMIC - any node ok 4867 * tsk_is_oom_victim - any node ok 4868 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok 4869 * GFP_USER - only nodes in current tasks mems allowed ok. 4870 */ 4871 bool cpuset_node_allowed(int node, gfp_t gfp_mask) 4872 { 4873 struct cpuset *cs; /* current cpuset ancestors */ 4874 bool allowed; /* is allocation in zone z allowed? 
*/ 4875 unsigned long flags; 4876 4877 if (in_interrupt()) 4878 return true; 4879 if (node_isset(node, current->mems_allowed)) 4880 return true; 4881 /* 4882 * Allow tasks that have access to memory reserves because they have 4883 * been OOM killed to get memory anywhere. 4884 */ 4885 if (unlikely(tsk_is_oom_victim(current))) 4886 return true; 4887 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ 4888 return false; 4889 4890 if (current->flags & PF_EXITING) /* Let dying task have memory */ 4891 return true; 4892 4893 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 4894 spin_lock_irqsave(&callback_lock, flags); 4895 4896 rcu_read_lock(); 4897 cs = nearest_hardwall_ancestor(task_cs(current)); 4898 allowed = node_isset(node, cs->mems_allowed); 4899 rcu_read_unlock(); 4900 4901 spin_unlock_irqrestore(&callback_lock, flags); 4902 return allowed; 4903 } 4904 4905 /** 4906 * cpuset_spread_node() - On which node to begin search for a page 4907 * @rotor: round robin rotor 4908 * 4909 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for 4910 * tasks in a cpuset with is_spread_page or is_spread_slab set), 4911 * and if the memory allocation used cpuset_mem_spread_node() 4912 * to determine on which node to start looking, as it will for 4913 * certain page cache or slab cache pages such as used for file 4914 * system buffers and inode caches, then instead of starting on the 4915 * local node to look for a free page, rather spread the starting 4916 * node around the tasks mems_allowed nodes. 4917 * 4918 * We don't have to worry about the returned node being offline 4919 * because "it can't happen", and even if it did, it would be ok. 4920 * 4921 * The routines calling guarantee_online_mems() are careful to 4922 * only set nodes in task->mems_allowed that are online. So it 4923 * should not be possible for the following code to return an 4924 * offline node. But if it did, that would be ok, as this routine 4925 * is not returning the node where the allocation must be, only 4926 * the node where the search should start. The zonelist passed to 4927 * __alloc_pages() will include all nodes. If the slab allocator 4928 * is passed an offline node, it will fall back to the local node. 4929 * See kmem_cache_alloc_node(). 4930 */ 4931 static int cpuset_spread_node(int *rotor) 4932 { 4933 return *rotor = next_node_in(*rotor, current->mems_allowed); 4934 } 4935 4936 /** 4937 * cpuset_mem_spread_node() - On which node to begin search for a file page 4938 */ 4939 int cpuset_mem_spread_node(void) 4940 { 4941 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) 4942 current->cpuset_mem_spread_rotor = 4943 node_random(¤t->mems_allowed); 4944 4945 return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); 4946 } 4947 4948 /** 4949 * cpuset_slab_spread_node() - On which node to begin search for a slab page 4950 */ 4951 int cpuset_slab_spread_node(void) 4952 { 4953 if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) 4954 current->cpuset_slab_spread_rotor = 4955 node_random(¤t->mems_allowed); 4956 4957 return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); 4958 } 4959 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); 4960 4961 /** 4962 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? 4963 * @tsk1: pointer to task_struct of some task. 4964 * @tsk2: pointer to task_struct of some other task. 4965 * 4966 * Description: Return true if @tsk1's mems_allowed intersects the 4967 * mems_allowed of @tsk2. 
Used by the OOM killer to determine if 4968 * one of the task's memory usage might impact the memory available 4969 * to the other. 4970 **/ 4971 4972 int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, 4973 const struct task_struct *tsk2) 4974 { 4975 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); 4976 } 4977 4978 /** 4979 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed 4980 * 4981 * Description: Prints current's name, cpuset name, and cached copy of its 4982 * mems_allowed to the kernel log. 4983 */ 4984 void cpuset_print_current_mems_allowed(void) 4985 { 4986 struct cgroup *cgrp; 4987 4988 rcu_read_lock(); 4989 4990 cgrp = task_cs(current)->css.cgroup; 4991 pr_cont(",cpuset="); 4992 pr_cont_cgroup_name(cgrp); 4993 pr_cont(",mems_allowed=%*pbl", 4994 nodemask_pr_args(¤t->mems_allowed)); 4995 4996 rcu_read_unlock(); 4997 } 4998 4999 /* 5000 * Collection of memory_pressure is suppressed unless 5001 * this flag is enabled by writing "1" to the special 5002 * cpuset file 'memory_pressure_enabled' in the root cpuset. 5003 */ 5004 5005 int cpuset_memory_pressure_enabled __read_mostly; 5006 5007 /* 5008 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. 5009 * 5010 * Keep a running average of the rate of synchronous (direct) 5011 * page reclaim efforts initiated by tasks in each cpuset. 5012 * 5013 * This represents the rate at which some task in the cpuset 5014 * ran low on memory on all nodes it was allowed to use, and 5015 * had to enter the kernels page reclaim code in an effort to 5016 * create more free memory by tossing clean pages or swapping 5017 * or writing dirty pages. 5018 * 5019 * Display to user space in the per-cpuset read-only file 5020 * "memory_pressure". Value displayed is an integer 5021 * representing the recent rate of entry into the synchronous 5022 * (direct) page reclaim by any task attached to the cpuset. 5023 */ 5024 5025 void __cpuset_memory_pressure_bump(void) 5026 { 5027 rcu_read_lock(); 5028 fmeter_markevent(&task_cs(current)->fmeter); 5029 rcu_read_unlock(); 5030 } 5031 5032 #ifdef CONFIG_PROC_PID_CPUSET 5033 /* 5034 * proc_cpuset_show() 5035 * - Print tasks cpuset path into seq_file. 5036 * - Used for /proc/<pid>/cpuset. 5037 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 5038 * doesn't really matter if tsk->cpuset changes after we read it, 5039 * and we take cpuset_mutex, keeping cpuset_attach() from changing it 5040 * anyway. 5041 */ 5042 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, 5043 struct pid *pid, struct task_struct *tsk) 5044 { 5045 char *buf; 5046 struct cgroup_subsys_state *css; 5047 int retval; 5048 5049 retval = -ENOMEM; 5050 buf = kmalloc(PATH_MAX, GFP_KERNEL); 5051 if (!buf) 5052 goto out; 5053 5054 css = task_get_css(tsk, cpuset_cgrp_id); 5055 retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, 5056 current->nsproxy->cgroup_ns); 5057 css_put(css); 5058 if (retval == -E2BIG) 5059 retval = -ENAMETOOLONG; 5060 if (retval < 0) 5061 goto out_free; 5062 seq_puts(m, buf); 5063 seq_putc(m, '\n'); 5064 retval = 0; 5065 out_free: 5066 kfree(buf); 5067 out: 5068 return retval; 5069 } 5070 #endif /* CONFIG_PROC_PID_CPUSET */ 5071 5072 /* Display task mems_allowed in /proc/<pid>/status file. 
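 *
 * Example of the two lines this contributes to /proc/<pid>/status
 * (values illustrative for a machine with nodes 0-1; the width of the
 * hex mask depends on MAX_NUMNODES):
 *
 *	Mems_allowed:	00000000,00000003
 *	Mems_allowed_list:	0-1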
*/ 5073 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 5074 { 5075 seq_printf(m, "Mems_allowed:\t%*pb\n", 5076 nodemask_pr_args(&task->mems_allowed)); 5077 seq_printf(m, "Mems_allowed_list:\t%*pbl\n", 5078 nodemask_pr_args(&task->mems_allowed)); 5079 } 5080