/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>

DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/*
 * There could be abnormal cpuset configurations for cpu or memory
 * node binding; this key provides a quick, low-cost check for that
 * situation.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);

/* See "Frequency meter" comments, below. */
struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time64_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

/*
 * Invalid partition error code
 */
enum prs_errcode {
	PERR_NONE = 0,
	PERR_INVCPUS,
	PERR_INVPARENT,
	PERR_NOTPART,
	PERR_NOTEXCL,
	PERR_NOCPUS,
	PERR_HOTPLUG,
	PERR_CPUSEMPTY,
};

static const char * const perr_strings[] = {
	[PERR_INVCPUS]   = "Invalid cpu list in cpuset.cpus",
	[PERR_INVPARENT] = "Parent is an invalid partition root",
	[PERR_NOTPART]   = "Parent is not a partition root",
	[PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
	[PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
	[PERR_HOTPLUG]   = "No cpu available due to hotplug",
	[PERR_CPUSEMPTY] = "cpuset.cpus is empty",
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * On default hierarchy:
	 *
	 * The user-configured masks can only be changed by writing to
	 * cpuset.cpus and cpuset.mems, and won't be limited by the
	 * parent masks.
	 *
	 * The effective masks are the real masks that apply to the tasks
	 * in the cpuset.  They may be changed if the configured masks are
	 * changed or hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask,
	 * and if it ends up empty, it will inherit the parent's mask.
	 *
	 *
	 * On legacy hierarchy:
	 *
	 * The user-configured masks are always the same as the effective masks.
	 */

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * CPUs allocated to child sub-partitions (default hierarchy only)
	 * - CPUs granted by the parent = effective_cpus U subparts_cpus
	 * - effective_cpus and subparts_cpus are mutually exclusive.
	 *
	 * effective_cpus contains only onlined CPUs, but subparts_cpus
	 * may have offlined ones.
	 */
	cpumask_var_t subparts_cpus;

	/*
	 * The old Memory Nodes that tasks in this cpuset took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have tasks' nodemask updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset.  Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* number of CPUs in subparts_cpus */
	int nr_subparts_cpus;

	/* partition root state */
	int partition_root_state;

	/*
	 * Default hierarchy only:
	 * use_parent_ecpus - set if using parent's effective_cpus
	 * child_ecpus_count - # of children with use_parent_ecpus set
	 */
	int use_parent_ecpus;
	int child_ecpus_count;

	/* Invalid partition error code, not lock protected */
	enum prs_errcode prs_err;

	/* Handle for cpuset.cpus.partition */
	struct cgroup_file partition_file;
};

/*
 * Partition root states:
 *
 *   0 - member (not a partition root)
 *   1 - partition root
 *   2 - partition root without load balancing (isolated)
 *  -1 - invalid partition root
 *  -2 - invalid isolated partition root
 */
#define PRS_MEMBER		0
#define PRS_ROOT		1
#define PRS_ISOLATED		2
#define PRS_INVALID_ROOT	-1
#define PRS_INVALID_ISOLATED	-2

static inline bool is_prs_invalid(int prs_state)
{
	return prs_state < 0;
}

/*
 * Temporary cpumasks that are passed among the partition-handling
 * functions to avoid memory allocation in inner functions.
 */
struct tmpmasks {
	cpumask_var_t addmask, delmask;	/* For partition root */
	cpumask_var_t new_cpus;		/* For update_cpumasks_hier() */
};

static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct cpuset, css) : NULL;
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return css_cs(task_css(task, cpuset_cgrp_id));
}

static inline struct cpuset *parent_cs(struct cpuset *cs)
{
	return css_cs(cs->css.parent);
}

/* bits in struct cpuset flags field */
typedef enum {
	CS_ONLINE,
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}

static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static inline int is_partition_valid(const struct cpuset *cs)
{
	return cs->partition_root_state > 0;
}

static inline int is_partition_invalid(const struct cpuset *cs)
{
	return cs->partition_root_state < 0;
}
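/*
 * Note: the invalid states are the negations of the corresponding valid
 * ones (PRS_INVALID_ROOT == -PRS_ROOT, PRS_INVALID_ISOLATED == -PRS_ISOLATED),
 * so negating a valid partition_root_state yields its invalid counterpart
 * while preserving whether the root was isolated.  The helpers above and
 * make_partition_invalid() below rely on this sign convention.
 */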
/*
 * Callers should hold callback_lock to modify partition_root_state.
 */
static inline void make_partition_invalid(struct cpuset *cs)
{
	if (is_partition_valid(cs))
		cs->partition_root_state = -cs->partition_root_state;
}

/*
 * Send a notification event whenever partition_root_state changes.
 */
static inline void notify_partition_change(struct cpuset *cs, int old_prs)
{
	if (old_prs == cs->partition_root_state)
		return;
	cgroup_file_notify(&cs->partition_file);

	/* Reset prs_err if not invalid */
	if (is_partition_valid(cs))
		WRITE_ONCE(cs->prs_err, PERR_NONE);
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
		  (1 << CS_MEM_EXCLUSIVE)),
	.partition_root_state = PRS_ROOT,
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
 * iteration and the first node to be visited.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)		\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
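/*
 * Example usage of the iterators above (illustrative sketch only, not
 * compiled in; "parent_cpuset" is a placeholder for whatever cpuset the
 * caller already has a reference to):
 *
 *	struct cpuset *child;
 *	struct cgroup_subsys_state *pos;
 *
 *	rcu_read_lock();
 *	cpuset_for_each_child(child, pos, parent_cpuset)
 *		pr_debug("online child cpuset %p\n", child);
 *	rcu_read_unlock();
 */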
/*
 * There are two global locks guarding cpuset structures - cpuset_rwsem and
 * callback_lock.  We also require taking task_lock() when dereferencing a
 * task's cpuset pointer.  See "The task_lock() exception", at the end of this
 * comment.  The cpuset code uses only cpuset_rwsem write lock.  Other
 * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to
 * prevent change to cpuset structures.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it
 * is the only task able to also acquire callback_lock and be able to
 * modify cpusets.  It can perform various checks on the cpuset structure
 * first, knowing nothing will change.  It can also allocate memory while
 * just holding cpuset_rwsem.  While it is performing these checks, various
 * callback routines can briefly acquire callback_lock to query cpusets.
 * Once it is ready to make the changes, it takes callback_lock, blocking
 * everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * Since the task_struct fields mems_allowed and mempolicy may be changed
 * by another task, we use alloc_lock in the task_struct to protect
 * them.
 *
 * The cpuset_common_file_read() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */

DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);

void cpuset_read_lock(void)
{
	percpu_down_read(&cpuset_rwsem);
}

void cpuset_read_unlock(void)
{
	percpu_up_read(&cpuset_rwsem);
}

static DEFINE_SPINLOCK(callback_lock);

static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

static inline void check_insane_mems_config(nodemask_t *nodes)
{
	if (!cpusets_insane_config() &&
	    movable_only_nodes(nodes)) {
		static_branch_enable(&cpusets_insane_config_key);
		pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
			"Cpuset allocations might fail even with a lot of memory available.\n",
			nodemask_pr_args(nodes));
	}
}

/*
 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
 * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
 * With v2 behavior, "cpus" and "mems" are always what the users have
 * requested and won't be changed by hotplug events.  Only the effective
 * cpus or mems will be affected.
 */
static inline bool is_in_v2_mode(void)
{
	return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}

/**
 * partition_is_populated - check if partition has tasks
 * @cs: partition root to be checked
 * @excluded_child: a child cpuset to be excluded in task checking
 * Return: true if there are tasks, false otherwise
 *
 * It is assumed that @cs is a valid partition root.  @excluded_child should
 * be non-NULL when this cpuset is going to become a partition itself.
 */
static inline bool partition_is_populated(struct cpuset *cs,
					  struct cpuset *excluded_child)
{
	struct cgroup_subsys_state *css;
	struct cpuset *child;

	if (cs->css.cgroup->nr_populated_csets)
		return true;
	if (!excluded_child && !cs->nr_subparts_cpus)
		return cgroup_is_populated(cs->css.cgroup);

	rcu_read_lock();
	cpuset_for_each_child(child, css, cs) {
		if (child == excluded_child)
			continue;
		if (is_partition_valid(child))
			continue;
		if (cgroup_is_populated(child->css.cgroup)) {
			rcu_read_unlock();
			return true;
		}
	}
	rcu_read_unlock();
	return false;
}

/*
 * Return in pmask the portion of a task's cpuset's cpus_allowed that
 * are online and are capable of running the task.  If none are found,
 * walk up the cpuset hierarchy until we find one that does have some
 * appropriate cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
static void guarantee_online_cpus(struct task_struct *tsk,
				  struct cpumask *pmask)
{
	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
	struct cpuset *cs;

	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
		cpumask_copy(pmask, cpu_online_mask);

	rcu_read_lock();
	cs = task_cs(tsk);

	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
		cs = parent_cs(cs);
		if (unlikely(!cs)) {
			/*
			 * The top cpuset doesn't have any online cpu as a
			 * consequence of a race between cpuset_hotplug_work
			 * and cpu hotplug notifier.  But we know the top
			 * cpuset's effective_cpus is on its way to be
			 * identical to cpu_online_mask.
			 */
			goto out_unlock;
		}
	}
	cpumask_and(pmask, pmask, cs->effective_cpus);

out_unlock:
	rcu_read_unlock();
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					   struct task_struct *tsk)
{
	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_rwsem.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_cpumasks - allocate three cpumasks for cpuset
 * @cs: the cpuset that has cpumasks to be allocated.
 * @tmp: the tmpmasks structure pointer
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Only one of the two input arguments should be non-NULL.
 */
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	cpumask_var_t *pmask1, *pmask2, *pmask3;

	if (cs) {
		pmask1 = &cs->cpus_allowed;
		pmask2 = &cs->effective_cpus;
		pmask3 = &cs->subparts_cpus;
	} else {
		pmask1 = &tmp->new_cpus;
		pmask2 = &tmp->addmask;
		pmask3 = &tmp->delmask;
	}

	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
		goto free_one;

	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
		goto free_two;

	return 0;

free_two:
	free_cpumask_var(*pmask2);
free_one:
	free_cpumask_var(*pmask1);
	return -ENOMEM;
}

/**
 * free_cpumasks - free cpumasks in a tmpmasks structure
 * @cs: the cpuset that has cpumasks to be freed.
 * @tmp: the tmpmasks structure pointer
 */
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	if (cs) {
		free_cpumask_var(cs->cpus_allowed);
		free_cpumask_var(cs->effective_cpus);
		free_cpumask_var(cs->subparts_cpus);
	}
	if (tmp) {
		free_cpumask_var(tmp->new_cpus);
		free_cpumask_var(tmp->addmask);
		free_cpumask_var(tmp->delmask);
	}
}
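/*
 * Illustrative sketch (not compiled in): the typical lifecycle of a
 * stack-allocated tmpmasks as used by the cpumask update paths, assuming
 * the caller holds cpuset_rwsem:
 *
 *	struct tmpmasks tmp;
 *
 *	if (alloc_cpumasks(NULL, &tmp))
 *		return -ENOMEM;
 *	... use tmp.new_cpus, tmp.addmask and tmp.delmask here ...
 *	free_cpumasks(NULL, &tmp);
 */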
/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (alloc_cpumasks(trial, NULL)) {
		kfree(trial);
		return NULL;
	}

	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
	return trial;
}

/**
 * free_cpuset - free the cpuset
 * @cs: the cpuset to be freed
 */
static inline void free_cpuset(struct cpuset *cs)
{
	free_cpumasks(cs, NULL);
	kfree(cs);
}

/*
 * validate_change_legacy() - Validate conditions specific to legacy (v1)
 *			      behavior.
 */
static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	par = parent_cs(cur);
	if (par && !is_cpuset_subset(trial, par))
		goto out;

	ret = 0;
out:
	return ret;
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_rwsem held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */
static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret = 0;

	rcu_read_lock();

	if (!is_in_v2_mode())
		ret = validate_change_legacy(cur, trial);
	if (ret)
		goto out;

	/* Remaining checks don't apply to root cpuset */
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks.
	 */
	ret = -EBUSY;
	if (is_cpu_exclusive(cur) &&
	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
				       trial->cpus_allowed))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			goto out;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}

/* Must be called with cpuset_rwsem held.  */
static inline int nr_cpusets(void)
{
	/* jump label reference count + the top-level cpuset */
	return static_key_count(&cpusets_enabled_key.key) + 1;
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to the kernel/sched/core.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cpuset_rwsem held.
 *
 * The three key local variables below are:
 *    cp - cpuset pointer, used (together with pos_css) to perform a
 *	   top-down scan of all cpusets.  For our purposes, rebuilding
 *	   the scheduler's sched domains, we can ignore !is_sched_load_
 *	   balance cpusets.
 *   csa - (for CpuSet Array) Array of pointers to all the cpusets
 *	   that need to be load balanced, for convenient iterative
 *	   access by the subsequent code that finds the best partition,
 *	   i.e. the set of domains (subsets) of CPUs such that the
 *	   cpus_allowed of every cpuset marked is_sched_load_balance
 *	   is a subset of one of these domains, while there are as
 *	   many such domains as possible, each as small as possible.
 *  doms - Conversion of 'csa' to an array of cpumasks, for passing to
 *	   the kernel/sched/core.c routine partition_sched_domains() in a
 *	   convenient format, that can be easily compared to the prior
 *	   value to determine what partition elements (sched domains)
 *	   were changed (added or removed.)
 *
 * Finding the best partition (set of domains):
 *	The triple nested loops below over i, j, k scan over the
 *	load balanced cpusets (using the array of cpuset pointers in
 *	csa[]) looking for pairs of cpusets that have overlapping
 *	cpus_allowed, but which don't have the same 'pn' partition
 *	number, and merges them into the same partition number.  It keeps
 *	looping on the 'restart' label until it can no longer find
 *	any such pairs.
 *
 *	The union of the cpus_allowed masks from the set of
 *	all cpusets having the same 'pn' value then forms one
 *	element of the partition (one sched domain) to be passed to
 *	partition_sched_domains().
 */
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	struct cpuset *cp;	/* top-down scan of cpusets */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;
	bool root_load_balance = is_sched_load_balance(&top_cpuset);

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_and(doms[0], top_cpuset.effective_cpus,
			    housekeeping_cpumask(HK_TYPE_DOMAIN));

		goto done;
	}

	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	rcu_read_lock();
	if (root_load_balance)
		csa[csn++] = &top_cpuset;
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		if (cp == &top_cpuset)
			continue;
		/*
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing.  The former is obvious.  The
		 * latter: All child cpusets contain a subset of the
		 * parent's cpus, so just skip them, and then we call
		 * update_domain_attr_tree() to calc relax_domain_level of
		 * the corresponding sched domain.
		 *
		 * If root is load-balancing, we can skip @cp if it
		 * is a subset of the root's effective_cpus.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
		      cpumask_intersects(cp->cpus_allowed,
					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
			continue;

		if (root_load_balance &&
		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
			continue;

		if (is_sched_load_balance(cp) &&
		    !cpumask_empty(cp->effective_cpus))
			csa[csn++] = cp;

		/* skip @cp's subtree if not a partition root */
		if (!is_partition_valid(cp))
			pos_css = css_rightmost_descendant(pos_css);
	}
	rcu_read_unlock();

	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less element */
				goto restart;
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case.  No need to abort if alloc fails.
	 */
	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
			      GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		struct cpumask *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		dp = doms[nslot];

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
					nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		cpumask_clear(dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpumask_or(dp, dp, b->effective_cpus);
				cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}
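/*
 * Worked example (illustrative only): if the only load-balanced cpusets are
 * two sibling partition roots whose effective_cpus do not overlap, the merge
 * loop above never joins their 'pn' values, so ndoms ends up as 2 and doms[]
 * carries one cpumask per partition.  Had their cpus overlapped, they would
 * have been folded into a single 'pn' and hence a single sched domain.
 */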
static void update_tasks_root_domain(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);

	while ((task = css_task_iter_next(&it)))
		dl_add_task_root_domain(task);

	css_task_iter_end(&it);
}

static void rebuild_root_domains(void)
{
	struct cpuset *cs = NULL;
	struct cgroup_subsys_state *pos_css;

	percpu_rwsem_assert_held(&cpuset_rwsem);
	lockdep_assert_cpus_held();
	lockdep_assert_held(&sched_domains_mutex);

	rcu_read_lock();

	/*
	 * Clear default root domain DL accounting, it will be computed again
	 * if a task belongs to it.
	 */
	dl_clear_root_domain(&def_root_domain);

	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {

		if (cpumask_empty(cs->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		css_get(&cs->css);

		rcu_read_unlock();

		update_tasks_root_domain(cs);

		rcu_read_lock();
		css_put(&cs->css);
	}
	rcu_read_unlock();
}

static void
partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
				    struct sched_domain_attr *dattr_new)
{
	mutex_lock(&sched_domains_mutex);
	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
	rebuild_root_domains();
	mutex_unlock(&sched_domains_mutex);
}

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_rwsem held.  Takes cpus_read_lock().
 */
static void rebuild_sched_domains_locked(void)
{
	struct cgroup_subsys_state *pos_css;
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	struct cpuset *cs;
	int ndoms;

	lockdep_assert_cpus_held();
	percpu_rwsem_assert_held(&cpuset_rwsem);

	/*
	 * If we have raced with CPU hotplug, return early to avoid
	 * passing doms with offlined cpu to partition_sched_domains().
	 * Anyway, cpuset_hotplug_workfn() will rebuild sched domains.
	 *
	 * With no CPUs in any subpartitions, top_cpuset's effective CPUs
	 * should be the same as the active CPUs, so checking only top_cpuset
	 * is enough to detect racing CPU offlines.
	 */
	if (!top_cpuset.nr_subparts_cpus &&
	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
		return;

	/*
	 * With subpartition CPUs, however, the effective CPUs of a partition
	 * root should be only a subset of the active CPUs.  Since a CPU in
	 * any partition root could be offlined, all must be checked.
	 */
	if (top_cpuset.nr_subparts_cpus) {
		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			if (!is_partition_valid(cs)) {
				pos_css = css_rightmost_descendant(pos_css);
				continue;
			}
			if (!cpumask_subset(cs->effective_cpus,
					    cpu_active_mask)) {
				rcu_read_unlock();
				return;
			}
		}
		rcu_read_unlock();
	}

	/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);

	/* Have scheduler rebuild the domains */
	partition_and_rebuild_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

void rebuild_sched_domains(void)
{
	cpus_read_lock();
	percpu_down_write(&cpuset_rwsem);
	rebuild_sched_domains_locked();
	percpu_up_write(&cpuset_rwsem);
	cpus_read_unlock();
}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_rwsem held,
 * cpuset membership stays stable.
 */
static void update_tasks_cpumask(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;
	bool top_cs = cs == &top_cpuset;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it))) {
		/*
		 * Percpu kthreads in top_cpuset are ignored
		 */
		if (top_cs && (task->flags & PF_KTHREAD) &&
		    kthread_is_per_cpu(task))
			continue;
		set_cpus_allowed_ptr(task, cs->effective_cpus);
	}
	css_task_iter_end(&it);
}

/**
 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
 * @new_cpus: the temp variable for the new effective_cpus mask
 * @cs: the cpuset that needs its new effective_cpus mask recomputed
 * @parent: the parent cpuset
 *
 * If the parent has subpartition CPUs, include them in the list of
 * allowable CPUs in computing the new effective_cpus mask.  Since offlined
 * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
 * to mask those out.
 */
static void compute_effective_cpumask(struct cpumask *new_cpus,
				      struct cpuset *cs, struct cpuset *parent)
{
	if (parent->nr_subparts_cpus) {
		cpumask_or(new_cpus, parent->effective_cpus,
			   parent->subparts_cpus);
		cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
		cpumask_and(new_cpus, new_cpus, cpu_active_mask);
	} else {
		cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
	}
}

/*
 * Commands for update_parent_subparts_cpumask
 */
enum subparts_cmd {
	partcmd_enable,		/* Enable partition root */
	partcmd_disable,	/* Disable partition root */
	partcmd_update,		/* Update parent's subparts_cpus */
	partcmd_invalidate,	/* Make partition invalid */
};

static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on);
/**
 * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
 * @cs: The cpuset that requests change in partition root state
 * @cmd: Partition root state change command
 * @newmask: Optional new cpumask for partcmd_update
 * @tmp: Temporary addmask and delmask
 * Return: 0 or a partition root state error code
 *
 * For partcmd_enable, the cpuset is being transformed from a non-partition
 * root to a partition root.  The cpus_allowed mask of the given cpuset will
 * be put into parent's subparts_cpus and taken away from parent's
 * effective_cpus.  The function will return 0 if all the CPUs listed in
 * cpus_allowed can be granted or an error code will be returned.
 *
 * For partcmd_disable, the cpuset is being transformed from a partition
 * root back to a non-partition root.  Any CPUs in cpus_allowed that are in
 * parent's subparts_cpus will be taken away from that cpumask and put back
 * into parent's effective_cpus.  0 will always be returned.
 *
 * For partcmd_update, if the optional newmask is specified, the cpu list is
 * to be changed from cpus_allowed to newmask.  Otherwise, cpus_allowed is
 * assumed to remain the same.  The cpuset should either be a valid or invalid
 * partition root.  The partition root state may change from valid to invalid
 * or vice versa.  An error code will only be returned if transitioning from
 * invalid to valid violates the exclusivity rule.
 *
 * For partcmd_invalidate, the current partition will be made invalid.
 *
 * The partcmd_enable and partcmd_disable commands are used by
 * update_prstate().  An error code may be returned and the caller will check
 * for error.
 *
 * The partcmd_update command is used by update_cpumasks_hier() with newmask
 * NULL and update_cpumask() with newmask set.  The partcmd_invalidate is used
 * by update_cpumask() with NULL newmask.  In both cases, the callers won't
 * check for error and so partition_root_state and prs_error will be updated
 * directly.
 */
static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
					  struct cpumask *newmask,
					  struct tmpmasks *tmp)
{
	struct cpuset *parent = parent_cs(cs);
	int adding;	/* Moving cpus from effective_cpus to subparts_cpus */
	int deleting;	/* Moving cpus from subparts_cpus to effective_cpus */
	int old_prs, new_prs;
	int part_error = PERR_NONE;	/* Partition error? */

	percpu_rwsem_assert_held(&cpuset_rwsem);

	/*
	 * The parent must be a partition root.
	 * The new cpumask, if present, or the current cpus_allowed must
	 * not be empty.
	 */
	if (!is_partition_valid(parent)) {
		return is_partition_invalid(parent)
		       ? PERR_INVPARENT : PERR_NOTPART;
	}
	if ((newmask && cpumask_empty(newmask)) ||
	   (!newmask && cpumask_empty(cs->cpus_allowed)))
		return PERR_CPUSEMPTY;

	/*
	 * new_prs will only be changed for the partcmd_update and
	 * partcmd_invalidate commands.
	 */
	adding = deleting = false;
	old_prs = new_prs = cs->partition_root_state;
	if (cmd == partcmd_enable) {
		/*
		 * Enabling partition root is not allowed if cpus_allowed
		 * doesn't overlap parent's cpus_allowed.
		 */
		if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed))
			return PERR_INVCPUS;

		/*
		 * A parent can be left with no CPU as long as there is no
		 * task directly associated with the parent partition.
		 */
		if (!cpumask_intersects(cs->cpus_allowed, parent->effective_cpus) &&
		    partition_is_populated(parent, cs))
			return PERR_NOCPUS;

		cpumask_copy(tmp->addmask, cs->cpus_allowed);
		adding = true;
	} else if (cmd == partcmd_disable) {
		/*
		 * Need to remove cpus from parent's subparts_cpus for valid
		 * partition root.
		 */
		deleting = !is_prs_invalid(old_prs) &&
			   cpumask_and(tmp->delmask, cs->cpus_allowed,
				       parent->subparts_cpus);
	} else if (cmd == partcmd_invalidate) {
		if (is_prs_invalid(old_prs))
			return 0;

		/*
		 * Make the current partition invalid.  It is assumed that
		 * invalidation is caused by violating the cpu exclusivity rule.
		 */
		deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
				       parent->subparts_cpus);
		if (old_prs > 0) {
			new_prs = -old_prs;
			part_error = PERR_NOTEXCL;
		}
	} else if (newmask) {
		/*
		 * partcmd_update with newmask:
		 *
		 * Compute add/delete mask to/from subparts_cpus
		 *
		 * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
		 * addmask = newmask & parent->cpus_allowed
		 *		     & ~parent->subparts_cpus
		 */
		cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask);
		deleting = cpumask_and(tmp->delmask, tmp->delmask,
				       parent->subparts_cpus);

		cpumask_and(tmp->addmask, newmask, parent->cpus_allowed);
		adding = cpumask_andnot(tmp->addmask, tmp->addmask,
					parent->subparts_cpus);
		/*
		 * Make partition invalid if parent's effective_cpus could
		 * become empty and there are tasks in the parent.
		 */
		if (adding &&
		    cpumask_subset(parent->effective_cpus, tmp->addmask) &&
		    !cpumask_intersects(tmp->delmask, cpu_active_mask) &&
		    partition_is_populated(parent, cs)) {
			part_error = PERR_NOCPUS;
			adding = false;
			deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
					       parent->subparts_cpus);
		}
	} else {
		/*
		 * partcmd_update w/o newmask:
		 *
		 * delmask = cpus_allowed & parent->subparts_cpus
		 * addmask = cpus_allowed & parent->cpus_allowed
		 *			  & ~parent->subparts_cpus
		 *
		 * This gets invoked either due to a hotplug event or from
		 * update_cpumasks_hier().  This can cause the state of a
		 * partition root to transition from valid to invalid or vice
		 * versa.  So we still need to compute the addmask and delmask.
		 *
		 * A partition error happens when:
		 * 1) Cpuset is a valid partition, but parent does not
		 *    distribute out any CPUs.
		 * 2) Parent has tasks and all its effective CPUs will have
		 *    to be distributed out.
		 */
		cpumask_and(tmp->addmask, cs->cpus_allowed,
			    parent->cpus_allowed);
		adding = cpumask_andnot(tmp->addmask, tmp->addmask,
					parent->subparts_cpus);

		if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) ||
		    (adding &&
		     cpumask_subset(parent->effective_cpus, tmp->addmask) &&
		     partition_is_populated(parent, cs))) {
			part_error = PERR_NOCPUS;
			adding = false;
		}

		if (part_error && is_partition_valid(cs) &&
		    parent->nr_subparts_cpus)
			deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
					       parent->subparts_cpus);
	}
	if (part_error)
		WRITE_ONCE(cs->prs_err, part_error);

	if (cmd == partcmd_update) {
		/*
		 * Check for possible transition between valid and invalid
		 * partition root.
		 */
		switch (cs->partition_root_state) {
		case PRS_ROOT:
		case PRS_ISOLATED:
			if (part_error)
				new_prs = -old_prs;
			break;
		case PRS_INVALID_ROOT:
		case PRS_INVALID_ISOLATED:
			if (!part_error)
				new_prs = -old_prs;
			break;
		}
	}

	if (!adding && !deleting && (new_prs == old_prs))
		return 0;

	/*
	 * Transitioning from invalid to valid or vice versa may require
	 * changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE.
	 */
	if (old_prs != new_prs) {
		if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) &&
		    (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0))
			return PERR_NOTEXCL;
		if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs))
			update_flag(CS_CPU_EXCLUSIVE, cs, 0);
	}

	/*
	 * Change the parent's subparts_cpus.
	 * Newly added CPUs will be removed from effective_cpus and
	 * newly deleted ones will be added back to effective_cpus.
	 */
	spin_lock_irq(&callback_lock);
	if (adding) {
		cpumask_or(parent->subparts_cpus,
			   parent->subparts_cpus, tmp->addmask);
		cpumask_andnot(parent->effective_cpus,
			       parent->effective_cpus, tmp->addmask);
	}
	if (deleting) {
		cpumask_andnot(parent->subparts_cpus,
			       parent->subparts_cpus, tmp->delmask);
		/*
		 * Some of the CPUs in subparts_cpus might have been offlined.
		 */
		cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
		cpumask_or(parent->effective_cpus,
			   parent->effective_cpus, tmp->delmask);
	}

	parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);

	if (old_prs != new_prs)
		cs->partition_root_state = new_prs;

	spin_unlock_irq(&callback_lock);

	if (adding || deleting)
		update_tasks_cpumask(parent);

	/*
	 * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
	 * rebuild_sched_domains_locked() may be called.
	 */
	if (old_prs != new_prs) {
		if (old_prs == PRS_ISOLATED)
			update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
		else if (new_prs == PRS_ISOLATED)
			update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
	}
	notify_partition_change(cs, old_prs);
	return 0;
}

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs:  the cpuset to consider
 * @tmp: temp variables for calculating effective_cpus & partition setup
 * @force: don't skip any descendant cpusets if set
 *
 * When configured cpumask is changed, the effective cpumasks of this cpuset
 * and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_rwsem held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
				 bool force)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;
	bool need_rebuild_sched_domains = false;
	int old_prs, new_prs;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);
		bool update_parent = false;

		compute_effective_cpumask(tmp->new_cpus, cp, parent);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some CPUs unless
		 * it is a partition root that has explicitly distributed
		 * out all its CPUs.
		 */
		if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
			if (is_partition_valid(cp) &&
			    cpumask_equal(cp->cpus_allowed, cp->subparts_cpus))
				goto update_parent_subparts;

			cpumask_copy(tmp->new_cpus, parent->effective_cpus);
			if (!cp->use_parent_ecpus) {
				cp->use_parent_ecpus = true;
				parent->child_ecpus_count++;
			}
		} else if (cp->use_parent_ecpus) {
			cp->use_parent_ecpus = false;
			WARN_ON_ONCE(!parent->child_ecpus_count);
			parent->child_ecpus_count--;
		}

		/*
		 * Skip the whole subtree if the cpumask remains the same
		 * and has no partition root state and force flag not set.
		 */
		if (!cp->partition_root_state && !force &&
		    cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

update_parent_subparts:
		/*
		 * update_parent_subparts_cpumask() should have been called
		 * for cs already in update_cpumask().  We should also call
		 * update_tasks_cpumask() again for tasks in the parent
		 * cpuset if the parent's subparts_cpus changes.
		 */
		old_prs = new_prs = cp->partition_root_state;
		if ((cp != cs) && old_prs) {
			switch (parent->partition_root_state) {
			case PRS_ROOT:
			case PRS_ISOLATED:
				update_parent = true;
				break;

			default:
				/*
				 * When parent is not a partition root or is
				 * invalid, child partition roots become
				 * invalid too.
				 */
				if (is_partition_valid(cp))
					new_prs = -cp->partition_root_state;
				WRITE_ONCE(cp->prs_err,
					   is_partition_invalid(parent)
					   ? PERR_INVPARENT : PERR_NOTPART);
				break;
			}
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		if (update_parent) {
			update_parent_subparts_cpumask(cp, partcmd_update, NULL,
						       tmp);
			/*
			 * The cpuset partition_root_state may become
			 * invalid.  Capture it.
			 */
			new_prs = cp->partition_root_state;
		}

		spin_lock_irq(&callback_lock);

		if (cp->nr_subparts_cpus && !is_partition_valid(cp)) {
			/*
			 * Put all active subparts_cpus back to effective_cpus.
			 */
			cpumask_or(tmp->new_cpus, tmp->new_cpus,
				   cp->subparts_cpus);
			cpumask_and(tmp->new_cpus, tmp->new_cpus,
				    cpu_active_mask);
			cp->nr_subparts_cpus = 0;
			cpumask_clear(cp->subparts_cpus);
		}

		cpumask_copy(cp->effective_cpus, tmp->new_cpus);
		if (cp->nr_subparts_cpus) {
			/*
			 * Make sure that effective_cpus & subparts_cpus
			 * are mutually exclusive.
			 */
			cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
				       cp->subparts_cpus);
		}

		cp->partition_root_state = new_prs;
		spin_unlock_irq(&callback_lock);

		notify_partition_change(cp, old_prs);

		WARN_ON(!is_in_v2_mode() &&
			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

		update_tasks_cpumask(cp);

		/*
		 * On legacy hierarchy, if the effective cpumask of any non-
		 * empty cpuset is changed, we need to rebuild sched domains.
		 * On default hierarchy, the cpuset needs to be a partition
		 * root as well.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    is_sched_load_balance(cp) &&
		   (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
		    is_partition_valid(cp)))
			need_rebuild_sched_domains = true;

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();

	if (need_rebuild_sched_domains)
		rebuild_sched_domains_locked();
}

/**
 * update_sibling_cpumasks - Update siblings' cpumasks
 * @parent: Parent cpuset
 * @cs:     Current cpuset
 * @tmp:    Temp variables
 */
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
				    struct tmpmasks *tmp)
{
	struct cpuset *sibling;
	struct cgroup_subsys_state *pos_css;

	percpu_rwsem_assert_held(&cpuset_rwsem);

	/*
	 * Check all its siblings and call update_cpumasks_hier()
	 * if their use_parent_ecpus flag is set in order for them
	 * to use the right effective_cpus value.
	 *
	 * The update_cpumasks_hier() function may sleep.  So we have to
	 * release the RCU read lock before calling it.
	 */
	rcu_read_lock();
	cpuset_for_each_child(sibling, pos_css, parent) {
		if (sibling == cs)
			continue;
		if (!sibling->use_parent_ecpus)
			continue;
		if (!css_tryget_online(&sibling->css))
			continue;

		rcu_read_unlock();
		update_cpumasks_hier(sibling, tmp, false);
		rcu_read_lock();
		css_put(&sibling->css);
	}
	rcu_read_unlock();
}
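/*
 * Rough call flow for a write to cpuset.cpus (sketch of the code below):
 *
 *	update_cpumask()
 *	    validate_change()                - check the structural rules
 *	    update_parent_subparts_cpumask() - adjust parent's subparts_cpus
 *	                                       if this is a partition root
 *	    update_cpumasks_hier()           - recompute effective_cpus top-down
 *	    update_sibling_cpumasks()        - refresh siblings that borrow the
 *	                                       parent's effective_cpus
 */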
/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{
	int retval;
	struct tmpmasks tmp;
	bool invalidate = false;

	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
	if (!*buf) {
		cpumask_clear(trialcs->cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs->cpus_allowed);
		if (retval < 0)
			return retval;

		if (!cpumask_subset(trialcs->cpus_allowed,
				    top_cpuset.cpus_allowed))
			return -EINVAL;
	}

	/* Nothing to do if the cpus didn't change */
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;

#ifdef CONFIG_CPUMASK_OFFSTACK
	/*
	 * Use the cpumasks in trialcs for tmpmasks when they are pointers
	 * to allocated cpumasks.
	 */
	tmp.addmask  = trialcs->subparts_cpus;
	tmp.delmask  = trialcs->effective_cpus;
	tmp.new_cpus = trialcs->cpus_allowed;
#endif

	retval = validate_change(cs, trialcs);

	if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
		struct cpuset *cp, *parent;
		struct cgroup_subsys_state *css;

		/*
		 * The -EINVAL error code indicates that the partition sibling
		 * CPU exclusivity rule has been violated.  We still allow
		 * the cpumask change to proceed while invalidating the
		 * partition.  However, any conflicting sibling partitions
		 * have to be marked as invalid too.
		 */
		invalidate = true;
		rcu_read_lock();
		parent = parent_cs(cs);
		cpuset_for_each_child(cp, css, parent)
			if (is_partition_valid(cp) &&
			    cpumask_intersects(trialcs->cpus_allowed, cp->cpus_allowed)) {
				rcu_read_unlock();
				update_parent_subparts_cpumask(cp, partcmd_invalidate, NULL, &tmp);
				rcu_read_lock();
			}
		rcu_read_unlock();
		retval = 0;
	}
	if (retval < 0)
		return retval;

	if (cs->partition_root_state) {
		if (invalidate)
			update_parent_subparts_cpumask(cs, partcmd_invalidate,
						       NULL, &tmp);
		else
			update_parent_subparts_cpumask(cs, partcmd_update,
						       trialcs->cpus_allowed, &tmp);
	}

	compute_effective_cpumask(trialcs->effective_cpus, trialcs,
				  parent_cs(cs));
	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);

	/*
	 * Make sure that subparts_cpus, if not empty, is a subset of
	 * cpus_allowed.  Clear subparts_cpus if partition not valid or
	 * empty effective cpus with tasks.
	 */
	if (cs->nr_subparts_cpus) {
		if (!is_partition_valid(cs) ||
		   (cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus) &&
		    partition_is_populated(cs, NULL))) {
			cs->nr_subparts_cpus = 0;
			cpumask_clear(cs->subparts_cpus);
		} else {
			cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
				    cs->cpus_allowed);
			cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
		}
	}
	spin_unlock_irq(&callback_lock);

	/* effective_cpus will be updated here */
	update_cpumasks_hier(cs, &tmp, false);

	if (cs->partition_root_state) {
		struct cpuset *parent = parent_cs(cs);

		/*
		 * For partition root, update the cpumasks of sibling
		 * cpusets if they use parent's effective_cpus.
		 */
		if (parent->child_ecpus_count)
			update_sibling_cpumasks(parent, cs, &tmp);
	}
	return 0;
}

/*
 * Migrate memory region from one set of nodes to another.  This is
 * performed asynchronously as it can be called from the process migration
 * path holding locks involved in process management.  All mm migrations
 * are performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
1855 */ 1856 1857 struct cpuset_migrate_mm_work { 1858 struct work_struct work; 1859 struct mm_struct *mm; 1860 nodemask_t from; 1861 nodemask_t to; 1862 }; 1863 1864 static void cpuset_migrate_mm_workfn(struct work_struct *work) 1865 { 1866 struct cpuset_migrate_mm_work *mwork = 1867 container_of(work, struct cpuset_migrate_mm_work, work); 1868 1869 /* on a wq worker, no need to worry about %current's mems_allowed */ 1870 do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); 1871 mmput(mwork->mm); 1872 kfree(mwork); 1873 } 1874 1875 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, 1876 const nodemask_t *to) 1877 { 1878 struct cpuset_migrate_mm_work *mwork; 1879 1880 if (nodes_equal(*from, *to)) { 1881 mmput(mm); 1882 return; 1883 } 1884 1885 mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); 1886 if (mwork) { 1887 mwork->mm = mm; 1888 mwork->from = *from; 1889 mwork->to = *to; 1890 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); 1891 queue_work(cpuset_migrate_mm_wq, &mwork->work); 1892 } else { 1893 mmput(mm); 1894 } 1895 } 1896 1897 static void cpuset_post_attach(void) 1898 { 1899 flush_workqueue(cpuset_migrate_mm_wq); 1900 } 1901 1902 /* 1903 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy 1904 * @tsk: the task to change 1905 * @newmems: new nodes that the task will be set 1906 * 1907 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed 1908 * and rebind an eventual tasks' mempolicy. If the task is allocating in 1909 * parallel, it might temporarily see an empty intersection, which results in 1910 * a seqlock check and retry before OOM or allocation failure. 1911 */ 1912 static void cpuset_change_task_nodemask(struct task_struct *tsk, 1913 nodemask_t *newmems) 1914 { 1915 task_lock(tsk); 1916 1917 local_irq_disable(); 1918 write_seqcount_begin(&tsk->mems_allowed_seq); 1919 1920 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 1921 mpol_rebind_task(tsk, newmems); 1922 tsk->mems_allowed = *newmems; 1923 1924 write_seqcount_end(&tsk->mems_allowed_seq); 1925 local_irq_enable(); 1926 1927 task_unlock(tsk); 1928 } 1929 1930 static void *cpuset_being_rebound; 1931 1932 /** 1933 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1934 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1935 * 1936 * Iterate through each task of @cs updating its mems_allowed to the 1937 * effective cpuset's. As this function is called with cpuset_rwsem held, 1938 * cpuset membership stays stable. 1939 */ 1940 static void update_tasks_nodemask(struct cpuset *cs) 1941 { 1942 static nodemask_t newmems; /* protected by cpuset_rwsem */ 1943 struct css_task_iter it; 1944 struct task_struct *task; 1945 1946 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1947 1948 guarantee_online_mems(cs, &newmems); 1949 1950 /* 1951 * The mpol_rebind_mm() call takes mmap_lock, which we couldn't 1952 * take while holding tasklist_lock. Forks can happen - the 1953 * mpol_dup() cpuset_being_rebound check will catch such forks, 1954 * and rebind their vma mempolicies too. Because we still hold 1955 * the global cpuset_rwsem, we know that no other rebind effort 1956 * will be contending for the global variable cpuset_being_rebound. 1957 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1958 * is idempotent. Also migrate pages in each mm to new nodes. 
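	 *
	 * For reference, the write_seqcount_begin/end pair in
	 * cpuset_change_task_nodemask() above has a matching retry loop on
	 * the reader side. Roughly (a sketch, not the exact allocator code):
	 *
	 *	unsigned int seq;
	 *
	 *	do {
	 *		seq = read_mems_allowed_begin();
	 *		... allocate using current->mems_allowed ...
	 *	} while (read_mems_allowed_retry(seq));
	 *
	 * so a transiently inconsistent mems_allowed seen during the update
	 * only causes a retry, not a premature allocation failure or OOM.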
1959 */ 1960 css_task_iter_start(&cs->css, 0, &it); 1961 while ((task = css_task_iter_next(&it))) { 1962 struct mm_struct *mm; 1963 bool migrate; 1964 1965 cpuset_change_task_nodemask(task, &newmems); 1966 1967 mm = get_task_mm(task); 1968 if (!mm) 1969 continue; 1970 1971 migrate = is_memory_migrate(cs); 1972 1973 mpol_rebind_mm(mm, &cs->mems_allowed); 1974 if (migrate) 1975 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); 1976 else 1977 mmput(mm); 1978 } 1979 css_task_iter_end(&it); 1980 1981 /* 1982 * All the tasks' nodemasks have been updated, update 1983 * cs->old_mems_allowed. 1984 */ 1985 cs->old_mems_allowed = newmems; 1986 1987 /* We're done rebinding vmas to this cpuset's new mems_allowed. */ 1988 cpuset_being_rebound = NULL; 1989 } 1990 1991 /* 1992 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree 1993 * @cs: the cpuset to consider 1994 * @new_mems: a temp variable for calculating new effective_mems 1995 * 1996 * When configured nodemask is changed, the effective nodemasks of this cpuset 1997 * and all its descendants need to be updated. 1998 * 1999 * On legacy hierarchy, effective_mems will be the same with mems_allowed. 2000 * 2001 * Called with cpuset_rwsem held 2002 */ 2003 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) 2004 { 2005 struct cpuset *cp; 2006 struct cgroup_subsys_state *pos_css; 2007 2008 rcu_read_lock(); 2009 cpuset_for_each_descendant_pre(cp, pos_css, cs) { 2010 struct cpuset *parent = parent_cs(cp); 2011 2012 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); 2013 2014 /* 2015 * If it becomes empty, inherit the effective mask of the 2016 * parent, which is guaranteed to have some MEMs. 2017 */ 2018 if (is_in_v2_mode() && nodes_empty(*new_mems)) 2019 *new_mems = parent->effective_mems; 2020 2021 /* Skip the whole subtree if the nodemask remains the same. */ 2022 if (nodes_equal(*new_mems, cp->effective_mems)) { 2023 pos_css = css_rightmost_descendant(pos_css); 2024 continue; 2025 } 2026 2027 if (!css_tryget_online(&cp->css)) 2028 continue; 2029 rcu_read_unlock(); 2030 2031 spin_lock_irq(&callback_lock); 2032 cp->effective_mems = *new_mems; 2033 spin_unlock_irq(&callback_lock); 2034 2035 WARN_ON(!is_in_v2_mode() && 2036 !nodes_equal(cp->mems_allowed, cp->effective_mems)); 2037 2038 update_tasks_nodemask(cp); 2039 2040 rcu_read_lock(); 2041 css_put(&cp->css); 2042 } 2043 rcu_read_unlock(); 2044 } 2045 2046 /* 2047 * Handle user request to change the 'mems' memory placement 2048 * of a cpuset. Needs to validate the request, update the 2049 * cpusets mems_allowed, and for each task in the cpuset, 2050 * update mems_allowed and rebind task's mempolicy and any vma 2051 * mempolicies and if the cpuset is marked 'memory_migrate', 2052 * migrate the tasks pages to the new memory. 2053 * 2054 * Call with cpuset_rwsem held. May take callback_lock during call. 2055 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 2056 * lock each such tasks mm->mmap_lock, scan its vma's and rebind 2057 * their mempolicies to the cpusets new mems_allowed. 2058 */ 2059 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 2060 const char *buf) 2061 { 2062 int retval; 2063 2064 /* 2065 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 2066 * it's read-only 2067 */ 2068 if (cs == &top_cpuset) { 2069 retval = -EACCES; 2070 goto done; 2071 } 2072 2073 /* 2074 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 
2075 * Since nodelist_parse() fails on an empty mask, we special case 2076 * that parsing. The validate_change() call ensures that cpusets 2077 * with tasks have memory. 2078 */ 2079 if (!*buf) { 2080 nodes_clear(trialcs->mems_allowed); 2081 } else { 2082 retval = nodelist_parse(buf, trialcs->mems_allowed); 2083 if (retval < 0) 2084 goto done; 2085 2086 if (!nodes_subset(trialcs->mems_allowed, 2087 top_cpuset.mems_allowed)) { 2088 retval = -EINVAL; 2089 goto done; 2090 } 2091 } 2092 2093 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { 2094 retval = 0; /* Too easy - nothing to do */ 2095 goto done; 2096 } 2097 retval = validate_change(cs, trialcs); 2098 if (retval < 0) 2099 goto done; 2100 2101 check_insane_mems_config(&trialcs->mems_allowed); 2102 2103 spin_lock_irq(&callback_lock); 2104 cs->mems_allowed = trialcs->mems_allowed; 2105 spin_unlock_irq(&callback_lock); 2106 2107 /* use trialcs->mems_allowed as a temp variable */ 2108 update_nodemasks_hier(cs, &trialcs->mems_allowed); 2109 done: 2110 return retval; 2111 } 2112 2113 bool current_cpuset_is_being_rebound(void) 2114 { 2115 bool ret; 2116 2117 rcu_read_lock(); 2118 ret = task_cs(current) == cpuset_being_rebound; 2119 rcu_read_unlock(); 2120 2121 return ret; 2122 } 2123 2124 static int update_relax_domain_level(struct cpuset *cs, s64 val) 2125 { 2126 #ifdef CONFIG_SMP 2127 if (val < -1 || val >= sched_domain_level_max) 2128 return -EINVAL; 2129 #endif 2130 2131 if (val != cs->relax_domain_level) { 2132 cs->relax_domain_level = val; 2133 if (!cpumask_empty(cs->cpus_allowed) && 2134 is_sched_load_balance(cs)) 2135 rebuild_sched_domains_locked(); 2136 } 2137 2138 return 0; 2139 } 2140 2141 /** 2142 * update_tasks_flags - update the spread flags of tasks in the cpuset. 2143 * @cs: the cpuset in which each task's spread flags needs to be changed 2144 * 2145 * Iterate through each task of @cs updating its spread flags. As this 2146 * function is called with cpuset_rwsem held, cpuset membership stays 2147 * stable. 2148 */ 2149 static void update_tasks_flags(struct cpuset *cs) 2150 { 2151 struct css_task_iter it; 2152 struct task_struct *task; 2153 2154 css_task_iter_start(&cs->css, 0, &it); 2155 while ((task = css_task_iter_next(&it))) 2156 cpuset_update_task_spread_flag(cs, task); 2157 css_task_iter_end(&it); 2158 } 2159 2160 /* 2161 * update_flag - read a 0 or a 1 in a file and update associated flag 2162 * bit: the bit to update (see cpuset_flagbits_t) 2163 * cs: the cpuset to update 2164 * turning_on: whether the flag is being set or cleared 2165 * 2166 * Call with cpuset_rwsem held. 
2167 */ 2168 2169 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 2170 int turning_on) 2171 { 2172 struct cpuset *trialcs; 2173 int balance_flag_changed; 2174 int spread_flag_changed; 2175 int err; 2176 2177 trialcs = alloc_trial_cpuset(cs); 2178 if (!trialcs) 2179 return -ENOMEM; 2180 2181 if (turning_on) 2182 set_bit(bit, &trialcs->flags); 2183 else 2184 clear_bit(bit, &trialcs->flags); 2185 2186 err = validate_change(cs, trialcs); 2187 if (err < 0) 2188 goto out; 2189 2190 balance_flag_changed = (is_sched_load_balance(cs) != 2191 is_sched_load_balance(trialcs)); 2192 2193 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) 2194 || (is_spread_page(cs) != is_spread_page(trialcs))); 2195 2196 spin_lock_irq(&callback_lock); 2197 cs->flags = trialcs->flags; 2198 spin_unlock_irq(&callback_lock); 2199 2200 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 2201 rebuild_sched_domains_locked(); 2202 2203 if (spread_flag_changed) 2204 update_tasks_flags(cs); 2205 out: 2206 free_cpuset(trialcs); 2207 return err; 2208 } 2209 2210 /** 2211 * update_prstate - update partition_root_state 2212 * @cs: the cpuset to update 2213 * @new_prs: new partition root state 2214 * Return: 0 if successful, != 0 if error 2215 * 2216 * Call with cpuset_rwsem held. 2217 */ 2218 static int update_prstate(struct cpuset *cs, int new_prs) 2219 { 2220 int err = PERR_NONE, old_prs = cs->partition_root_state; 2221 bool sched_domain_rebuilt = false; 2222 struct cpuset *parent = parent_cs(cs); 2223 struct tmpmasks tmpmask; 2224 2225 if (old_prs == new_prs) 2226 return 0; 2227 2228 /* 2229 * For a previously invalid partition root, leave it at being 2230 * invalid if new_prs is not "member". 2231 */ 2232 if (new_prs && is_prs_invalid(old_prs)) { 2233 cs->partition_root_state = -new_prs; 2234 return 0; 2235 } 2236 2237 if (alloc_cpumasks(NULL, &tmpmask)) 2238 return -ENOMEM; 2239 2240 if (!old_prs) { 2241 /* 2242 * Turning on partition root requires setting the 2243 * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed 2244 * cannot be empty. 2245 */ 2246 if (cpumask_empty(cs->cpus_allowed)) { 2247 err = PERR_CPUSEMPTY; 2248 goto out; 2249 } 2250 2251 err = update_flag(CS_CPU_EXCLUSIVE, cs, 1); 2252 if (err) { 2253 err = PERR_NOTEXCL; 2254 goto out; 2255 } 2256 2257 err = update_parent_subparts_cpumask(cs, partcmd_enable, 2258 NULL, &tmpmask); 2259 if (err) { 2260 update_flag(CS_CPU_EXCLUSIVE, cs, 0); 2261 goto out; 2262 } 2263 2264 if (new_prs == PRS_ISOLATED) { 2265 /* 2266 * Disable the load balance flag should not return an 2267 * error unless the system is running out of memory. 2268 */ 2269 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 2270 sched_domain_rebuilt = true; 2271 } 2272 } else if (old_prs && new_prs) { 2273 /* 2274 * A change in load balance state only, no change in cpumasks. 2275 */ 2276 update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED)); 2277 sched_domain_rebuilt = true; 2278 goto out; /* Sched domain is rebuilt in update_flag() */ 2279 } else { 2280 /* 2281 * Switching back to member is always allowed even if it 2282 * disables child partitions. 2283 */ 2284 update_parent_subparts_cpumask(cs, partcmd_disable, NULL, 2285 &tmpmask); 2286 2287 /* 2288 * If there are child partitions, they will all become invalid. 
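		 *
		 * (For reference, the state encoding at work in these
		 * transitions: PRS_ROOT and PRS_ISOLATED are positive,
		 * PRS_MEMBER is 0, and an invalid partition keeps the negated
		 * valid value, i.e. PRS_INVALID_ROOT / PRS_INVALID_ISOLATED,
		 * which is why the error path below just negates new_prs.)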
2289 */ 2290 if (unlikely(cs->nr_subparts_cpus)) { 2291 spin_lock_irq(&callback_lock); 2292 cs->nr_subparts_cpus = 0; 2293 cpumask_clear(cs->subparts_cpus); 2294 compute_effective_cpumask(cs->effective_cpus, cs, parent); 2295 spin_unlock_irq(&callback_lock); 2296 } 2297 2298 /* Turning off CS_CPU_EXCLUSIVE will not return error */ 2299 update_flag(CS_CPU_EXCLUSIVE, cs, 0); 2300 2301 if (!is_sched_load_balance(cs)) { 2302 /* Make sure load balance is on */ 2303 update_flag(CS_SCHED_LOAD_BALANCE, cs, 1); 2304 sched_domain_rebuilt = true; 2305 } 2306 } 2307 2308 update_tasks_cpumask(parent); 2309 2310 if (parent->child_ecpus_count) 2311 update_sibling_cpumasks(parent, cs, &tmpmask); 2312 2313 if (!sched_domain_rebuilt) 2314 rebuild_sched_domains_locked(); 2315 out: 2316 /* 2317 * Make partition invalid if an error happen 2318 */ 2319 if (err) 2320 new_prs = -new_prs; 2321 spin_lock_irq(&callback_lock); 2322 cs->partition_root_state = new_prs; 2323 spin_unlock_irq(&callback_lock); 2324 /* 2325 * Update child cpusets, if present. 2326 * Force update if switching back to member. 2327 */ 2328 if (!list_empty(&cs->css.children)) 2329 update_cpumasks_hier(cs, &tmpmask, !new_prs); 2330 2331 notify_partition_change(cs, old_prs); 2332 free_cpumasks(NULL, &tmpmask); 2333 return 0; 2334 } 2335 2336 /* 2337 * Frequency meter - How fast is some event occurring? 2338 * 2339 * These routines manage a digitally filtered, constant time based, 2340 * event frequency meter. There are four routines: 2341 * fmeter_init() - initialize a frequency meter. 2342 * fmeter_markevent() - called each time the event happens. 2343 * fmeter_getrate() - returns the recent rate of such events. 2344 * fmeter_update() - internal routine used to update fmeter. 2345 * 2346 * A common data structure is passed to each of these routines, 2347 * which is used to keep track of the state required to manage the 2348 * frequency meter and its digital filter. 2349 * 2350 * The filter works on the number of events marked per unit time. 2351 * The filter is single-pole low-pass recursive (IIR). The time unit 2352 * is 1 second. Arithmetic is done using 32-bit integers scaled to 2353 * simulate 3 decimal digits of precision (multiplied by 1000). 2354 * 2355 * With an FM_COEF of 933, and a time base of 1 second, the filter 2356 * has a half-life of 10 seconds, meaning that if the events quit 2357 * happening, then the rate returned from the fmeter_getrate() 2358 * will be cut in half each 10 seconds, until it converges to zero. 2359 * 2360 * It is not worth doing a real infinitely recursive filter. If more 2361 * than FM_MAXTICKS ticks have elapsed since the last filter event, 2362 * just compute FM_MAXTICKS ticks worth, by which point the level 2363 * will be stable. 2364 * 2365 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid 2366 * arithmetic overflow in the fmeter_update() routine. 2367 * 2368 * Given the simple 32 bit integer arithmetic used, this meter works 2369 * best for reporting rates between one per millisecond (msec) and 2370 * one per 32 (approx) seconds. At constant rates faster than one 2371 * per msec it maxes out at values just under 1,000,000. At constant 2372 * rates between one per msec, and one per second it will stabilize 2373 * to a value N*1000, where N is the rate of events per second. 2374 * At constant rates between one per second and one per 32 seconds, 2375 * it will be choppy, moving up on the seconds that have an event, 2376 * and then decaying until the next event. 
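 *
 * (A quick worked check of the numbers quoted above, using the constants
 * defined below: each elapsed second scales val by FM_COEF/FM_SCALE =
 * 933/1000, and 0.933^10 is roughly 0.50, which is where the 10 second
 * half-life comes from. Each event adds FM_SCALE to cnt, so a steady
 * N events/sec accumulates about N * FM_SCALE counts per one second tick,
 * and fmeter_update() then adds
 * ((FM_SCALE - FM_COEF) * N * FM_SCALE) / FM_SCALE = 67 * N to val after
 * the decay. The fixed point of val = (933 * val) / 1000 + 67 * N is
 * val = N * 1000, the N*1000 plateau described above.)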
At rates slower than 2377 * about one in 32 seconds, it decays all the way back to zero between 2378 * each event. 2379 */ 2380 2381 #define FM_COEF 933 /* coefficient for half-life of 10 secs */ 2382 #define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ 2383 #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ 2384 #define FM_SCALE 1000 /* faux fixed point scale */ 2385 2386 /* Initialize a frequency meter */ 2387 static void fmeter_init(struct fmeter *fmp) 2388 { 2389 fmp->cnt = 0; 2390 fmp->val = 0; 2391 fmp->time = 0; 2392 spin_lock_init(&fmp->lock); 2393 } 2394 2395 /* Internal meter update - process cnt events and update value */ 2396 static void fmeter_update(struct fmeter *fmp) 2397 { 2398 time64_t now; 2399 u32 ticks; 2400 2401 now = ktime_get_seconds(); 2402 ticks = now - fmp->time; 2403 2404 if (ticks == 0) 2405 return; 2406 2407 ticks = min(FM_MAXTICKS, ticks); 2408 while (ticks-- > 0) 2409 fmp->val = (FM_COEF * fmp->val) / FM_SCALE; 2410 fmp->time = now; 2411 2412 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; 2413 fmp->cnt = 0; 2414 } 2415 2416 /* Process any previous ticks, then bump cnt by one (times scale). */ 2417 static void fmeter_markevent(struct fmeter *fmp) 2418 { 2419 spin_lock(&fmp->lock); 2420 fmeter_update(fmp); 2421 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); 2422 spin_unlock(&fmp->lock); 2423 } 2424 2425 /* Process any previous ticks, then return current value. */ 2426 static int fmeter_getrate(struct fmeter *fmp) 2427 { 2428 int val; 2429 2430 spin_lock(&fmp->lock); 2431 fmeter_update(fmp); 2432 val = fmp->val; 2433 spin_unlock(&fmp->lock); 2434 return val; 2435 } 2436 2437 static struct cpuset *cpuset_attach_old_cs; 2438 2439 /* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */ 2440 static int cpuset_can_attach(struct cgroup_taskset *tset) 2441 { 2442 struct cgroup_subsys_state *css; 2443 struct cpuset *cs; 2444 struct task_struct *task; 2445 int ret; 2446 2447 /* used later by cpuset_attach() */ 2448 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); 2449 cs = css_cs(css); 2450 2451 percpu_down_write(&cpuset_rwsem); 2452 2453 /* allow moving tasks into an empty cpuset if on default hierarchy */ 2454 ret = -ENOSPC; 2455 if (!is_in_v2_mode() && 2456 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 2457 goto out_unlock; 2458 2459 /* 2460 * Task cannot be moved to a cpuset with empty effective cpus. 2461 */ 2462 if (cpumask_empty(cs->effective_cpus)) 2463 goto out_unlock; 2464 2465 cgroup_taskset_for_each(task, css, tset) { 2466 ret = task_can_attach(task, cs->effective_cpus); 2467 if (ret) 2468 goto out_unlock; 2469 ret = security_task_setscheduler(task); 2470 if (ret) 2471 goto out_unlock; 2472 } 2473 2474 /* 2475 * Mark attach is in progress. This makes validate_change() fail 2476 * changes which zero cpus/mems_allowed. 2477 */ 2478 cs->attach_in_progress++; 2479 ret = 0; 2480 out_unlock: 2481 percpu_up_write(&cpuset_rwsem); 2482 return ret; 2483 } 2484 2485 static void cpuset_cancel_attach(struct cgroup_taskset *tset) 2486 { 2487 struct cgroup_subsys_state *css; 2488 2489 cgroup_taskset_first(tset, &css); 2490 2491 percpu_down_write(&cpuset_rwsem); 2492 css_cs(css)->attach_in_progress--; 2493 percpu_up_write(&cpuset_rwsem); 2494 } 2495 2496 /* 2497 * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach() 2498 * but we can't allocate it dynamically there. Define it global and 2499 * allocate from cpuset_init(). 
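 *
 * As a reminder of the ordering involved (see the cpuset_cgrp_subsys
 * definition further down): cgroup core calls cpuset_can_attach() first,
 * then, roughly, either cpuset_cancel_attach() on failure or
 * cpuset_attach() on success, followed by cpuset_post_attach().
 * attach_in_progress is raised in cpuset_can_attach() and dropped again
 * in cpuset_attach() or cpuset_cancel_attach(), so validate_change()
 * keeps rejecting changes that would empty cpus/mems_allowed in between.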
 */
static cpumask_var_t cpus_attach;

static void cpuset_attach(struct cgroup_taskset *tset)
{
	/* static buf protected by cpuset_rwsem */
	static nodemask_t cpuset_attach_nodemask_to;
	struct task_struct *task;
	struct task_struct *leader;
	struct cgroup_subsys_state *css;
	struct cpuset *cs;
	struct cpuset *oldcs = cpuset_attach_old_cs;

	cgroup_taskset_first(tset, &css);
	cs = css_cs(css);

	lockdep_assert_cpus_held();	/* see cgroup_attach_lock() */
	percpu_down_write(&cpuset_rwsem);

	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

	cgroup_taskset_for_each(task, css, tset) {
		if (cs != &top_cpuset)
			guarantee_online_cpus(task, cpus_attach);
		else
			cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
		/*
		 * can_attach beforehand should guarantee that this doesn't
		 * fail. TODO: have a better way to handle failure here
		 */
		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
		cpuset_update_task_spread_flag(cs, task);
	}

	/*
	 * Change mm for all threadgroup leaders. This is expensive and may
	 * sleep and should be moved outside migration path proper.
	 */
	cpuset_attach_nodemask_to = cs->effective_mems;
	cgroup_taskset_for_each_leader(leader, css, tset) {
		struct mm_struct *mm = get_task_mm(leader);

		if (mm) {
			mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

			/*
			 * old_mems_allowed is the same as mems_allowed
			 * here, except if this task is being moved
			 * automatically due to hotplug. In that case
			 * @mems_allowed has been updated and is empty, so
			 * @old_mems_allowed is the right nodemask to
			 * migrate mm from.
2554 */ 2555 if (is_memory_migrate(cs)) 2556 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, 2557 &cpuset_attach_nodemask_to); 2558 else 2559 mmput(mm); 2560 } 2561 } 2562 2563 cs->old_mems_allowed = cpuset_attach_nodemask_to; 2564 2565 cs->attach_in_progress--; 2566 if (!cs->attach_in_progress) 2567 wake_up(&cpuset_attach_wq); 2568 2569 percpu_up_write(&cpuset_rwsem); 2570 } 2571 2572 /* The various types of files and directories in a cpuset file system */ 2573 2574 typedef enum { 2575 FILE_MEMORY_MIGRATE, 2576 FILE_CPULIST, 2577 FILE_MEMLIST, 2578 FILE_EFFECTIVE_CPULIST, 2579 FILE_EFFECTIVE_MEMLIST, 2580 FILE_SUBPARTS_CPULIST, 2581 FILE_CPU_EXCLUSIVE, 2582 FILE_MEM_EXCLUSIVE, 2583 FILE_MEM_HARDWALL, 2584 FILE_SCHED_LOAD_BALANCE, 2585 FILE_PARTITION_ROOT, 2586 FILE_SCHED_RELAX_DOMAIN_LEVEL, 2587 FILE_MEMORY_PRESSURE_ENABLED, 2588 FILE_MEMORY_PRESSURE, 2589 FILE_SPREAD_PAGE, 2590 FILE_SPREAD_SLAB, 2591 } cpuset_filetype_t; 2592 2593 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, 2594 u64 val) 2595 { 2596 struct cpuset *cs = css_cs(css); 2597 cpuset_filetype_t type = cft->private; 2598 int retval = 0; 2599 2600 cpus_read_lock(); 2601 percpu_down_write(&cpuset_rwsem); 2602 if (!is_cpuset_online(cs)) { 2603 retval = -ENODEV; 2604 goto out_unlock; 2605 } 2606 2607 switch (type) { 2608 case FILE_CPU_EXCLUSIVE: 2609 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); 2610 break; 2611 case FILE_MEM_EXCLUSIVE: 2612 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); 2613 break; 2614 case FILE_MEM_HARDWALL: 2615 retval = update_flag(CS_MEM_HARDWALL, cs, val); 2616 break; 2617 case FILE_SCHED_LOAD_BALANCE: 2618 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); 2619 break; 2620 case FILE_MEMORY_MIGRATE: 2621 retval = update_flag(CS_MEMORY_MIGRATE, cs, val); 2622 break; 2623 case FILE_MEMORY_PRESSURE_ENABLED: 2624 cpuset_memory_pressure_enabled = !!val; 2625 break; 2626 case FILE_SPREAD_PAGE: 2627 retval = update_flag(CS_SPREAD_PAGE, cs, val); 2628 break; 2629 case FILE_SPREAD_SLAB: 2630 retval = update_flag(CS_SPREAD_SLAB, cs, val); 2631 break; 2632 default: 2633 retval = -EINVAL; 2634 break; 2635 } 2636 out_unlock: 2637 percpu_up_write(&cpuset_rwsem); 2638 cpus_read_unlock(); 2639 return retval; 2640 } 2641 2642 static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, 2643 s64 val) 2644 { 2645 struct cpuset *cs = css_cs(css); 2646 cpuset_filetype_t type = cft->private; 2647 int retval = -ENODEV; 2648 2649 cpus_read_lock(); 2650 percpu_down_write(&cpuset_rwsem); 2651 if (!is_cpuset_online(cs)) 2652 goto out_unlock; 2653 2654 switch (type) { 2655 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 2656 retval = update_relax_domain_level(cs, val); 2657 break; 2658 default: 2659 retval = -EINVAL; 2660 break; 2661 } 2662 out_unlock: 2663 percpu_up_write(&cpuset_rwsem); 2664 cpus_read_unlock(); 2665 return retval; 2666 } 2667 2668 /* 2669 * Common handling for a write to a "cpus" or "mems" file. 2670 */ 2671 static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, 2672 char *buf, size_t nbytes, loff_t off) 2673 { 2674 struct cpuset *cs = css_cs(of_css(of)); 2675 struct cpuset *trialcs; 2676 int retval = -ENODEV; 2677 2678 buf = strstrip(buf); 2679 2680 /* 2681 * CPU or memory hotunplug may leave @cs w/o any execution 2682 * resources, in which case the hotplug code asynchronously updates 2683 * configuration and transfers all tasks to the nearest ancestor 2684 * which can execute. 
	 *
	 * As writes to "cpus" or "mems" may restore @cs's execution
	 * resources, wait for the previously scheduled operations before
	 * proceeding, so that we don't end up repeatedly removing tasks
	 * added after execution capability is restored.
	 *
	 * cpuset_hotplug_work calls back into cgroup core via
	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
	 * operation like this one can lead to a deadlock through kernfs
	 * active_ref protection. Let's break the protection. Losing the
	 * protection is okay as we check whether @cs is online after
	 * grabbing cpuset_rwsem anyway. This only happens on the legacy
	 * hierarchies.
	 */
	css_get(&cs->css);
	kernfs_break_active_protection(of->kn);
	flush_work(&cpuset_hotplug_work);

	cpus_read_lock();
	percpu_down_write(&cpuset_rwsem);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
		retval = -ENOMEM;
		goto out_unlock;
	}

	switch (of_cft(of)->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_cpuset(trialcs);
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	cpus_read_unlock();
	kernfs_unbreak_active_protection(of->kn);
	css_put(&cs->css);
	flush_workqueue(cpuset_migrate_mm_wq);
	return retval ?: nbytes;
}

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map. If read in smaller
 * chunks, there is no guarantee of atomicity. Since the display format
 * used, a list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
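 *
 * The display format in question is the usual ranged list, e.g. a cpuset
 * whose cpus_allowed contains CPUs 0-3 and 7 reads back as
 *
 *	0-3,7
 *
 * from the "%*pbl" seq_printf() calls below, and likewise for the node
 * masks.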
2743 */ 2744 static int cpuset_common_seq_show(struct seq_file *sf, void *v) 2745 { 2746 struct cpuset *cs = css_cs(seq_css(sf)); 2747 cpuset_filetype_t type = seq_cft(sf)->private; 2748 int ret = 0; 2749 2750 spin_lock_irq(&callback_lock); 2751 2752 switch (type) { 2753 case FILE_CPULIST: 2754 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); 2755 break; 2756 case FILE_MEMLIST: 2757 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); 2758 break; 2759 case FILE_EFFECTIVE_CPULIST: 2760 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus)); 2761 break; 2762 case FILE_EFFECTIVE_MEMLIST: 2763 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); 2764 break; 2765 case FILE_SUBPARTS_CPULIST: 2766 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus)); 2767 break; 2768 default: 2769 ret = -EINVAL; 2770 } 2771 2772 spin_unlock_irq(&callback_lock); 2773 return ret; 2774 } 2775 2776 static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) 2777 { 2778 struct cpuset *cs = css_cs(css); 2779 cpuset_filetype_t type = cft->private; 2780 switch (type) { 2781 case FILE_CPU_EXCLUSIVE: 2782 return is_cpu_exclusive(cs); 2783 case FILE_MEM_EXCLUSIVE: 2784 return is_mem_exclusive(cs); 2785 case FILE_MEM_HARDWALL: 2786 return is_mem_hardwall(cs); 2787 case FILE_SCHED_LOAD_BALANCE: 2788 return is_sched_load_balance(cs); 2789 case FILE_MEMORY_MIGRATE: 2790 return is_memory_migrate(cs); 2791 case FILE_MEMORY_PRESSURE_ENABLED: 2792 return cpuset_memory_pressure_enabled; 2793 case FILE_MEMORY_PRESSURE: 2794 return fmeter_getrate(&cs->fmeter); 2795 case FILE_SPREAD_PAGE: 2796 return is_spread_page(cs); 2797 case FILE_SPREAD_SLAB: 2798 return is_spread_slab(cs); 2799 default: 2800 BUG(); 2801 } 2802 2803 /* Unreachable but makes gcc happy */ 2804 return 0; 2805 } 2806 2807 static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) 2808 { 2809 struct cpuset *cs = css_cs(css); 2810 cpuset_filetype_t type = cft->private; 2811 switch (type) { 2812 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 2813 return cs->relax_domain_level; 2814 default: 2815 BUG(); 2816 } 2817 2818 /* Unreachable but makes gcc happy */ 2819 return 0; 2820 } 2821 2822 static int sched_partition_show(struct seq_file *seq, void *v) 2823 { 2824 struct cpuset *cs = css_cs(seq_css(seq)); 2825 const char *err, *type = NULL; 2826 2827 switch (cs->partition_root_state) { 2828 case PRS_ROOT: 2829 seq_puts(seq, "root\n"); 2830 break; 2831 case PRS_ISOLATED: 2832 seq_puts(seq, "isolated\n"); 2833 break; 2834 case PRS_MEMBER: 2835 seq_puts(seq, "member\n"); 2836 break; 2837 case PRS_INVALID_ROOT: 2838 type = "root"; 2839 fallthrough; 2840 case PRS_INVALID_ISOLATED: 2841 if (!type) 2842 type = "isolated"; 2843 err = perr_strings[READ_ONCE(cs->prs_err)]; 2844 if (err) 2845 seq_printf(seq, "%s invalid (%s)\n", type, err); 2846 else 2847 seq_printf(seq, "%s invalid\n", type); 2848 break; 2849 } 2850 return 0; 2851 } 2852 2853 static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, 2854 size_t nbytes, loff_t off) 2855 { 2856 struct cpuset *cs = css_cs(of_css(of)); 2857 int val; 2858 int retval = -ENODEV; 2859 2860 buf = strstrip(buf); 2861 2862 /* 2863 * Convert "root" to ENABLED, and convert "member" to DISABLED. 
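	 *
	 * In practice the accepted strings and their mappings below are
	 * "root" -> PRS_ROOT, "isolated" -> PRS_ISOLATED and
	 * "member" -> PRS_MEMBER; anything else is rejected with -EINVAL.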
2864 */ 2865 if (!strcmp(buf, "root")) 2866 val = PRS_ROOT; 2867 else if (!strcmp(buf, "member")) 2868 val = PRS_MEMBER; 2869 else if (!strcmp(buf, "isolated")) 2870 val = PRS_ISOLATED; 2871 else 2872 return -EINVAL; 2873 2874 css_get(&cs->css); 2875 cpus_read_lock(); 2876 percpu_down_write(&cpuset_rwsem); 2877 if (!is_cpuset_online(cs)) 2878 goto out_unlock; 2879 2880 retval = update_prstate(cs, val); 2881 out_unlock: 2882 percpu_up_write(&cpuset_rwsem); 2883 cpus_read_unlock(); 2884 css_put(&cs->css); 2885 return retval ?: nbytes; 2886 } 2887 2888 /* 2889 * for the common functions, 'private' gives the type of file 2890 */ 2891 2892 static struct cftype legacy_files[] = { 2893 { 2894 .name = "cpus", 2895 .seq_show = cpuset_common_seq_show, 2896 .write = cpuset_write_resmask, 2897 .max_write_len = (100U + 6 * NR_CPUS), 2898 .private = FILE_CPULIST, 2899 }, 2900 2901 { 2902 .name = "mems", 2903 .seq_show = cpuset_common_seq_show, 2904 .write = cpuset_write_resmask, 2905 .max_write_len = (100U + 6 * MAX_NUMNODES), 2906 .private = FILE_MEMLIST, 2907 }, 2908 2909 { 2910 .name = "effective_cpus", 2911 .seq_show = cpuset_common_seq_show, 2912 .private = FILE_EFFECTIVE_CPULIST, 2913 }, 2914 2915 { 2916 .name = "effective_mems", 2917 .seq_show = cpuset_common_seq_show, 2918 .private = FILE_EFFECTIVE_MEMLIST, 2919 }, 2920 2921 { 2922 .name = "cpu_exclusive", 2923 .read_u64 = cpuset_read_u64, 2924 .write_u64 = cpuset_write_u64, 2925 .private = FILE_CPU_EXCLUSIVE, 2926 }, 2927 2928 { 2929 .name = "mem_exclusive", 2930 .read_u64 = cpuset_read_u64, 2931 .write_u64 = cpuset_write_u64, 2932 .private = FILE_MEM_EXCLUSIVE, 2933 }, 2934 2935 { 2936 .name = "mem_hardwall", 2937 .read_u64 = cpuset_read_u64, 2938 .write_u64 = cpuset_write_u64, 2939 .private = FILE_MEM_HARDWALL, 2940 }, 2941 2942 { 2943 .name = "sched_load_balance", 2944 .read_u64 = cpuset_read_u64, 2945 .write_u64 = cpuset_write_u64, 2946 .private = FILE_SCHED_LOAD_BALANCE, 2947 }, 2948 2949 { 2950 .name = "sched_relax_domain_level", 2951 .read_s64 = cpuset_read_s64, 2952 .write_s64 = cpuset_write_s64, 2953 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, 2954 }, 2955 2956 { 2957 .name = "memory_migrate", 2958 .read_u64 = cpuset_read_u64, 2959 .write_u64 = cpuset_write_u64, 2960 .private = FILE_MEMORY_MIGRATE, 2961 }, 2962 2963 { 2964 .name = "memory_pressure", 2965 .read_u64 = cpuset_read_u64, 2966 .private = FILE_MEMORY_PRESSURE, 2967 }, 2968 2969 { 2970 .name = "memory_spread_page", 2971 .read_u64 = cpuset_read_u64, 2972 .write_u64 = cpuset_write_u64, 2973 .private = FILE_SPREAD_PAGE, 2974 }, 2975 2976 { 2977 .name = "memory_spread_slab", 2978 .read_u64 = cpuset_read_u64, 2979 .write_u64 = cpuset_write_u64, 2980 .private = FILE_SPREAD_SLAB, 2981 }, 2982 2983 { 2984 .name = "memory_pressure_enabled", 2985 .flags = CFTYPE_ONLY_ON_ROOT, 2986 .read_u64 = cpuset_read_u64, 2987 .write_u64 = cpuset_write_u64, 2988 .private = FILE_MEMORY_PRESSURE_ENABLED, 2989 }, 2990 2991 { } /* terminate */ 2992 }; 2993 2994 /* 2995 * This is currently a minimal set for the default hierarchy. It can be 2996 * expanded later on by migrating more features and control files from v1. 
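 *
 * On the unified hierarchy these entries are exposed with the subsystem
 * prefix, i.e. as cpuset.cpus, cpuset.mems, cpuset.cpus.effective,
 * cpuset.mems.effective and cpuset.cpus.partition, plus the
 * CFTYPE_DEBUG-gated cpuset.cpus.subpartitions file.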
2997 */ 2998 static struct cftype dfl_files[] = { 2999 { 3000 .name = "cpus", 3001 .seq_show = cpuset_common_seq_show, 3002 .write = cpuset_write_resmask, 3003 .max_write_len = (100U + 6 * NR_CPUS), 3004 .private = FILE_CPULIST, 3005 .flags = CFTYPE_NOT_ON_ROOT, 3006 }, 3007 3008 { 3009 .name = "mems", 3010 .seq_show = cpuset_common_seq_show, 3011 .write = cpuset_write_resmask, 3012 .max_write_len = (100U + 6 * MAX_NUMNODES), 3013 .private = FILE_MEMLIST, 3014 .flags = CFTYPE_NOT_ON_ROOT, 3015 }, 3016 3017 { 3018 .name = "cpus.effective", 3019 .seq_show = cpuset_common_seq_show, 3020 .private = FILE_EFFECTIVE_CPULIST, 3021 }, 3022 3023 { 3024 .name = "mems.effective", 3025 .seq_show = cpuset_common_seq_show, 3026 .private = FILE_EFFECTIVE_MEMLIST, 3027 }, 3028 3029 { 3030 .name = "cpus.partition", 3031 .seq_show = sched_partition_show, 3032 .write = sched_partition_write, 3033 .private = FILE_PARTITION_ROOT, 3034 .flags = CFTYPE_NOT_ON_ROOT, 3035 .file_offset = offsetof(struct cpuset, partition_file), 3036 }, 3037 3038 { 3039 .name = "cpus.subpartitions", 3040 .seq_show = cpuset_common_seq_show, 3041 .private = FILE_SUBPARTS_CPULIST, 3042 .flags = CFTYPE_DEBUG, 3043 }, 3044 3045 { } /* terminate */ 3046 }; 3047 3048 3049 /* 3050 * cpuset_css_alloc - allocate a cpuset css 3051 * cgrp: control group that the new cpuset will be part of 3052 */ 3053 3054 static struct cgroup_subsys_state * 3055 cpuset_css_alloc(struct cgroup_subsys_state *parent_css) 3056 { 3057 struct cpuset *cs; 3058 3059 if (!parent_css) 3060 return &top_cpuset.css; 3061 3062 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 3063 if (!cs) 3064 return ERR_PTR(-ENOMEM); 3065 3066 if (alloc_cpumasks(cs, NULL)) { 3067 kfree(cs); 3068 return ERR_PTR(-ENOMEM); 3069 } 3070 3071 __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 3072 nodes_clear(cs->mems_allowed); 3073 nodes_clear(cs->effective_mems); 3074 fmeter_init(&cs->fmeter); 3075 cs->relax_domain_level = -1; 3076 3077 /* Set CS_MEMORY_MIGRATE for default hierarchy */ 3078 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) 3079 __set_bit(CS_MEMORY_MIGRATE, &cs->flags); 3080 3081 return &cs->css; 3082 } 3083 3084 static int cpuset_css_online(struct cgroup_subsys_state *css) 3085 { 3086 struct cpuset *cs = css_cs(css); 3087 struct cpuset *parent = parent_cs(cs); 3088 struct cpuset *tmp_cs; 3089 struct cgroup_subsys_state *pos_css; 3090 3091 if (!parent) 3092 return 0; 3093 3094 cpus_read_lock(); 3095 percpu_down_write(&cpuset_rwsem); 3096 3097 set_bit(CS_ONLINE, &cs->flags); 3098 if (is_spread_page(parent)) 3099 set_bit(CS_SPREAD_PAGE, &cs->flags); 3100 if (is_spread_slab(parent)) 3101 set_bit(CS_SPREAD_SLAB, &cs->flags); 3102 3103 cpuset_inc(); 3104 3105 spin_lock_irq(&callback_lock); 3106 if (is_in_v2_mode()) { 3107 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 3108 cs->effective_mems = parent->effective_mems; 3109 cs->use_parent_ecpus = true; 3110 parent->child_ecpus_count++; 3111 } 3112 spin_unlock_irq(&callback_lock); 3113 3114 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 3115 goto out_unlock; 3116 3117 /* 3118 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is 3119 * set. This flag handling is implemented in cgroup core for 3120 * historical reasons - the flag may be specified during mount. 3121 * 3122 * Currently, if any sibling cpusets have exclusive cpus or mem, we 3123 * refuse to clone the configuration - thereby refusing the task to 3124 * be entered, and as a result refusing the sys_unshare() or 3125 * clone() which initiated it. 
If this becomes a problem for some 3126 * users who wish to allow that scenario, then this could be 3127 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive 3128 * (and likewise for mems) to the new cgroup. 3129 */ 3130 rcu_read_lock(); 3131 cpuset_for_each_child(tmp_cs, pos_css, parent) { 3132 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { 3133 rcu_read_unlock(); 3134 goto out_unlock; 3135 } 3136 } 3137 rcu_read_unlock(); 3138 3139 spin_lock_irq(&callback_lock); 3140 cs->mems_allowed = parent->mems_allowed; 3141 cs->effective_mems = parent->mems_allowed; 3142 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); 3143 cpumask_copy(cs->effective_cpus, parent->cpus_allowed); 3144 spin_unlock_irq(&callback_lock); 3145 out_unlock: 3146 percpu_up_write(&cpuset_rwsem); 3147 cpus_read_unlock(); 3148 return 0; 3149 } 3150 3151 /* 3152 * If the cpuset being removed has its flag 'sched_load_balance' 3153 * enabled, then simulate turning sched_load_balance off, which 3154 * will call rebuild_sched_domains_locked(). That is not needed 3155 * in the default hierarchy where only changes in partition 3156 * will cause repartitioning. 3157 * 3158 * If the cpuset has the 'sched.partition' flag enabled, simulate 3159 * turning 'sched.partition" off. 3160 */ 3161 3162 static void cpuset_css_offline(struct cgroup_subsys_state *css) 3163 { 3164 struct cpuset *cs = css_cs(css); 3165 3166 cpus_read_lock(); 3167 percpu_down_write(&cpuset_rwsem); 3168 3169 if (is_partition_valid(cs)) 3170 update_prstate(cs, 0); 3171 3172 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && 3173 is_sched_load_balance(cs)) 3174 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 3175 3176 if (cs->use_parent_ecpus) { 3177 struct cpuset *parent = parent_cs(cs); 3178 3179 cs->use_parent_ecpus = false; 3180 parent->child_ecpus_count--; 3181 } 3182 3183 cpuset_dec(); 3184 clear_bit(CS_ONLINE, &cs->flags); 3185 3186 percpu_up_write(&cpuset_rwsem); 3187 cpus_read_unlock(); 3188 } 3189 3190 static void cpuset_css_free(struct cgroup_subsys_state *css) 3191 { 3192 struct cpuset *cs = css_cs(css); 3193 3194 free_cpuset(cs); 3195 } 3196 3197 static void cpuset_bind(struct cgroup_subsys_state *root_css) 3198 { 3199 percpu_down_write(&cpuset_rwsem); 3200 spin_lock_irq(&callback_lock); 3201 3202 if (is_in_v2_mode()) { 3203 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); 3204 top_cpuset.mems_allowed = node_possible_map; 3205 } else { 3206 cpumask_copy(top_cpuset.cpus_allowed, 3207 top_cpuset.effective_cpus); 3208 top_cpuset.mems_allowed = top_cpuset.effective_mems; 3209 } 3210 3211 spin_unlock_irq(&callback_lock); 3212 percpu_up_write(&cpuset_rwsem); 3213 } 3214 3215 /* 3216 * Make sure the new task conform to the current state of its parent, 3217 * which could have been changed by cpuset just after it inherits the 3218 * state from the parent and before it sits on the cgroup's task list. 
3219 */ 3220 static void cpuset_fork(struct task_struct *task) 3221 { 3222 if (task_css_is_root(task, cpuset_cgrp_id)) 3223 return; 3224 3225 set_cpus_allowed_ptr(task, current->cpus_ptr); 3226 task->mems_allowed = current->mems_allowed; 3227 } 3228 3229 struct cgroup_subsys cpuset_cgrp_subsys = { 3230 .css_alloc = cpuset_css_alloc, 3231 .css_online = cpuset_css_online, 3232 .css_offline = cpuset_css_offline, 3233 .css_free = cpuset_css_free, 3234 .can_attach = cpuset_can_attach, 3235 .cancel_attach = cpuset_cancel_attach, 3236 .attach = cpuset_attach, 3237 .post_attach = cpuset_post_attach, 3238 .bind = cpuset_bind, 3239 .fork = cpuset_fork, 3240 .legacy_cftypes = legacy_files, 3241 .dfl_cftypes = dfl_files, 3242 .early_init = true, 3243 .threaded = true, 3244 }; 3245 3246 /** 3247 * cpuset_init - initialize cpusets at system boot 3248 * 3249 * Description: Initialize top_cpuset 3250 **/ 3251 3252 int __init cpuset_init(void) 3253 { 3254 BUG_ON(percpu_init_rwsem(&cpuset_rwsem)); 3255 3256 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); 3257 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); 3258 BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); 3259 3260 cpumask_setall(top_cpuset.cpus_allowed); 3261 nodes_setall(top_cpuset.mems_allowed); 3262 cpumask_setall(top_cpuset.effective_cpus); 3263 nodes_setall(top_cpuset.effective_mems); 3264 3265 fmeter_init(&top_cpuset.fmeter); 3266 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 3267 top_cpuset.relax_domain_level = -1; 3268 3269 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); 3270 3271 return 0; 3272 } 3273 3274 /* 3275 * If CPU and/or memory hotplug handlers, below, unplug any CPUs 3276 * or memory nodes, we need to walk over the cpuset hierarchy, 3277 * removing that CPU or node from all cpusets. If this removes the 3278 * last CPU or node from a cpuset, then move the tasks in the empty 3279 * cpuset to its next-highest non-empty parent. 3280 */ 3281 static void remove_tasks_in_empty_cpuset(struct cpuset *cs) 3282 { 3283 struct cpuset *parent; 3284 3285 /* 3286 * Find its next-highest non-empty parent, (top cpuset 3287 * has online cpus, so can't be empty). 3288 */ 3289 parent = parent_cs(cs); 3290 while (cpumask_empty(parent->cpus_allowed) || 3291 nodes_empty(parent->mems_allowed)) 3292 parent = parent_cs(parent); 3293 3294 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 3295 pr_err("cpuset: failed to transfer tasks out of empty cpuset "); 3296 pr_cont_cgroup_name(cs->css.cgroup); 3297 pr_cont("\n"); 3298 } 3299 } 3300 3301 static void 3302 hotplug_update_tasks_legacy(struct cpuset *cs, 3303 struct cpumask *new_cpus, nodemask_t *new_mems, 3304 bool cpus_updated, bool mems_updated) 3305 { 3306 bool is_empty; 3307 3308 spin_lock_irq(&callback_lock); 3309 cpumask_copy(cs->cpus_allowed, new_cpus); 3310 cpumask_copy(cs->effective_cpus, new_cpus); 3311 cs->mems_allowed = *new_mems; 3312 cs->effective_mems = *new_mems; 3313 spin_unlock_irq(&callback_lock); 3314 3315 /* 3316 * Don't call update_tasks_cpumask() if the cpuset becomes empty, 3317 * as the tasks will be migrated to an ancestor. 
3318 */ 3319 if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) 3320 update_tasks_cpumask(cs); 3321 if (mems_updated && !nodes_empty(cs->mems_allowed)) 3322 update_tasks_nodemask(cs); 3323 3324 is_empty = cpumask_empty(cs->cpus_allowed) || 3325 nodes_empty(cs->mems_allowed); 3326 3327 percpu_up_write(&cpuset_rwsem); 3328 3329 /* 3330 * Move tasks to the nearest ancestor with execution resources, 3331 * This is full cgroup operation which will also call back into 3332 * cpuset. Should be done outside any lock. 3333 */ 3334 if (is_empty) 3335 remove_tasks_in_empty_cpuset(cs); 3336 3337 percpu_down_write(&cpuset_rwsem); 3338 } 3339 3340 static void 3341 hotplug_update_tasks(struct cpuset *cs, 3342 struct cpumask *new_cpus, nodemask_t *new_mems, 3343 bool cpus_updated, bool mems_updated) 3344 { 3345 /* A partition root is allowed to have empty effective cpus */ 3346 if (cpumask_empty(new_cpus) && !is_partition_valid(cs)) 3347 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); 3348 if (nodes_empty(*new_mems)) 3349 *new_mems = parent_cs(cs)->effective_mems; 3350 3351 spin_lock_irq(&callback_lock); 3352 cpumask_copy(cs->effective_cpus, new_cpus); 3353 cs->effective_mems = *new_mems; 3354 spin_unlock_irq(&callback_lock); 3355 3356 if (cpus_updated) 3357 update_tasks_cpumask(cs); 3358 if (mems_updated) 3359 update_tasks_nodemask(cs); 3360 } 3361 3362 static bool force_rebuild; 3363 3364 void cpuset_force_rebuild(void) 3365 { 3366 force_rebuild = true; 3367 } 3368 3369 /** 3370 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug 3371 * @cs: cpuset in interest 3372 * @tmp: the tmpmasks structure pointer 3373 * 3374 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone 3375 * offline, update @cs accordingly. If @cs ends up with no CPU or memory, 3376 * all its tasks are moved to the nearest ancestor with both resources. 3377 */ 3378 static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) 3379 { 3380 static cpumask_t new_cpus; 3381 static nodemask_t new_mems; 3382 bool cpus_updated; 3383 bool mems_updated; 3384 struct cpuset *parent; 3385 retry: 3386 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); 3387 3388 percpu_down_write(&cpuset_rwsem); 3389 3390 /* 3391 * We have raced with task attaching. We wait until attaching 3392 * is finished, so we won't attach a task to an empty cpuset. 3393 */ 3394 if (cs->attach_in_progress) { 3395 percpu_up_write(&cpuset_rwsem); 3396 goto retry; 3397 } 3398 3399 parent = parent_cs(cs); 3400 compute_effective_cpumask(&new_cpus, cs, parent); 3401 nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); 3402 3403 if (cs->nr_subparts_cpus) 3404 /* 3405 * Make sure that CPUs allocated to child partitions 3406 * do not show up in effective_cpus. 3407 */ 3408 cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus); 3409 3410 if (!tmp || !cs->partition_root_state) 3411 goto update_tasks; 3412 3413 /* 3414 * In the unlikely event that a partition root has empty 3415 * effective_cpus with tasks, we will have to invalidate child 3416 * partitions, if present, by setting nr_subparts_cpus to 0 to 3417 * reclaim their cpus. 
3418 */ 3419 if (cs->nr_subparts_cpus && is_partition_valid(cs) && 3420 cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) { 3421 spin_lock_irq(&callback_lock); 3422 cs->nr_subparts_cpus = 0; 3423 cpumask_clear(cs->subparts_cpus); 3424 spin_unlock_irq(&callback_lock); 3425 compute_effective_cpumask(&new_cpus, cs, parent); 3426 } 3427 3428 /* 3429 * Force the partition to become invalid if either one of 3430 * the following conditions hold: 3431 * 1) empty effective cpus but not valid empty partition. 3432 * 2) parent is invalid or doesn't grant any cpus to child 3433 * partitions. 3434 */ 3435 if (is_partition_valid(cs) && (!parent->nr_subparts_cpus || 3436 (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) { 3437 int old_prs, parent_prs; 3438 3439 update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp); 3440 if (cs->nr_subparts_cpus) { 3441 spin_lock_irq(&callback_lock); 3442 cs->nr_subparts_cpus = 0; 3443 cpumask_clear(cs->subparts_cpus); 3444 spin_unlock_irq(&callback_lock); 3445 compute_effective_cpumask(&new_cpus, cs, parent); 3446 } 3447 3448 old_prs = cs->partition_root_state; 3449 parent_prs = parent->partition_root_state; 3450 if (is_partition_valid(cs)) { 3451 spin_lock_irq(&callback_lock); 3452 make_partition_invalid(cs); 3453 spin_unlock_irq(&callback_lock); 3454 if (is_prs_invalid(parent_prs)) 3455 WRITE_ONCE(cs->prs_err, PERR_INVPARENT); 3456 else if (!parent_prs) 3457 WRITE_ONCE(cs->prs_err, PERR_NOTPART); 3458 else 3459 WRITE_ONCE(cs->prs_err, PERR_HOTPLUG); 3460 notify_partition_change(cs, old_prs); 3461 } 3462 cpuset_force_rebuild(); 3463 } 3464 3465 /* 3466 * On the other hand, an invalid partition root may be transitioned 3467 * back to a regular one. 3468 */ 3469 else if (is_partition_valid(parent) && is_partition_invalid(cs)) { 3470 update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp); 3471 if (is_partition_valid(cs)) 3472 cpuset_force_rebuild(); 3473 } 3474 3475 update_tasks: 3476 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); 3477 mems_updated = !nodes_equal(new_mems, cs->effective_mems); 3478 3479 if (mems_updated) 3480 check_insane_mems_config(&new_mems); 3481 3482 if (is_in_v2_mode()) 3483 hotplug_update_tasks(cs, &new_cpus, &new_mems, 3484 cpus_updated, mems_updated); 3485 else 3486 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, 3487 cpus_updated, mems_updated); 3488 3489 percpu_up_write(&cpuset_rwsem); 3490 } 3491 3492 /** 3493 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset 3494 * 3495 * This function is called after either CPU or memory configuration has 3496 * changed and updates cpuset accordingly. The top_cpuset is always 3497 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in 3498 * order to make cpusets transparent (of no affect) on systems that are 3499 * actively using CPU hotplug but making no active use of cpusets. 3500 * 3501 * Non-root cpusets are only affected by offlining. If any CPUs or memory 3502 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on 3503 * all descendants. 3504 * 3505 * Note that CPU offlining during suspend is ignored. We don't modify 3506 * cpusets across suspend/resume cycles at all. 
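 *
 * The flow below, in short: with cpuset_rwsem held, resync top_cpuset's
 * effective (and, on the legacy hierarchy, configured) cpus and mems
 * with cpu_active_mask and node_states[N_MEMORY], then walk the
 * descendants and let cpuset_hotplug_update_tasks() adjust each affected
 * cpuset, and finally rebuild the sched domains if the cpu side changed
 * or a rebuild was forced.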
3507 */ 3508 static void cpuset_hotplug_workfn(struct work_struct *work) 3509 { 3510 static cpumask_t new_cpus; 3511 static nodemask_t new_mems; 3512 bool cpus_updated, mems_updated; 3513 bool on_dfl = is_in_v2_mode(); 3514 struct tmpmasks tmp, *ptmp = NULL; 3515 3516 if (on_dfl && !alloc_cpumasks(NULL, &tmp)) 3517 ptmp = &tmp; 3518 3519 percpu_down_write(&cpuset_rwsem); 3520 3521 /* fetch the available cpus/mems and find out which changed how */ 3522 cpumask_copy(&new_cpus, cpu_active_mask); 3523 new_mems = node_states[N_MEMORY]; 3524 3525 /* 3526 * If subparts_cpus is populated, it is likely that the check below 3527 * will produce a false positive on cpus_updated when the cpu list 3528 * isn't changed. It is extra work, but it is better to be safe. 3529 */ 3530 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); 3531 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); 3532 3533 /* 3534 * In the rare case that hotplug removes all the cpus in subparts_cpus, 3535 * we assumed that cpus are updated. 3536 */ 3537 if (!cpus_updated && top_cpuset.nr_subparts_cpus) 3538 cpus_updated = true; 3539 3540 /* synchronize cpus_allowed to cpu_active_mask */ 3541 if (cpus_updated) { 3542 spin_lock_irq(&callback_lock); 3543 if (!on_dfl) 3544 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); 3545 /* 3546 * Make sure that CPUs allocated to child partitions 3547 * do not show up in effective_cpus. If no CPU is left, 3548 * we clear the subparts_cpus & let the child partitions 3549 * fight for the CPUs again. 3550 */ 3551 if (top_cpuset.nr_subparts_cpus) { 3552 if (cpumask_subset(&new_cpus, 3553 top_cpuset.subparts_cpus)) { 3554 top_cpuset.nr_subparts_cpus = 0; 3555 cpumask_clear(top_cpuset.subparts_cpus); 3556 } else { 3557 cpumask_andnot(&new_cpus, &new_cpus, 3558 top_cpuset.subparts_cpus); 3559 } 3560 } 3561 cpumask_copy(top_cpuset.effective_cpus, &new_cpus); 3562 spin_unlock_irq(&callback_lock); 3563 /* we don't mess with cpumasks of tasks in top_cpuset */ 3564 } 3565 3566 /* synchronize mems_allowed to N_MEMORY */ 3567 if (mems_updated) { 3568 spin_lock_irq(&callback_lock); 3569 if (!on_dfl) 3570 top_cpuset.mems_allowed = new_mems; 3571 top_cpuset.effective_mems = new_mems; 3572 spin_unlock_irq(&callback_lock); 3573 update_tasks_nodemask(&top_cpuset); 3574 } 3575 3576 percpu_up_write(&cpuset_rwsem); 3577 3578 /* if cpus or mems changed, we need to propagate to descendants */ 3579 if (cpus_updated || mems_updated) { 3580 struct cpuset *cs; 3581 struct cgroup_subsys_state *pos_css; 3582 3583 rcu_read_lock(); 3584 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { 3585 if (cs == &top_cpuset || !css_tryget_online(&cs->css)) 3586 continue; 3587 rcu_read_unlock(); 3588 3589 cpuset_hotplug_update_tasks(cs, ptmp); 3590 3591 rcu_read_lock(); 3592 css_put(&cs->css); 3593 } 3594 rcu_read_unlock(); 3595 } 3596 3597 /* rebuild sched domains if cpus_allowed has changed */ 3598 if (cpus_updated || force_rebuild) { 3599 force_rebuild = false; 3600 rebuild_sched_domains(); 3601 } 3602 3603 free_cpumasks(NULL, ptmp); 3604 } 3605 3606 void cpuset_update_active_cpus(void) 3607 { 3608 /* 3609 * We're inside cpu hotplug critical region which usually nests 3610 * inside cgroup synchronization. Bounce actual hotplug processing 3611 * to a work item to avoid reverse locking order. 
3612 */ 3613 schedule_work(&cpuset_hotplug_work); 3614 } 3615 3616 void cpuset_wait_for_hotplug(void) 3617 { 3618 flush_work(&cpuset_hotplug_work); 3619 } 3620 3621 /* 3622 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. 3623 * Call this routine anytime after node_states[N_MEMORY] changes. 3624 * See cpuset_update_active_cpus() for CPU hotplug handling. 3625 */ 3626 static int cpuset_track_online_nodes(struct notifier_block *self, 3627 unsigned long action, void *arg) 3628 { 3629 schedule_work(&cpuset_hotplug_work); 3630 return NOTIFY_OK; 3631 } 3632 3633 static struct notifier_block cpuset_track_online_nodes_nb = { 3634 .notifier_call = cpuset_track_online_nodes, 3635 .priority = 10, /* ??! */ 3636 }; 3637 3638 /** 3639 * cpuset_init_smp - initialize cpus_allowed 3640 * 3641 * Description: Finish top cpuset after cpu, node maps are initialized 3642 */ 3643 void __init cpuset_init_smp(void) 3644 { 3645 /* 3646 * cpus_allowd/mems_allowed set to v2 values in the initial 3647 * cpuset_bind() call will be reset to v1 values in another 3648 * cpuset_bind() call when v1 cpuset is mounted. 3649 */ 3650 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; 3651 3652 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); 3653 top_cpuset.effective_mems = node_states[N_MEMORY]; 3654 3655 register_hotmemory_notifier(&cpuset_track_online_nodes_nb); 3656 3657 cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); 3658 BUG_ON(!cpuset_migrate_mm_wq); 3659 } 3660 3661 /** 3662 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 3663 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 3664 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. 3665 * 3666 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset 3667 * attached to the specified @tsk. Guaranteed to return some non-empty 3668 * subset of cpu_online_mask, even if this means going outside the 3669 * tasks cpuset. 3670 **/ 3671 3672 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 3673 { 3674 unsigned long flags; 3675 3676 spin_lock_irqsave(&callback_lock, flags); 3677 guarantee_online_cpus(tsk, pmask); 3678 spin_unlock_irqrestore(&callback_lock, flags); 3679 } 3680 3681 /** 3682 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. 3683 * @tsk: pointer to task_struct with which the scheduler is struggling 3684 * 3685 * Description: In the case that the scheduler cannot find an allowed cpu in 3686 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy 3687 * mode however, this value is the same as task_cs(tsk)->effective_cpus, 3688 * which will not contain a sane cpumask during cases such as cpu hotplugging. 3689 * This is the absolute last resort for the scheduler and it is only used if 3690 * _every_ other avenue has been traveled. 3691 * 3692 * Returns true if the affinity of @tsk was changed, false otherwise. 3693 **/ 3694 3695 bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) 3696 { 3697 const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); 3698 const struct cpumask *cs_mask; 3699 bool changed = false; 3700 3701 rcu_read_lock(); 3702 cs_mask = task_cs(tsk)->cpus_allowed; 3703 if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { 3704 do_set_cpus_allowed(tsk, cs_mask); 3705 changed = true; 3706 } 3707 rcu_read_unlock(); 3708 3709 /* 3710 * We own tsk->cpus_allowed, nobody can change it under us. 
3711 * 3712 * But we used cs && cs->cpus_allowed lockless and thus can 3713 * race with cgroup_attach_task() or update_cpumask() and get 3714 * the wrong tsk->cpus_allowed. However, both cases imply the 3715 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() 3716 * which takes task_rq_lock(). 3717 * 3718 * If we are called after it dropped the lock we must see all 3719 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary 3720 * set any mask even if it is not right from task_cs() pov, 3721 * the pending set_cpus_allowed_ptr() will fix things. 3722 * 3723 * select_fallback_rq() will fix things ups and set cpu_possible_mask 3724 * if required. 3725 */ 3726 return changed; 3727 } 3728 3729 void __init cpuset_init_current_mems_allowed(void) 3730 { 3731 nodes_setall(current->mems_allowed); 3732 } 3733 3734 /** 3735 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. 3736 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. 3737 * 3738 * Description: Returns the nodemask_t mems_allowed of the cpuset 3739 * attached to the specified @tsk. Guaranteed to return some non-empty 3740 * subset of node_states[N_MEMORY], even if this means going outside the 3741 * tasks cpuset. 3742 **/ 3743 3744 nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 3745 { 3746 nodemask_t mask; 3747 unsigned long flags; 3748 3749 spin_lock_irqsave(&callback_lock, flags); 3750 rcu_read_lock(); 3751 guarantee_online_mems(task_cs(tsk), &mask); 3752 rcu_read_unlock(); 3753 spin_unlock_irqrestore(&callback_lock, flags); 3754 3755 return mask; 3756 } 3757 3758 /** 3759 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed 3760 * @nodemask: the nodemask to be checked 3761 * 3762 * Are any of the nodes in the nodemask allowed in current->mems_allowed? 3763 */ 3764 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) 3765 { 3766 return nodes_intersects(*nodemask, current->mems_allowed); 3767 } 3768 3769 /* 3770 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or 3771 * mem_hardwall ancestor to the specified cpuset. Call holding 3772 * callback_lock. If no ancestor is mem_exclusive or mem_hardwall 3773 * (an unusual configuration), then returns the root cpuset. 3774 */ 3775 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) 3776 { 3777 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) 3778 cs = parent_cs(cs); 3779 return cs; 3780 } 3781 3782 /* 3783 * __cpuset_node_allowed - Can we allocate on a memory node? 3784 * @node: is this an allowed node? 3785 * @gfp_mask: memory allocation flags 3786 * 3787 * If we're in interrupt, yes, we can always allocate. If @node is set in 3788 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this 3789 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, 3790 * yes. If current has access to memory reserves as an oom victim, yes. 3791 * Otherwise, no. 3792 * 3793 * GFP_USER allocations are marked with the __GFP_HARDWALL bit, 3794 * and do not allow allocations outside the current tasks cpuset 3795 * unless the task has been OOM killed. 3796 * GFP_KERNEL allocations are not so marked, so can escape to the 3797 * nearest enclosing hardwalled ancestor cpuset. 3798 * 3799 * Scanning up parent cpusets requires callback_lock. 
/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
		cs = parent_cs(cs);
	return cs;
}

/*
 * __cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If @node is set in
 * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
 * yes.  If current has access to memory reserves as an oom victim, yes.
 * Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current task's cpuset
 * unless the task has been OOM killed.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current task's mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_lock.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags.  That logic and the checks below have the combined
 * effect that:
 *	in_interrupt - any node ok (current task context irrelevant)
 *	GFP_ATOMIC   - any node ok
 *	tsk_is_oom_victim - any node ok
 *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *	GFP_USER     - only nodes in current task's mems_allowed ok.
 */
bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
	struct cpuset *cs;		/* current cpuset ancestors */
	bool allowed;			/* is allocation in zone z allowed? */
	unsigned long flags;

	if (in_interrupt())
		return true;
	if (node_isset(node, current->mems_allowed))
		return true;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(tsk_is_oom_victim(current)))
		return true;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return false;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return true;

	/* Not hardwall and node outside mems_allowed: scan up cpusets */
	spin_lock_irqsave(&callback_lock, flags);

	rcu_read_lock();
	cs = nearest_hardwall_ancestor(task_cs(current));
	allowed = node_isset(node, cs->mems_allowed);
	rcu_read_unlock();

	spin_unlock_irqrestore(&callback_lock, flags);
	return allowed;
}
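/*
 * Example (editorial sketch, not part of the original source): an
 * allocator-style caller that has already picked a candidate node could
 * apply the policy above roughly as follows.  "nid" and the
 * "try_next_node" label are hypothetical; in-tree callers normally go
 * through the wrappers in <linux/cpuset.h>, which skip the check entirely
 * when no cpusets are in use.
 *
 *	if (!__cpuset_node_allowed(nid, GFP_KERNEL))
 *		goto try_next_node;
 *
 * With GFP_KERNEL (no __GFP_HARDWALL), the check falls through to the
 * nearest hardwalled ancestor's mems_allowed, exactly as the table in the
 * comment above describes.
 */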
/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the task's mems_allowed nodes.
 *
 * We don't have to worry about the returned node being offline
 * because "it can't happen", and even if it did, it would be ok.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online.  So it
 * should not be possible for the following code to return an
 * offline node.  But if it did, that would be ok, as this routine
 * is not returning the node where the allocation must be, only
 * the node where the search should start.  The zonelist passed to
 * __alloc_pages() will include all nodes.  If the slab allocator
 * is passed an offline node, it will fall back to the local node.
 * See kmem_cache_alloc_node().
 */

static int cpuset_spread_node(int *rotor)
{
	return *rotor = next_node_in(*rotor, current->mems_allowed);
}

int cpuset_mem_spread_node(void)
{
	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
		current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}

int cpuset_slab_spread_node(void)
{
	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
		current->cpuset_slab_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}

EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
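/*
 * Example (editorial sketch, not part of the original source): a page
 * cache style allocation that honours memory spreading would consult
 * cpuset_do_page_mem_spread() (a <linux/cpuset.h> helper testing
 * PF_SPREAD_PAGE) and start the node search at the rotor value returned
 * here.  This is a simplified illustration, not a copy of any in-tree
 * caller; "gfp_mask" is assumed to come from the surrounding context.
 *
 *	struct page *page;
 *	int nid = numa_node_id();
 *
 *	if (cpuset_do_page_mem_spread())
 *		nid = cpuset_mem_spread_node();
 *	page = __alloc_pages_node(nid, gfp_mask, 0);
 *
 * Because the zonelist for "nid" includes all nodes, an occasionally
 * stale or offline starting node only changes where the search begins,
 * not whether it can succeed.
 */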
/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 **/

int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}

/**
 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
 *
 * Description: Prints current's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */
void cpuset_print_current_mems_allowed(void)
{
	struct cgroup *cgrp;

	rcu_read_lock();

	cgrp = task_cs(current)->css.cgroup;
	pr_cont(",cpuset=");
	pr_cont_cgroup_name(cgrp);
	pr_cont(",mems_allowed=%*pbl",
		nodemask_pr_args(&current->mems_allowed));

	rcu_read_unlock();
}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */

int cpuset_memory_pressure_enabled __read_mostly;

/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 */

void __cpuset_memory_pressure_bump(void)
{
	rcu_read_lock();
	fmeter_markevent(&task_cs(current)->fmeter);
	rcu_read_unlock();
}

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print task's cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    and we take cpuset_rwsem, keeping cpuset_attach() from changing it
 *    anyway.
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	css = task_get_css(tsk, cpuset_cgrp_id);
	retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
				current->nsproxy->cgroup_ns);
	css_put(css);
	if (retval >= PATH_MAX)
		retval = -ENAMETOOLONG;
	if (retval < 0)
		goto out_free;
	seq_puts(m, buf);
	seq_putc(m, '\n');
	retval = 0;
out_free:
	kfree(buf);
out:
	return retval;
}
#endif /* CONFIG_PROC_PID_CPUSET */

/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
	seq_printf(m, "Mems_allowed:\t%*pb\n",
		   nodemask_pr_args(&task->mems_allowed));
	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
		   nodemask_pr_args(&task->mems_allowed));
}
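/*
 * Example (editorial note, not part of the original source): for a task
 * attached to a cpuset mounted at "/foo", a read of /proc/<pid>/cpuset
 * produced by proc_cpuset_show() yields the single line "/foo", while
 * cpuset_task_status_allowed() adds lines of the following shape to
 * /proc/<pid>/status.  The bitmap width depends on MAX_NUMNODES and the
 * values shown are made up:
 *
 *	Mems_allowed:		00000003
 *	Mems_allowed_list:	0-1
 */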