/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>

DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/* See "Frequency meter" comments, below. */

struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time64_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * On default hierarchy:
	 *
	 * The user-configured masks can only be changed by writing to
	 * cpuset.cpus and cpuset.mems, and won't be limited by the
	 * parent masks.
	 *
	 * The effective masks are the real masks that apply to the tasks
	 * in the cpuset. They may be changed if the configured masks are
	 * changed or hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask,
	 * and if it ends up empty, it will inherit the parent's mask.
	 *
	 *
	 * On legacy hierarchy:
	 *
	 * The user-configured masks are always the same as the effective masks.
	 */
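
	/*
	 * Illustrative example (not from the original source): on the
	 * default hierarchy, if the parent's effective_cpus is 0-3 and a
	 * child writes "2-5" to cpuset.cpus, the child's effective_cpus
	 * becomes 2-3 (the intersection).  If the child instead writes
	 * "4-7", the intersection is empty and the child falls back to
	 * the parent's effective mask, 0-3.
	 */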

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * These are the old Memory Nodes that tasks took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have tasks' nodemask updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset. Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;
};

static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct cpuset, css) : NULL;
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return css_cs(task_css(task, cpuset_cgrp_id));
}

static inline struct cpuset *parent_cs(struct cpuset *cs)
{
	return css_cs(cs->css.parent);
}

#ifdef CONFIG_NUMA
static inline bool task_has_mempolicy(struct task_struct *task)
{
	return task->mempolicy;
}
#else
static inline bool task_has_mempolicy(struct task_struct *task)
{
	return false;
}
#endif


/* bits in struct cpuset flags field */
typedef enum {
	CS_ONLINE,
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}

static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
		  (1 << CS_MEM_EXCLUSIVE)),
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
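
/*
 * Illustrative usage sketch (not part of the original file): callers in
 * this file iterate children under the RCU read lock, e.g.
 *
 *	struct cpuset *child;
 *	struct cgroup_subsys_state *pos;
 *
 *	rcu_read_lock();
 *	cpuset_for_each_child(child, pos, parent)
 *		do_something(child);	// do_something() is hypothetical
 *	rcu_read_unlock();
 *
 * The same pattern applies to cpuset_for_each_descendant_pre() below.
 */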

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk the descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip a subtree.  @root_cs is included in the
 * iteration and is the first node to be visited.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)	\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock. We also require taking task_lock() when dereferencing a
 * task's cpuset pointer. See "The task_lock() exception", at the end of this
 * comment.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
 * is the only task able to also acquire callback_lock and be able to
 * modify cpusets.  It can perform various checks on the cpuset structure
 * first, knowing nothing will change.  It can also allocate memory while
 * just holding cpuset_mutex.  While it is performing these checks, various
 * callback routines can briefly acquire callback_lock to query cpusets.
 * Once it is ready to make the changes, it takes callback_lock, blocking
 * everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by another task, so we use alloc_lock in the task_struct to protect
 * them.
 *
 * The cpuset_common_file_read() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */

static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_SPINLOCK(callback_lock);

static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users.  If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead.
 */
static struct dentry *cpuset_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name, void *data)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	struct dentry *ret = ERR_PTR(-ENODEV);
	if (cgroup_fs) {
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->mount(cgroup_fs, flags,
					   unused_dev_name, mountopts);
		put_filesystem(cgroup_fs);
	}
	return ret;
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};

/*
 * Return in pmask the portion of a cpuset's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
		cs = parent_cs(cs);
		if (unlikely(!cs)) {
			/*
			 * The top cpuset doesn't have any online cpu as a
			 * consequence of a race between cpuset_hotplug_work
			 * and cpu hotplug notifier.  But we know the top
			 * cpuset's effective_cpus is on its way to be
			 * identical to cpu_online_mask.
			 */
			cpumask_copy(pmask, cpu_online_mask);
			return;
		}
	}
	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
		goto free_cs;
	if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
		goto free_cpus;

	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
	return trial;

free_cpus:
	free_cpumask_var(trial->cpus_allowed);
free_cs:
	kfree(trial);
	return NULL;
}

/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 */
static void free_trial_cpuset(struct cpuset *trial)
{
	free_cpumask_var(trial->effective_cpus);
	free_cpumask_var(trial->cpus_allowed);
	kfree(trial);
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of a bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */

static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	rcu_read_lock();

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* Remaining checks don't apply to root cpuset */
	ret = 0;
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
	    !is_cpuset_subset(trial, par))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap.
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			goto out;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks.
	 */
	ret = -EBUSY;
	if (is_cpu_exclusive(cur) &&
	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
				       trial->cpus_allowed))
		goto out;

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}

/* Must be called with cpuset_mutex held. */
static inline int nr_cpusets(void)
{
	/* jump label reference count + the top-level cpuset */
	return static_key_count(&cpusets_enabled_key.key) + 1;
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to the
 * kernel/sched/core.c routine partition_sched_domains(), which will
 * rebuild the scheduler's load balancing domains (sched domains) as
 * specified by that partial partition.
 *
 * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cpuset_mutex held.
 *
 * The three key local variables below are:
 *    q  - a linked-list queue of cpuset pointers, used to implement a
 *	   top-down scan of all cpusets.  This scan loads a pointer
 *	   to each cpuset marked is_sched_load_balance into the
 *	   array 'csa'.  For our purposes, rebuilding the scheduler's
 *	   sched domains, we can ignore !is_sched_load_balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *	   that need to be load balanced, for convenient iterative
 *	   access by the subsequent code that finds the best partition,
 *	   i.e the set of domains (subsets) of CPUs such that the
 *	   cpus_allowed of every cpuset marked is_sched_load_balance
 *	   is a subset of one of these domains, while there are as
 *	   many such domains as possible, each as small as possible.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *	   the kernel/sched/core.c routine partition_sched_domains() in a
 *	   convenient format, that can be easily compared to the prior
 *	   value to determine what partition elements (sched domains)
 *	   were changed (added or removed.)
 *
 * Finding the best partition (set of domains):
 *	The triple nested loops below over i, j, k scan over the
 *	load balanced cpusets (using the array of cpuset pointers in
 *	csa[]) looking for pairs of cpusets that have overlapping
 *	cpus_allowed but don't have the same 'pn' partition number,
 *	and puts them in the same partition number.  It keeps looping
 *	on the 'restart' label until it can no longer find any such
 *	pairs.
 *
 *	The union of the cpus_allowed masks from the set of
 *	all cpusets having the same 'pn' value then form the one
 *	element of the partition (one sched domain) to be passed to
 *	partition_sched_domains().
 */
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	struct cpuset *cp;	/* scans q */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	cpumask_var_t non_isolated_cpus;  /* load balanced CPUs */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
		goto done;
	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);

	/* Special case for the 99% of systems with one, full, sched domain */
	if (is_sched_load_balance(&top_cpuset)) {
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_and(doms[0], top_cpuset.effective_cpus,
			    non_isolated_cpus);

		goto done;
	}

	csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		if (cp == &top_cpuset)
			continue;
		/*
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing.  The former is obvious.  The
		 * latter: All child cpusets contain a subset of the
		 * parent's cpus, so just skip them, and then we call
		 * update_domain_attr_tree() to calc relax_domain_level of
		 * the corresponding sched domain.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
		      cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
			continue;

		if (is_sched_load_balance(cp))
			csa[csn++] = cp;

		/* skip @cp's subtree */
		pos_css = css_rightmost_descendant(pos_css);
	}
	rcu_read_unlock();

	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less element */
				goto restart;
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		struct cpumask *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		dp = doms[nslot];

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
					nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		cpumask_clear(dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpumask_or(dp, dp, b->effective_cpus);
				cpumask_and(dp, dp, non_isolated_cpus);
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	free_cpumask_var(non_isolated_cpus);
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}
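
/*
 * Illustrative example (not from the original source): suppose csa[]
 * holds three load-balanced cpusets A (cpus 0-1), B (cpus 1-2) and
 * C (cpus 4-5), initially with pn = 0, 1, 2 and ndoms = 3.  A and B
 * overlap, so B's pn is merged into A's and ndoms drops to 2.  C
 * overlaps neither, so the loop terminates with two partitions,
 * {A, B} covering cpus 0-2 and {C} covering cpus 4-5, and two sched
 * domains are generated.
 */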

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held.  Takes get_online_cpus().
 */
static void rebuild_sched_domains_locked(void)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	lockdep_assert_held(&cpuset_mutex);
	get_online_cpus();

	/*
	 * We have raced with CPU hotplug. Don't do anything to avoid
	 * passing doms with an offlined cpu to partition_sched_domains().
	 * Anyway, the hotplug work item will rebuild the sched domains.
	 */
	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
		goto out;

	/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);
out:
	put_online_cpus();
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

void rebuild_sched_domains(void)
{
	mutex_lock(&cpuset_mutex);
	rebuild_sched_domains_locked();
	mutex_unlock(&cpuset_mutex);
}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_cpumask(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, &it);
	while ((task = css_task_iter_next(&it)))
		set_cpus_allowed_ptr(task, cs->effective_cpus);
	css_task_iter_end(&it);
}

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_cpus: temp variable for calculating new effective_cpus
 *
 * When the configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;
	bool need_rebuild_sched_domains = false;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some CPUs.
		 */
		if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
		    cpumask_empty(new_cpus))
			cpumask_copy(new_cpus, parent->effective_cpus);

		/* Skip the whole subtree if the cpumask remains the same. */
		if (cpumask_equal(new_cpus, cp->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		spin_lock_irq(&callback_lock);
		cpumask_copy(cp->effective_cpus, new_cpus);
		spin_unlock_irq(&callback_lock);

		WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

		update_tasks_cpumask(cp);

		/*
		 * If the effective cpumask of any non-empty cpuset is changed,
		 * we need to rebuild sched domains.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    is_sched_load_balance(cp))
			need_rebuild_sched_domains = true;

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();

	if (need_rebuild_sched_domains)
		rebuild_sched_domains_locked();
}

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{
	int retval;

	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
	if (!*buf) {
		cpumask_clear(trialcs->cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs->cpus_allowed);
		if (retval < 0)
			return retval;

		if (!cpumask_subset(trialcs->cpus_allowed,
				    top_cpuset.cpus_allowed))
			return -EINVAL;
	}

	/* Nothing to do if the cpus didn't change */
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;

	retval = validate_change(cs, trialcs);
	if (retval < 0)
		return retval;

	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
	spin_unlock_irq(&callback_lock);

	/* use trialcs->cpus_allowed as a temp variable */
	update_cpumasks_hier(cs, trialcs->cpus_allowed);
	return 0;
}

/*
 * Migrate memory region from one set of nodes to another.  This is
 * performed asynchronously as it can be called from process migration path
 * holding locks involved in process management.  All mm migrations are
 * performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
 */

struct cpuset_migrate_mm_work {
	struct work_struct	work;
	struct mm_struct	*mm;
	nodemask_t		from;
	nodemask_t		to;
};

static void cpuset_migrate_mm_workfn(struct work_struct *work)
{
	struct cpuset_migrate_mm_work *mwork =
		container_of(work, struct cpuset_migrate_mm_work, work);

	/* on a wq worker, no need to worry about %current's mems_allowed */
	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
	mmput(mwork->mm);
	kfree(mwork);
}

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct cpuset_migrate_mm_work *mwork;

	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
	if (mwork) {
		mwork->mm = mm;
		mwork->from = *from;
		mwork->to = *to;
		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
		queue_work(cpuset_migrate_mm_wq, &mwork->work);
	} else {
		mmput(mm);
	}
}

static void cpuset_post_attach(void)
{
	flush_workqueue(cpuset_migrate_mm_wq);
}
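
/*
 * Illustrative note (not from the original source): the write side below
 * pairs with readers such as the page allocator, which are expected to
 * sample the seqcount, attempt the allocation, and retry if the nodemask
 * changed underneath them, roughly:
 *
 *	unsigned int seq;
 *	do {
 *		seq = read_mems_allowed_begin();
 *		// attempt allocation against current->mems_allowed
 *	} while (read_mems_allowed_retry(seq));
 *
 * read_mems_allowed_begin()/read_mems_allowed_retry() are assumed here to
 * be the usual wrappers around tsk->mems_allowed_seq.
 */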

/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set to
 *
 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
 * and rebind an eventual task's mempolicy.  If the task is allocating in
 * parallel, it might temporarily see an empty intersection, which results in
 * a seqlock check and retry before OOM or allocation failure.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
	task_lock(tsk);

	local_irq_disable();
	write_seqcount_begin(&tsk->mems_allowed_seq);

	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems);
	tsk->mems_allowed = *newmems;

	write_seqcount_end(&tsk->mems_allowed_seq);
	local_irq_enable();

	task_unlock(tsk);
}

static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_nodemask(struct cpuset *cs)
{
	static nodemask_t newmems;	/* protected by cpuset_mutex */
	struct css_task_iter it;
	struct task_struct *task;

	cpuset_being_rebound = cs;	/* causes mpol_dup() rebind */

	guarantee_online_mems(cs, &newmems);

	/*
	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
	 * take while holding tasklist_lock.  Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too.  Because we still hold
	 * the global cpuset_mutex, we know that no other rebind effort
	 * will be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Also migrate pages in each mm to new nodes.
	 */
	css_task_iter_start(&cs->css, &it);
	while ((task = css_task_iter_next(&it))) {
		struct mm_struct *mm;
		bool migrate;

		cpuset_change_task_nodemask(task, &newmems);

		mm = get_task_mm(task);
		if (!mm)
			continue;

		migrate = is_memory_migrate(cs);

		mpol_rebind_mm(mm, &cs->mems_allowed);
		if (migrate)
			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
		else
			mmput(mm);
	}
	css_task_iter_end(&it);

	/*
	 * All the tasks' nodemasks have been updated, update
	 * cs->old_mems_allowed.
	 */
	cs->old_mems_allowed = newmems;

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	cpuset_being_rebound = NULL;
}

/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When the configured nodemask is changed, the effective nodemasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some MEMs.
		 */
		if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
		    nodes_empty(*new_mems))
			*new_mems = parent->effective_mems;

		/* Skip the whole subtree if the nodemask remains the same. */
		if (nodes_equal(*new_mems, cp->effective_mems)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		spin_lock_irq(&callback_lock);
		cp->effective_mems = *new_mems;
		spin_unlock_irq(&callback_lock);

		WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
			!nodes_equal(cp->mems_allowed, cp->effective_mems));

		update_tasks_nodemask(cp);

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();
}

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpuset's mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind the task's mempolicy and any vma
 * mempolicies and, if the cpuset is marked 'memory_migrate',
 * migrate the task's pages to the new memory.
 *
 * Call with cpuset_mutex held.  May take callback_lock during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such task's mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpuset's new mems_allowed.
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
			   const char *buf)
{
	int retval;

	/*
	 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
	 * it's read-only
	 */
	if (cs == &top_cpuset) {
		retval = -EACCES;
		goto done;
	}

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
	if (!*buf) {
		nodes_clear(trialcs->mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs->mems_allowed);
		if (retval < 0)
			goto done;

		if (!nodes_subset(trialcs->mems_allowed,
				  top_cpuset.mems_allowed)) {
			retval = -EINVAL;
			goto done;
		}
	}

	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		goto done;

	spin_lock_irq(&callback_lock);
	cs->mems_allowed = trialcs->mems_allowed;
	spin_unlock_irq(&callback_lock);

	/* use trialcs->mems_allowed as a temp variable */
	update_nodemasks_hier(cs, &trialcs->mems_allowed);
done:
	return retval;
}

int current_cpuset_is_being_rebound(void)
{
	int ret;

	rcu_read_lock();
	ret = task_cs(current) == cpuset_being_rebound;
	rcu_read_unlock();

	return ret;
}

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val >= sched_domain_level_max)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			rebuild_sched_domains_locked();
	}

	return 0;
}

/**
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
static void update_tasks_flags(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, &it);
	while ((task = css_task_iter_next(&it)))
		cpuset_update_task_spread_flag(cs, task);
	css_task_iter_end(&it);
}

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cpuset_mutex held.
 */

static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset *trialcs;
	int balance_flag_changed;
	int spread_flag_changed;
	int err;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs)
		return -ENOMEM;

	if (turning_on)
		set_bit(bit, &trialcs->flags);
	else
		clear_bit(bit, &trialcs->flags);

	err = validate_change(cs, trialcs);
	if (err < 0)
		goto out;

	balance_flag_changed = (is_sched_load_balance(cs) !=
				is_sched_load_balance(trialcs));

	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
			|| (is_spread_page(cs) != is_spread_page(trialcs)));

	spin_lock_irq(&callback_lock);
	cs->flags = trialcs->flags;
	spin_unlock_irq(&callback_lock);

	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
		rebuild_sched_domains_locked();

	if (spread_flag_changed)
		update_tasks_flags(cs);
out:
	free_trial_cpuset(trialcs);
	return err;
}

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time64_t now;
	u32 ticks;

	now = ktime_get_seconds();
	ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}
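
/*
 * Illustrative example (not from the original source): with FM_COEF = 933
 * and FM_SCALE = 1000, each one-second tick multiplies val by 0.933, and
 * 0.933^10 is roughly 0.5, which is where the 10 second half-life quoted
 * above comes from.  A steady rate of one event per second feeds cnt =
 * FM_SCALE = 1000 per tick, so val settles near 1 * 1000, matching the
 * "stabilize to a value N*1000" behaviour described in the comment.
 */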

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}

static struct cpuset *cpuset_attach_old_cs;

/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct cpuset *cs;
	struct task_struct *task;
	int ret;

	/* used later by cpuset_attach() */
	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
	cs = css_cs(css);

	mutex_lock(&cpuset_mutex);

	/* allow moving tasks into an empty cpuset if on default hierarchy */
	ret = -ENOSPC;
	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
		goto out_unlock;

	cgroup_taskset_for_each(task, css, tset) {
		ret = task_can_attach(task, cs->cpus_allowed);
		if (ret)
			goto out_unlock;
		ret = security_task_setscheduler(task);
		if (ret)
			goto out_unlock;
	}

	/*
	 * Mark attach is in progress.  This makes validate_change() fail
	 * changes which zero cpus/mems_allowed.
	 */
	cs->attach_in_progress++;
	ret = 0;
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return ret;
}

static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct cpuset *cs;

	cgroup_taskset_first(tset, &css);
	cs = css_cs(css);

	mutex_lock(&cpuset_mutex);
	css_cs(css)->attach_in_progress--;
	mutex_unlock(&cpuset_mutex);
}

/*
 * Protected by cpuset_mutex.  cpus_attach is used only by cpuset_attach()
 * but we can't allocate it dynamically there.  Define it global and
 * allocate from cpuset_init().
 */
static cpumask_var_t cpus_attach;

static void cpuset_attach(struct cgroup_taskset *tset)
{
	/* static buf protected by cpuset_mutex */
	static nodemask_t cpuset_attach_nodemask_to;
	struct task_struct *task;
	struct task_struct *leader;
	struct cgroup_subsys_state *css;
	struct cpuset *cs;
	struct cpuset *oldcs = cpuset_attach_old_cs;

	cgroup_taskset_first(tset, &css);
	cs = css_cs(css);

	mutex_lock(&cpuset_mutex);

	/* prepare for attach */
	if (cs == &top_cpuset)
		cpumask_copy(cpus_attach, cpu_possible_mask);
	else
		guarantee_online_cpus(cs, cpus_attach);

	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

	cgroup_taskset_for_each(task, css, tset) {
		/*
		 * can_attach beforehand should guarantee that this doesn't
		 * fail.  TODO: have a better way to handle failure here
		 */
		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
		cpuset_update_task_spread_flag(cs, task);
	}

	/*
	 * Change mm for all threadgroup leaders.  This is expensive and may
	 * sleep and should be moved outside migration path proper.
	 */
	cpuset_attach_nodemask_to = cs->effective_mems;
	cgroup_taskset_for_each_leader(leader, css, tset) {
		struct mm_struct *mm = get_task_mm(leader);

		if (mm) {
			mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

			/*
			 * old_mems_allowed is the same as mems_allowed
			 * here, except if this task is being moved
			 * automatically due to hotplug.  In that case
			 * @mems_allowed has been updated and is empty, so
			 * @old_mems_allowed is the right nodemask that we
			 * migrate mm from.
			 */
			if (is_memory_migrate(cs))
				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
						  &cpuset_attach_nodemask_to);
			else
				mmput(mm);
		}
	}

	cs->old_mems_allowed = cpuset_attach_nodemask_to;

	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);

	mutex_unlock(&cpuset_mutex);
}

/* The various types of files and directories in a cpuset file system */

typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_EFFECTIVE_CPULIST,
	FILE_EFFECTIVE_MEMLIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;

static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			    u64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = 0;

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs)) {
		retval = -ENODEV;
		goto out_unlock;
	}

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_SPREAD_PAGE:
		retval = update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return retval;
}

static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
			    s64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = -ENODEV;

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return retval;
}

/*
 * Common handling for a write to a "cpus" or "mems" file.
 */
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	struct cpuset *cs = css_cs(of_css(of));
	struct cpuset *trialcs;
	int retval = -ENODEV;

	buf = strstrip(buf);

	/*
	 * CPU or memory hotunplug may leave @cs w/o any execution
	 * resources, in which case the hotplug code asynchronously updates
	 * configuration and transfers all tasks to the nearest ancestor
	 * which can execute.
	 *
	 * As writes to "cpus" or "mems" may restore @cs's execution
	 * resources, wait for the previously scheduled operations before
	 * proceeding, so that we don't end up repeatedly removing tasks
	 * that were added after execution capability is restored.
	 *
	 * cpuset_hotplug_work calls back into cgroup core via
	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
	 * operation like this one can lead to a deadlock through kernfs
	 * active_ref protection.  Let's break the protection.  Losing the
	 * protection is okay as we check whether @cs is online after
	 * grabbing cpuset_mutex anyway.  This only happens on the legacy
	 * hierarchies.
	 */
	css_get(&cs->css);
	kernfs_break_active_protection(of->kn);
	flush_work(&cpuset_hotplug_work);

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
		retval = -ENOMEM;
		goto out_unlock;
	}

	switch (of_cft(of)->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_trial_cpuset(trialcs);
out_unlock:
	mutex_unlock(&cpuset_mutex);
	kernfs_unbreak_active_protection(of->kn);
	css_put(&cs->css);
	flush_workqueue(cpuset_migrate_mm_wq);
	return retval ?: nbytes;
}
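
/*
 * Illustrative example (not from the original source): these files take the
 * usual cpulist/nodelist syntax parsed by cpulist_parse()/nodelist_parse(),
 * e.g. from userspace something like
 *
 *	echo "0-3,8" > cpuset.cpus
 *	echo "0-1"   > cpuset.mems
 *
 * where the exact mount point and file name prefix depend on how the
 * cgroup hierarchy was mounted.
 */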

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
 * used, list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
 */
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
	struct cpuset *cs = css_cs(seq_css(sf));
	cpuset_filetype_t type = seq_cft(sf)->private;
	int ret = 0;

	spin_lock_irq(&callback_lock);

	switch (type) {
	case FILE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
		break;
	case FILE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
		break;
	case FILE_EFFECTIVE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
		break;
	case FILE_EFFECTIVE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
		break;
	default:
		ret = -EINVAL;
	}

	spin_unlock_irq(&callback_lock);
	return ret;
}

static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}


/*
 * for the common functions, 'private' gives the type of file
 */

static struct cftype files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "effective_cpus",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "effective_mems",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
"sched_relax_domain_level", 1887 .read_s64 = cpuset_read_s64, 1888 .write_s64 = cpuset_write_s64, 1889 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, 1890 }, 1891 1892 { 1893 .name = "memory_migrate", 1894 .read_u64 = cpuset_read_u64, 1895 .write_u64 = cpuset_write_u64, 1896 .private = FILE_MEMORY_MIGRATE, 1897 }, 1898 1899 { 1900 .name = "memory_pressure", 1901 .read_u64 = cpuset_read_u64, 1902 .private = FILE_MEMORY_PRESSURE, 1903 }, 1904 1905 { 1906 .name = "memory_spread_page", 1907 .read_u64 = cpuset_read_u64, 1908 .write_u64 = cpuset_write_u64, 1909 .private = FILE_SPREAD_PAGE, 1910 }, 1911 1912 { 1913 .name = "memory_spread_slab", 1914 .read_u64 = cpuset_read_u64, 1915 .write_u64 = cpuset_write_u64, 1916 .private = FILE_SPREAD_SLAB, 1917 }, 1918 1919 { 1920 .name = "memory_pressure_enabled", 1921 .flags = CFTYPE_ONLY_ON_ROOT, 1922 .read_u64 = cpuset_read_u64, 1923 .write_u64 = cpuset_write_u64, 1924 .private = FILE_MEMORY_PRESSURE_ENABLED, 1925 }, 1926 1927 { } /* terminate */ 1928 }; 1929 1930 /* 1931 * cpuset_css_alloc - allocate a cpuset css 1932 * cgrp: control group that the new cpuset will be part of 1933 */ 1934 1935 static struct cgroup_subsys_state * 1936 cpuset_css_alloc(struct cgroup_subsys_state *parent_css) 1937 { 1938 struct cpuset *cs; 1939 1940 if (!parent_css) 1941 return &top_cpuset.css; 1942 1943 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1944 if (!cs) 1945 return ERR_PTR(-ENOMEM); 1946 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) 1947 goto free_cs; 1948 if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) 1949 goto free_cpus; 1950 1951 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1952 cpumask_clear(cs->cpus_allowed); 1953 nodes_clear(cs->mems_allowed); 1954 cpumask_clear(cs->effective_cpus); 1955 nodes_clear(cs->effective_mems); 1956 fmeter_init(&cs->fmeter); 1957 cs->relax_domain_level = -1; 1958 1959 return &cs->css; 1960 1961 free_cpus: 1962 free_cpumask_var(cs->cpus_allowed); 1963 free_cs: 1964 kfree(cs); 1965 return ERR_PTR(-ENOMEM); 1966 } 1967 1968 static int cpuset_css_online(struct cgroup_subsys_state *css) 1969 { 1970 struct cpuset *cs = css_cs(css); 1971 struct cpuset *parent = parent_cs(cs); 1972 struct cpuset *tmp_cs; 1973 struct cgroup_subsys_state *pos_css; 1974 1975 if (!parent) 1976 return 0; 1977 1978 mutex_lock(&cpuset_mutex); 1979 1980 set_bit(CS_ONLINE, &cs->flags); 1981 if (is_spread_page(parent)) 1982 set_bit(CS_SPREAD_PAGE, &cs->flags); 1983 if (is_spread_slab(parent)) 1984 set_bit(CS_SPREAD_SLAB, &cs->flags); 1985 1986 cpuset_inc(); 1987 1988 spin_lock_irq(&callback_lock); 1989 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { 1990 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 1991 cs->effective_mems = parent->effective_mems; 1992 } 1993 spin_unlock_irq(&callback_lock); 1994 1995 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 1996 goto out_unlock; 1997 1998 /* 1999 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is 2000 * set. This flag handling is implemented in cgroup core for 2001 * histrical reasons - the flag may be specified during mount. 2002 * 2003 * Currently, if any sibling cpusets have exclusive cpus or mem, we 2004 * refuse to clone the configuration - thereby refusing the task to 2005 * be entered, and as a result refusing the sys_unshare() or 2006 * clone() which initiated it. 
If this becomes a problem for some 2007 * users who wish to allow that scenario, then this could be 2008 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive 2009 * (and likewise for mems) to the new cgroup. 2010 */ 2011 rcu_read_lock(); 2012 cpuset_for_each_child(tmp_cs, pos_css, parent) { 2013 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { 2014 rcu_read_unlock(); 2015 goto out_unlock; 2016 } 2017 } 2018 rcu_read_unlock(); 2019 2020 spin_lock_irq(&callback_lock); 2021 cs->mems_allowed = parent->mems_allowed; 2022 cs->effective_mems = parent->mems_allowed; 2023 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); 2024 cpumask_copy(cs->effective_cpus, parent->cpus_allowed); 2025 spin_unlock_irq(&callback_lock); 2026 out_unlock: 2027 mutex_unlock(&cpuset_mutex); 2028 return 0; 2029 } 2030 2031 /* 2032 * If the cpuset being removed has its flag 'sched_load_balance' 2033 * enabled, then simulate turning sched_load_balance off, which 2034 * will call rebuild_sched_domains_locked(). 2035 */ 2036 2037 static void cpuset_css_offline(struct cgroup_subsys_state *css) 2038 { 2039 struct cpuset *cs = css_cs(css); 2040 2041 mutex_lock(&cpuset_mutex); 2042 2043 if (is_sched_load_balance(cs)) 2044 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 2045 2046 cpuset_dec(); 2047 clear_bit(CS_ONLINE, &cs->flags); 2048 2049 mutex_unlock(&cpuset_mutex); 2050 } 2051 2052 static void cpuset_css_free(struct cgroup_subsys_state *css) 2053 { 2054 struct cpuset *cs = css_cs(css); 2055 2056 free_cpumask_var(cs->effective_cpus); 2057 free_cpumask_var(cs->cpus_allowed); 2058 kfree(cs); 2059 } 2060 2061 static void cpuset_bind(struct cgroup_subsys_state *root_css) 2062 { 2063 mutex_lock(&cpuset_mutex); 2064 spin_lock_irq(&callback_lock); 2065 2066 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { 2067 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); 2068 top_cpuset.mems_allowed = node_possible_map; 2069 } else { 2070 cpumask_copy(top_cpuset.cpus_allowed, 2071 top_cpuset.effective_cpus); 2072 top_cpuset.mems_allowed = top_cpuset.effective_mems; 2073 } 2074 2075 spin_unlock_irq(&callback_lock); 2076 mutex_unlock(&cpuset_mutex); 2077 } 2078 2079 /* 2080 * Make sure the new task conform to the current state of its parent, 2081 * which could have been changed by cpuset just after it inherits the 2082 * state from the parent and before it sits on the cgroup's task list. 
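 *
 * Concretely (an illustrative case, not an exhaustive list of races): if
 * another task shrinks cpuset.cpus after dup_task_struct() copied the
 * parent's cpus_allowed but before the child is visible to
 * update_tasks_cpumask(), the child would otherwise keep the stale, wider
 * mask; re-copying from current in the fork callback below picks up the
 * already-corrected masks.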
2083 */ 2084 static void cpuset_fork(struct task_struct *task) 2085 { 2086 if (task_css_is_root(task, cpuset_cgrp_id)) 2087 return; 2088 2089 set_cpus_allowed_ptr(task, &current->cpus_allowed); 2090 task->mems_allowed = current->mems_allowed; 2091 } 2092 2093 struct cgroup_subsys cpuset_cgrp_subsys = { 2094 .css_alloc = cpuset_css_alloc, 2095 .css_online = cpuset_css_online, 2096 .css_offline = cpuset_css_offline, 2097 .css_free = cpuset_css_free, 2098 .can_attach = cpuset_can_attach, 2099 .cancel_attach = cpuset_cancel_attach, 2100 .attach = cpuset_attach, 2101 .post_attach = cpuset_post_attach, 2102 .bind = cpuset_bind, 2103 .fork = cpuset_fork, 2104 .legacy_cftypes = files, 2105 .early_init = true, 2106 }; 2107 2108 /** 2109 * cpuset_init - initialize cpusets at system boot 2110 * 2111 * Description: Initialize top_cpuset and the cpuset internal file system. 2112 **/ 2113 2114 int __init cpuset_init(void) 2115 { 2116 int err = 0; 2117 2118 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); 2119 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); 2120 2121 cpumask_setall(top_cpuset.cpus_allowed); 2122 nodes_setall(top_cpuset.mems_allowed); 2123 cpumask_setall(top_cpuset.effective_cpus); 2124 nodes_setall(top_cpuset.effective_mems); 2125 2126 fmeter_init(&top_cpuset.fmeter); 2127 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 2128 top_cpuset.relax_domain_level = -1; 2129 2130 err = register_filesystem(&cpuset_fs_type); 2131 if (err < 0) 2132 return err; 2133 2134 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); 2135 2136 return 0; 2137 } 2138 2139 /* 2140 * If CPU and/or memory hotplug handlers, below, unplug any CPUs 2141 * or memory nodes, we need to walk over the cpuset hierarchy, 2142 * removing that CPU or node from all cpusets. If this removes the 2143 * last CPU or node from a cpuset, then move the tasks in the empty 2144 * cpuset to its next-highest non-empty parent. 2145 */ 2146 static void remove_tasks_in_empty_cpuset(struct cpuset *cs) 2147 { 2148 struct cpuset *parent; 2149 2150 /* 2151 * Find its next-highest non-empty parent (the top cpuset 2152 * has online cpus, so can't be empty). 2153 */ 2154 parent = parent_cs(cs); 2155 while (cpumask_empty(parent->cpus_allowed) || 2156 nodes_empty(parent->mems_allowed)) 2157 parent = parent_cs(parent); 2158 2159 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 2160 pr_err("cpuset: failed to transfer tasks out of empty cpuset "); 2161 pr_cont_cgroup_name(cs->css.cgroup); 2162 pr_cont("\n"); 2163 } 2164 } 2165 2166 static void 2167 hotplug_update_tasks_legacy(struct cpuset *cs, 2168 struct cpumask *new_cpus, nodemask_t *new_mems, 2169 bool cpus_updated, bool mems_updated) 2170 { 2171 bool is_empty; 2172 2173 spin_lock_irq(&callback_lock); 2174 cpumask_copy(cs->cpus_allowed, new_cpus); 2175 cpumask_copy(cs->effective_cpus, new_cpus); 2176 cs->mems_allowed = *new_mems; 2177 cs->effective_mems = *new_mems; 2178 spin_unlock_irq(&callback_lock); 2179 2180 /* 2181 * Don't call update_tasks_cpumask() if the cpuset becomes empty, 2182 * as the tasks will be migrated to an ancestor.
2183 */ 2184 if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) 2185 update_tasks_cpumask(cs); 2186 if (mems_updated && !nodes_empty(cs->mems_allowed)) 2187 update_tasks_nodemask(cs); 2188 2189 is_empty = cpumask_empty(cs->cpus_allowed) || 2190 nodes_empty(cs->mems_allowed); 2191 2192 mutex_unlock(&cpuset_mutex); 2193 2194 /* 2195 * Move tasks to the nearest ancestor with execution resources, 2196 * This is full cgroup operation which will also call back into 2197 * cpuset. Should be done outside any lock. 2198 */ 2199 if (is_empty) 2200 remove_tasks_in_empty_cpuset(cs); 2201 2202 mutex_lock(&cpuset_mutex); 2203 } 2204 2205 static void 2206 hotplug_update_tasks(struct cpuset *cs, 2207 struct cpumask *new_cpus, nodemask_t *new_mems, 2208 bool cpus_updated, bool mems_updated) 2209 { 2210 if (cpumask_empty(new_cpus)) 2211 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); 2212 if (nodes_empty(*new_mems)) 2213 *new_mems = parent_cs(cs)->effective_mems; 2214 2215 spin_lock_irq(&callback_lock); 2216 cpumask_copy(cs->effective_cpus, new_cpus); 2217 cs->effective_mems = *new_mems; 2218 spin_unlock_irq(&callback_lock); 2219 2220 if (cpus_updated) 2221 update_tasks_cpumask(cs); 2222 if (mems_updated) 2223 update_tasks_nodemask(cs); 2224 } 2225 2226 /** 2227 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug 2228 * @cs: cpuset in interest 2229 * 2230 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone 2231 * offline, update @cs accordingly. If @cs ends up with no CPU or memory, 2232 * all its tasks are moved to the nearest ancestor with both resources. 2233 */ 2234 static void cpuset_hotplug_update_tasks(struct cpuset *cs) 2235 { 2236 static cpumask_t new_cpus; 2237 static nodemask_t new_mems; 2238 bool cpus_updated; 2239 bool mems_updated; 2240 retry: 2241 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); 2242 2243 mutex_lock(&cpuset_mutex); 2244 2245 /* 2246 * We have raced with task attaching. We wait until attaching 2247 * is finished, so we won't attach a task to an empty cpuset. 2248 */ 2249 if (cs->attach_in_progress) { 2250 mutex_unlock(&cpuset_mutex); 2251 goto retry; 2252 } 2253 2254 cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); 2255 nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); 2256 2257 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); 2258 mems_updated = !nodes_equal(new_mems, cs->effective_mems); 2259 2260 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) 2261 hotplug_update_tasks(cs, &new_cpus, &new_mems, 2262 cpus_updated, mems_updated); 2263 else 2264 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, 2265 cpus_updated, mems_updated); 2266 2267 mutex_unlock(&cpuset_mutex); 2268 } 2269 2270 /** 2271 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset 2272 * 2273 * This function is called after either CPU or memory configuration has 2274 * changed and updates cpuset accordingly. The top_cpuset is always 2275 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in 2276 * order to make cpusets transparent (of no affect) on systems that are 2277 * actively using CPU hotplug but making no active use of cpusets. 2278 * 2279 * Non-root cpusets are only affected by offlining. If any CPUs or memory 2280 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on 2281 * all descendants. 2282 * 2283 * Note that CPU offlining during suspend is ignored. We don't modify 2284 * cpusets across suspend/resume cycles at all. 
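 *
 * Illustrative scenario (the CPU numbers are made up): a child cpuset has
 * cpus_allowed=2-3 and CPUs 2-3 go offline.  On the default hierarchy,
 * hotplug_update_tasks() leaves cpus_allowed untouched and lets
 * effective_cpus fall back to the parent's effective mask; on the legacy
 * hierarchy, hotplug_update_tasks_legacy() shrinks cpus_allowed itself
 * and, if the cpuset ends up empty, moves its tasks to the nearest
 * non-empty ancestor via remove_tasks_in_empty_cpuset().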
2285 */ 2286 static void cpuset_hotplug_workfn(struct work_struct *work) 2287 { 2288 static cpumask_t new_cpus; 2289 static nodemask_t new_mems; 2290 bool cpus_updated, mems_updated; 2291 bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); 2292 2293 mutex_lock(&cpuset_mutex); 2294 2295 /* fetch the available cpus/mems and find out which changed how */ 2296 cpumask_copy(&new_cpus, cpu_active_mask); 2297 new_mems = node_states[N_MEMORY]; 2298 2299 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); 2300 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); 2301 2302 /* synchronize cpus_allowed to cpu_active_mask */ 2303 if (cpus_updated) { 2304 spin_lock_irq(&callback_lock); 2305 if (!on_dfl) 2306 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); 2307 cpumask_copy(top_cpuset.effective_cpus, &new_cpus); 2308 spin_unlock_irq(&callback_lock); 2309 /* we don't mess with cpumasks of tasks in top_cpuset */ 2310 } 2311 2312 /* synchronize mems_allowed to N_MEMORY */ 2313 if (mems_updated) { 2314 spin_lock_irq(&callback_lock); 2315 if (!on_dfl) 2316 top_cpuset.mems_allowed = new_mems; 2317 top_cpuset.effective_mems = new_mems; 2318 spin_unlock_irq(&callback_lock); 2319 update_tasks_nodemask(&top_cpuset); 2320 } 2321 2322 mutex_unlock(&cpuset_mutex); 2323 2324 /* if cpus or mems changed, we need to propagate to descendants */ 2325 if (cpus_updated || mems_updated) { 2326 struct cpuset *cs; 2327 struct cgroup_subsys_state *pos_css; 2328 2329 rcu_read_lock(); 2330 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { 2331 if (cs == &top_cpuset || !css_tryget_online(&cs->css)) 2332 continue; 2333 rcu_read_unlock(); 2334 2335 cpuset_hotplug_update_tasks(cs); 2336 2337 rcu_read_lock(); 2338 css_put(&cs->css); 2339 } 2340 rcu_read_unlock(); 2341 } 2342 2343 /* rebuild sched domains if cpus_allowed has changed */ 2344 if (cpus_updated) 2345 rebuild_sched_domains(); 2346 } 2347 2348 void cpuset_update_active_cpus(void) 2349 { 2350 /* 2351 * We're inside cpu hotplug critical region which usually nests 2352 * inside cgroup synchronization. Bounce actual hotplug processing 2353 * to a work item to avoid reverse locking order. 2354 */ 2355 schedule_work(&cpuset_hotplug_work); 2356 } 2357 2358 /* 2359 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. 2360 * Call this routine anytime after node_states[N_MEMORY] changes. 2361 * See cpuset_update_active_cpus() for CPU hotplug handling. 2362 */ 2363 static int cpuset_track_online_nodes(struct notifier_block *self, 2364 unsigned long action, void *arg) 2365 { 2366 schedule_work(&cpuset_hotplug_work); 2367 return NOTIFY_OK; 2368 } 2369 2370 static struct notifier_block cpuset_track_online_nodes_nb = { 2371 .notifier_call = cpuset_track_online_nodes, 2372 .priority = 10, /* ??! 
*/ 2373 }; 2374 2375 /** 2376 * cpuset_init_smp - initialize cpus_allowed 2377 * 2378 * Description: Finish top cpuset after cpu, node maps are initialized 2379 */ 2380 void __init cpuset_init_smp(void) 2381 { 2382 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2383 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2384 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; 2385 2386 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); 2387 top_cpuset.effective_mems = node_states[N_MEMORY]; 2388 2389 register_hotmemory_notifier(&cpuset_track_online_nodes_nb); 2390 2391 cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); 2392 BUG_ON(!cpuset_migrate_mm_wq); 2393 } 2394 2395 /** 2396 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 2397 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 2398 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. 2399 * 2400 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset 2401 * attached to the specified @tsk. Guaranteed to return some non-empty 2402 * subset of cpu_online_mask, even if this means going outside the 2403 * tasks cpuset. 2404 **/ 2405 2406 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2407 { 2408 unsigned long flags; 2409 2410 spin_lock_irqsave(&callback_lock, flags); 2411 rcu_read_lock(); 2412 guarantee_online_cpus(task_cs(tsk), pmask); 2413 rcu_read_unlock(); 2414 spin_unlock_irqrestore(&callback_lock, flags); 2415 } 2416 2417 void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2418 { 2419 rcu_read_lock(); 2420 do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); 2421 rcu_read_unlock(); 2422 2423 /* 2424 * We own tsk->cpus_allowed, nobody can change it under us. 2425 * 2426 * But we used cs && cs->cpus_allowed lockless and thus can 2427 * race with cgroup_attach_task() or update_cpumask() and get 2428 * the wrong tsk->cpus_allowed. However, both cases imply the 2429 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() 2430 * which takes task_rq_lock(). 2431 * 2432 * If we are called after it dropped the lock we must see all 2433 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary 2434 * set any mask even if it is not right from task_cs() pov, 2435 * the pending set_cpus_allowed_ptr() will fix things. 2436 * 2437 * select_fallback_rq() will fix things ups and set cpu_possible_mask 2438 * if required. 2439 */ 2440 } 2441 2442 void __init cpuset_init_current_mems_allowed(void) 2443 { 2444 nodes_setall(current->mems_allowed); 2445 } 2446 2447 /** 2448 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. 2449 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. 2450 * 2451 * Description: Returns the nodemask_t mems_allowed of the cpuset 2452 * attached to the specified @tsk. Guaranteed to return some non-empty 2453 * subset of node_states[N_MEMORY], even if this means going outside the 2454 * tasks cpuset. 2455 **/ 2456 2457 nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 2458 { 2459 nodemask_t mask; 2460 unsigned long flags; 2461 2462 spin_lock_irqsave(&callback_lock, flags); 2463 rcu_read_lock(); 2464 guarantee_online_mems(task_cs(tsk), &mask); 2465 rcu_read_unlock(); 2466 spin_unlock_irqrestore(&callback_lock, flags); 2467 2468 return mask; 2469 } 2470 2471 /** 2472 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. 
current mems_allowed 2473 * @nodemask: the nodemask to be checked 2474 * 2475 * Are any of the nodes in the nodemask allowed in current->mems_allowed? 2476 */ 2477 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) 2478 { 2479 return nodes_intersects(*nodemask, current->mems_allowed); 2480 } 2481 2482 /* 2483 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or 2484 * mem_hardwall ancestor to the specified cpuset. Call holding 2485 * callback_lock. If no ancestor is mem_exclusive or mem_hardwall 2486 * (an unusual configuration), then returns the root cpuset. 2487 */ 2488 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) 2489 { 2490 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) 2491 cs = parent_cs(cs); 2492 return cs; 2493 } 2494 2495 /** 2496 * cpuset_node_allowed - Can we allocate on a memory node? 2497 * @node: is this an allowed node? 2498 * @gfp_mask: memory allocation flags 2499 * 2500 * If we're in interrupt, yes, we can always allocate. If @node is set in 2501 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this 2502 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, 2503 * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. 2504 * Otherwise, no. 2505 * 2506 * GFP_USER allocations are marked with the __GFP_HARDWALL bit, 2507 * and do not allow allocations outside the current task's cpuset 2508 * unless the task has been OOM killed and is marked TIF_MEMDIE. 2509 * GFP_KERNEL allocations are not so marked, so can escape to the 2510 * nearest enclosing hardwalled ancestor cpuset. 2511 * 2512 * Scanning up parent cpusets requires callback_lock. The 2513 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit 2514 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the 2515 * current task's mems_allowed came up empty on the first pass over 2516 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the 2517 * cpuset are short of memory, might require taking the callback_lock. 2518 * 2519 * The first call here from mm/page_alloc:get_page_from_freelist() 2520 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, 2521 * so no allocation on a node outside the cpuset is allowed (unless 2522 * in interrupt, of course). 2523 * 2524 * The second pass through get_page_from_freelist() doesn't even call 2525 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() 2526 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set 2527 * in alloc_flags. That logic and the checks below have the combined 2528 * effect that: 2529 * in_interrupt - any node ok (current task context irrelevant) 2530 * GFP_ATOMIC - any node ok 2531 * TIF_MEMDIE - any node ok 2532 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok 2533 * GFP_USER - only nodes in the current task's mems_allowed ok. 2534 */ 2535 bool __cpuset_node_allowed(int node, gfp_t gfp_mask) 2536 { 2537 struct cpuset *cs; /* current cpuset ancestors */ 2538 int allowed; /* is allocation in zone z allowed? */ 2539 unsigned long flags; 2540 2541 if (in_interrupt()) 2542 return true; 2543 if (node_isset(node, current->mems_allowed)) 2544 return true; 2545 /* 2546 * Allow tasks that have access to memory reserves because they have 2547 * been OOM killed to get memory anywhere.
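 * (The OOM killer sets TIF_MEMDIE on its chosen victim; that flag is what
 * the check just below tests.)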
2548 */ 2549 if (unlikely(test_thread_flag(TIF_MEMDIE))) 2550 return true; 2551 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ 2552 return false; 2553 2554 if (current->flags & PF_EXITING) /* Let dying task have memory */ 2555 return true; 2556 2557 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2558 spin_lock_irqsave(&callback_lock, flags); 2559 2560 rcu_read_lock(); 2561 cs = nearest_hardwall_ancestor(task_cs(current)); 2562 allowed = node_isset(node, cs->mems_allowed); 2563 rcu_read_unlock(); 2564 2565 spin_unlock_irqrestore(&callback_lock, flags); 2566 return allowed; 2567 } 2568 2569 /** 2570 * cpuset_mem_spread_node() - On which node to begin search for a file page 2571 * cpuset_slab_spread_node() - On which node to begin search for a slab page 2572 * 2573 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for 2574 * tasks in a cpuset with is_spread_page or is_spread_slab set), 2575 * and if the memory allocation used cpuset_mem_spread_node() 2576 * to determine on which node to start looking, as it will for 2577 * certain page cache or slab cache pages such as used for file 2578 * system buffers and inode caches, then instead of starting on the 2579 * local node to look for a free page, rather spread the starting 2580 * node around the tasks mems_allowed nodes. 2581 * 2582 * We don't have to worry about the returned node being offline 2583 * because "it can't happen", and even if it did, it would be ok. 2584 * 2585 * The routines calling guarantee_online_mems() are careful to 2586 * only set nodes in task->mems_allowed that are online. So it 2587 * should not be possible for the following code to return an 2588 * offline node. But if it did, that would be ok, as this routine 2589 * is not returning the node where the allocation must be, only 2590 * the node where the search should start. The zonelist passed to 2591 * __alloc_pages() will include all nodes. If the slab allocator 2592 * is passed an offline node, it will fall back to the local node. 2593 * See kmem_cache_alloc_node(). 2594 */ 2595 2596 static int cpuset_spread_node(int *rotor) 2597 { 2598 return *rotor = next_node_in(*rotor, current->mems_allowed); 2599 } 2600 2601 int cpuset_mem_spread_node(void) 2602 { 2603 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) 2604 current->cpuset_mem_spread_rotor = 2605 node_random(&current->mems_allowed); 2606 2607 return cpuset_spread_node(&current->cpuset_mem_spread_rotor); 2608 } 2609 2610 int cpuset_slab_spread_node(void) 2611 { 2612 if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) 2613 current->cpuset_slab_spread_rotor = 2614 node_random(&current->mems_allowed); 2615 2616 return cpuset_spread_node(&current->cpuset_slab_spread_rotor); 2617 } 2618 2619 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); 2620 2621 /** 2622 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? 2623 * @tsk1: pointer to task_struct of some task. 2624 * @tsk2: pointer to task_struct of some other task. 2625 * 2626 * Description: Return true if @tsk1's mems_allowed intersects the 2627 * mems_allowed of @tsk2. Used by the OOM killer to determine if 2628 * one of the task's memory usage might impact the memory available 2629 * to the other.
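 *
 * Illustrative use only (the surrounding names are hypothetical, not a
 * quote of the OOM killer):
 *
 *	if (!cpuset_mems_allowed_intersects(current, candidate))
 *		continue;
 *
 * i.e. skip a candidate whose allowed nodes cannot overlap with ours,
 * since killing it would not free memory this task can use.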
2630 **/ 2631 2632 int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, 2633 const struct task_struct *tsk2) 2634 { 2635 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); 2636 } 2637 2638 /** 2639 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed 2640 * 2641 * Description: Prints current's name, cpuset name, and cached copy of its 2642 * mems_allowed to the kernel log. 2643 */ 2644 void cpuset_print_current_mems_allowed(void) 2645 { 2646 struct cgroup *cgrp; 2647 2648 rcu_read_lock(); 2649 2650 cgrp = task_cs(current)->css.cgroup; 2651 pr_info("%s cpuset=", current->comm); 2652 pr_cont_cgroup_name(cgrp); 2653 pr_cont(" mems_allowed=%*pbl\n", 2654 nodemask_pr_args(&current->mems_allowed)); 2655 2656 rcu_read_unlock(); 2657 } 2658 2659 /* 2660 * Collection of memory_pressure is suppressed unless 2661 * this flag is enabled by writing "1" to the special 2662 * cpuset file 'memory_pressure_enabled' in the root cpuset. 2663 */ 2664 2665 int cpuset_memory_pressure_enabled __read_mostly; 2666 2667 /** 2668 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. 2669 * 2670 * Keep a running average of the rate of synchronous (direct) 2671 * page reclaim efforts initiated by tasks in each cpuset. 2672 * 2673 * This represents the rate at which some task in the cpuset 2674 * ran low on memory on all nodes it was allowed to use, and 2675 * had to enter the kernel's page reclaim code in an effort to 2676 * create more free memory by tossing clean pages or swapping 2677 * or writing dirty pages. 2678 * 2679 * Display to user space in the per-cpuset read-only file 2680 * "memory_pressure". Value displayed is an integer 2681 * representing the recent rate of entry into the synchronous 2682 * (direct) page reclaim by any task attached to the cpuset. 2683 **/ 2684 2685 void __cpuset_memory_pressure_bump(void) 2686 { 2687 rcu_read_lock(); 2688 fmeter_markevent(&task_cs(current)->fmeter); 2689 rcu_read_unlock(); 2690 } 2691 2692 #ifdef CONFIG_PROC_PID_CPUSET 2693 /* 2694 * proc_cpuset_show() 2695 * - Print task's cpuset path into seq_file. 2696 * - Used for /proc/<pid>/cpuset. 2697 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2698 * doesn't really matter if tsk->cpuset changes after we read it, 2699 * and we take cpuset_mutex, keeping cpuset_attach() from changing it 2700 * anyway. 2701 */ 2702 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, 2703 struct pid *pid, struct task_struct *tsk) 2704 { 2705 char *buf; 2706 struct cgroup_subsys_state *css; 2707 int retval; 2708 2709 retval = -ENOMEM; 2710 buf = kmalloc(PATH_MAX, GFP_KERNEL); 2711 if (!buf) 2712 goto out; 2713 2714 css = task_get_css(tsk, cpuset_cgrp_id); 2715 retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, 2716 current->nsproxy->cgroup_ns); 2717 css_put(css); 2718 if (retval >= PATH_MAX) 2719 retval = -ENAMETOOLONG; 2720 if (retval < 0) 2721 goto out_free; 2722 seq_puts(m, buf); 2723 seq_putc(m, '\n'); 2724 retval = 0; 2725 out_free: 2726 kfree(buf); 2727 out: 2728 return retval; 2729 } 2730 #endif /* CONFIG_PROC_PID_CPUSET */ 2731 2732 /* Display task mems_allowed in /proc/<pid>/status file. */ 2733 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2734 { 2735 seq_printf(m, "Mems_allowed:\t%*pb\n", 2736 nodemask_pr_args(&task->mems_allowed)); 2737 seq_printf(m, "Mems_allowed_list:\t%*pbl\n", 2738 nodemask_pr_args(&task->mems_allowed)); 2739 } 2740
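/*
 * Illustration only, kept inside a comment so it is not compiled: the procfs
 * output produced above can be read from userspace.  The buffer size and the
 * use of /proc/self here are arbitrary assumptions for the sketch.
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[512];
 *		FILE *f = fopen("/proc/self/cpuset", "r");
 *
 *		if (f && fgets(line, sizeof(line), f))
 *			printf("cpuset path: %s", line);
 *		if (f)
 *			fclose(f);
 *		return 0;
 *	}
 *
 * A matching look at "Mems_allowed" and "Mems_allowed_list" comes from
 * reading /proc/self/status, which cpuset_task_status_allowed() fills in.
 */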