1 // SPDX-License-Identifier: GPL-2.0-or-later 2 3 #include "cpuset-internal.h" 4 5 /* 6 * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously 7 */ 8 struct cpuset_remove_tasks_struct { 9 struct work_struct work; 10 struct cpuset *cs; 11 }; 12 13 /* 14 * Frequency meter - How fast is some event occurring? 15 * 16 * These routines manage a digitally filtered, constant time based, 17 * event frequency meter. There are four routines: 18 * fmeter_init() - initialize a frequency meter. 19 * fmeter_markevent() - called each time the event happens. 20 * fmeter_getrate() - returns the recent rate of such events. 21 * fmeter_update() - internal routine used to update fmeter. 22 * 23 * A common data structure is passed to each of these routines, 24 * which is used to keep track of the state required to manage the 25 * frequency meter and its digital filter. 26 * 27 * The filter works on the number of events marked per unit time. 28 * The filter is single-pole low-pass recursive (IIR). The time unit 29 * is 1 second. Arithmetic is done using 32-bit integers scaled to 30 * simulate 3 decimal digits of precision (multiplied by 1000). 31 * 32 * With an FM_COEF of 933, and a time base of 1 second, the filter 33 * has a half-life of 10 seconds, meaning that if the events quit 34 * happening, then the rate returned from the fmeter_getrate() 35 * will be cut in half each 10 seconds, until it converges to zero. 36 * 37 * It is not worth doing a real infinitely recursive filter. If more 38 * than FM_MAXTICKS ticks have elapsed since the last filter event, 39 * just compute FM_MAXTICKS ticks worth, by which point the level 40 * will be stable. 41 * 42 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid 43 * arithmetic overflow in the fmeter_update() routine. 44 * 45 * Given the simple 32 bit integer arithmetic used, this meter works 46 * best for reporting rates between one per millisecond (msec) and 47 * one per 32 (approx) seconds. At constant rates faster than one 48 * per msec it maxes out at values just under 1,000,000. At constant 49 * rates between one per msec, and one per second it will stabilize 50 * to a value N*1000, where N is the rate of events per second. 51 * At constant rates between one per second and one per 32 seconds, 52 * it will be choppy, moving up on the seconds that have an event, 53 * and then decaying until the next event. At rates slower than 54 * about one in 32 seconds, it decays all the way back to zero between 55 * each event. 56 */ 57 58 #define FM_COEF 933 /* coefficient for half-life of 10 secs */ 59 #define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ 60 #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ 61 #define FM_SCALE 1000 /* faux fixed point scale */ 62 63 /* Initialize a frequency meter */ 64 void fmeter_init(struct fmeter *fmp) 65 { 66 fmp->cnt = 0; 67 fmp->val = 0; 68 fmp->time = 0; 69 spin_lock_init(&fmp->lock); 70 } 71 72 /* Internal meter update - process cnt events and update value */ 73 static void fmeter_update(struct fmeter *fmp) 74 { 75 time64_t now; 76 u32 ticks; 77 78 now = ktime_get_seconds(); 79 ticks = now - fmp->time; 80 81 if (ticks == 0) 82 return; 83 84 ticks = min(FM_MAXTICKS, ticks); 85 while (ticks-- > 0) 86 fmp->val = (FM_COEF * fmp->val) / FM_SCALE; 87 fmp->time = now; 88 89 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; 90 fmp->cnt = 0; 91 } 92 93 /* Process any previous ticks, then bump cnt by one (times scale). */ 94 static void fmeter_markevent(struct fmeter *fmp) 95 { 96 spin_lock(&fmp->lock); 97 fmeter_update(fmp); 98 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); 99 spin_unlock(&fmp->lock); 100 } 101 102 /* Process any previous ticks, then return current value. */ 103 static int fmeter_getrate(struct fmeter *fmp) 104 { 105 int val; 106 107 spin_lock(&fmp->lock); 108 fmeter_update(fmp); 109 val = fmp->val; 110 spin_unlock(&fmp->lock); 111 return val; 112 } 113 114 /* 115 * Collection of memory_pressure is suppressed unless 116 * this flag is enabled by writing "1" to the special 117 * cpuset file 'memory_pressure_enabled' in the root cpuset. 118 */ 119 120 int cpuset_memory_pressure_enabled __read_mostly; 121 122 /* 123 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. 124 * 125 * Keep a running average of the rate of synchronous (direct) 126 * page reclaim efforts initiated by tasks in each cpuset. 127 * 128 * This represents the rate at which some task in the cpuset 129 * ran low on memory on all nodes it was allowed to use, and 130 * had to enter the kernels page reclaim code in an effort to 131 * create more free memory by tossing clean pages or swapping 132 * or writing dirty pages. 133 * 134 * Display to user space in the per-cpuset read-only file 135 * "memory_pressure". Value displayed is an integer 136 * representing the recent rate of entry into the synchronous 137 * (direct) page reclaim by any task attached to the cpuset. 138 */ 139 140 void __cpuset_memory_pressure_bump(void) 141 { 142 rcu_read_lock(); 143 fmeter_markevent(&task_cs(current)->fmeter); 144 rcu_read_unlock(); 145 } 146 147 static int update_relax_domain_level(struct cpuset *cs, s64 val) 148 { 149 #ifdef CONFIG_SMP 150 if (val < -1 || val > sched_domain_level_max + 1) 151 return -EINVAL; 152 #endif 153 154 if (val != cs->relax_domain_level) { 155 cs->relax_domain_level = val; 156 if (!cpumask_empty(cs->cpus_allowed) && 157 is_sched_load_balance(cs)) 158 rebuild_sched_domains_locked(); 159 } 160 161 return 0; 162 } 163 164 static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, 165 s64 val) 166 { 167 struct cpuset *cs = css_cs(css); 168 cpuset_filetype_t type = cft->private; 169 int retval = -ENODEV; 170 171 cpus_read_lock(); 172 cpuset_lock(); 173 if (!is_cpuset_online(cs)) 174 goto out_unlock; 175 176 switch (type) { 177 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 178 retval = update_relax_domain_level(cs, val); 179 break; 180 default: 181 retval = -EINVAL; 182 break; 183 } 184 out_unlock: 185 cpuset_unlock(); 186 cpus_read_unlock(); 187 return retval; 188 } 189 190 static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) 191 { 192 struct cpuset *cs = css_cs(css); 193 cpuset_filetype_t type = cft->private; 194 195 switch (type) { 196 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 197 return cs->relax_domain_level; 198 default: 199 BUG(); 200 } 201 202 /* Unreachable but makes gcc happy */ 203 return 0; 204 } 205 206 /* 207 * update task's spread flag if cpuset's page/slab spread flag is set 208 * 209 * Call with callback_lock or cpuset_mutex held. The check can be skipped 210 * if on default hierarchy. 211 */ 212 void cpuset1_update_task_spread_flags(struct cpuset *cs, 213 struct task_struct *tsk) 214 { 215 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) 216 return; 217 218 if (is_spread_page(cs)) 219 task_set_spread_page(tsk); 220 else 221 task_clear_spread_page(tsk); 222 223 if (is_spread_slab(cs)) 224 task_set_spread_slab(tsk); 225 else 226 task_clear_spread_slab(tsk); 227 } 228 229 /** 230 * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset. 231 * @cs: the cpuset in which each task's spread flags needs to be changed 232 * 233 * Iterate through each task of @cs updating its spread flags. As this 234 * function is called with cpuset_mutex held, cpuset membership stays 235 * stable. 236 */ 237 void cpuset1_update_tasks_flags(struct cpuset *cs) 238 { 239 struct css_task_iter it; 240 struct task_struct *task; 241 242 css_task_iter_start(&cs->css, 0, &it); 243 while ((task = css_task_iter_next(&it))) 244 cpuset1_update_task_spread_flags(cs, task); 245 css_task_iter_end(&it); 246 } 247 248 /* 249 * If CPU and/or memory hotplug handlers, below, unplug any CPUs 250 * or memory nodes, we need to walk over the cpuset hierarchy, 251 * removing that CPU or node from all cpusets. If this removes the 252 * last CPU or node from a cpuset, then move the tasks in the empty 253 * cpuset to its next-highest non-empty parent. 254 */ 255 static void remove_tasks_in_empty_cpuset(struct cpuset *cs) 256 { 257 struct cpuset *parent; 258 259 /* 260 * Find its next-highest non-empty parent, (top cpuset 261 * has online cpus, so can't be empty). 262 */ 263 parent = parent_cs(cs); 264 while (cpumask_empty(parent->cpus_allowed) || 265 nodes_empty(parent->mems_allowed)) 266 parent = parent_cs(parent); 267 268 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 269 pr_err("cpuset: failed to transfer tasks out of empty cpuset "); 270 pr_cont_cgroup_name(cs->css.cgroup); 271 pr_cont("\n"); 272 } 273 } 274 275 static void cpuset_migrate_tasks_workfn(struct work_struct *work) 276 { 277 struct cpuset_remove_tasks_struct *s; 278 279 s = container_of(work, struct cpuset_remove_tasks_struct, work); 280 remove_tasks_in_empty_cpuset(s->cs); 281 css_put(&s->cs->css); 282 kfree(s); 283 } 284 285 void cpuset1_hotplug_update_tasks(struct cpuset *cs, 286 struct cpumask *new_cpus, nodemask_t *new_mems, 287 bool cpus_updated, bool mems_updated) 288 { 289 bool is_empty; 290 291 cpuset_callback_lock_irq(); 292 cpumask_copy(cs->cpus_allowed, new_cpus); 293 cpumask_copy(cs->effective_cpus, new_cpus); 294 cs->mems_allowed = *new_mems; 295 cs->effective_mems = *new_mems; 296 cpuset_callback_unlock_irq(); 297 298 /* 299 * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty, 300 * as the tasks will be migrated to an ancestor. 301 */ 302 if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) 303 cpuset_update_tasks_cpumask(cs, new_cpus); 304 if (mems_updated && !nodes_empty(cs->mems_allowed)) 305 cpuset_update_tasks_nodemask(cs); 306 307 is_empty = cpumask_empty(cs->cpus_allowed) || 308 nodes_empty(cs->mems_allowed); 309 310 /* 311 * Move tasks to the nearest ancestor with execution resources, 312 * This is full cgroup operation which will also call back into 313 * cpuset. Execute it asynchronously using workqueue. 314 */ 315 if (is_empty && cs->css.cgroup->nr_populated_csets && 316 css_tryget_online(&cs->css)) { 317 struct cpuset_remove_tasks_struct *s; 318 319 s = kzalloc(sizeof(*s), GFP_KERNEL); 320 if (WARN_ON_ONCE(!s)) { 321 css_put(&cs->css); 322 return; 323 } 324 325 s->cs = cs; 326 INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); 327 schedule_work(&s->work); 328 } 329 } 330 331 /* 332 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? 333 * 334 * One cpuset is a subset of another if all its allowed CPUs and 335 * Memory Nodes are a subset of the other, and its exclusive flags 336 * are only set if the other's are set. Call holding cpuset_mutex. 337 */ 338 339 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 340 { 341 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && 342 nodes_subset(p->mems_allowed, q->mems_allowed) && 343 is_cpu_exclusive(p) <= is_cpu_exclusive(q) && 344 is_mem_exclusive(p) <= is_mem_exclusive(q); 345 } 346 347 /* 348 * cpuset1_validate_change() - Validate conditions specific to legacy (v1) 349 * behavior. 350 */ 351 int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) 352 { 353 struct cgroup_subsys_state *css; 354 struct cpuset *c, *par; 355 int ret; 356 357 WARN_ON_ONCE(!rcu_read_lock_held()); 358 359 /* Each of our child cpusets must be a subset of us */ 360 ret = -EBUSY; 361 cpuset_for_each_child(c, css, cur) 362 if (!is_cpuset_subset(c, trial)) 363 goto out; 364 365 /* On legacy hierarchy, we must be a subset of our parent cpuset. */ 366 ret = -EACCES; 367 par = parent_cs(cur); 368 if (par && !is_cpuset_subset(trial, par)) 369 goto out; 370 371 ret = 0; 372 out: 373 return ret; 374 } 375 376 static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) 377 { 378 struct cpuset *cs = css_cs(css); 379 cpuset_filetype_t type = cft->private; 380 381 switch (type) { 382 case FILE_CPU_EXCLUSIVE: 383 return is_cpu_exclusive(cs); 384 case FILE_MEM_EXCLUSIVE: 385 return is_mem_exclusive(cs); 386 case FILE_MEM_HARDWALL: 387 return is_mem_hardwall(cs); 388 case FILE_SCHED_LOAD_BALANCE: 389 return is_sched_load_balance(cs); 390 case FILE_MEMORY_MIGRATE: 391 return is_memory_migrate(cs); 392 case FILE_MEMORY_PRESSURE_ENABLED: 393 return cpuset_memory_pressure_enabled; 394 case FILE_MEMORY_PRESSURE: 395 return fmeter_getrate(&cs->fmeter); 396 case FILE_SPREAD_PAGE: 397 return is_spread_page(cs); 398 case FILE_SPREAD_SLAB: 399 return is_spread_slab(cs); 400 default: 401 BUG(); 402 } 403 404 /* Unreachable but makes gcc happy */ 405 return 0; 406 } 407 408 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, 409 u64 val) 410 { 411 struct cpuset *cs = css_cs(css); 412 cpuset_filetype_t type = cft->private; 413 int retval = 0; 414 415 cpus_read_lock(); 416 cpuset_lock(); 417 if (!is_cpuset_online(cs)) { 418 retval = -ENODEV; 419 goto out_unlock; 420 } 421 422 switch (type) { 423 case FILE_CPU_EXCLUSIVE: 424 retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val); 425 break; 426 case FILE_MEM_EXCLUSIVE: 427 retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val); 428 break; 429 case FILE_MEM_HARDWALL: 430 retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); 431 break; 432 case FILE_SCHED_LOAD_BALANCE: 433 retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); 434 break; 435 case FILE_MEMORY_MIGRATE: 436 retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); 437 break; 438 case FILE_MEMORY_PRESSURE_ENABLED: 439 cpuset_memory_pressure_enabled = !!val; 440 break; 441 case FILE_SPREAD_PAGE: 442 retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val); 443 break; 444 case FILE_SPREAD_SLAB: 445 retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val); 446 break; 447 default: 448 retval = -EINVAL; 449 break; 450 } 451 out_unlock: 452 cpuset_unlock(); 453 cpus_read_unlock(); 454 return retval; 455 } 456 457 /* 458 * for the common functions, 'private' gives the type of file 459 */ 460 461 struct cftype cpuset1_files[] = { 462 { 463 .name = "cpus", 464 .seq_show = cpuset_common_seq_show, 465 .write = cpuset_write_resmask, 466 .max_write_len = (100U + 6 * NR_CPUS), 467 .private = FILE_CPULIST, 468 }, 469 470 { 471 .name = "mems", 472 .seq_show = cpuset_common_seq_show, 473 .write = cpuset_write_resmask, 474 .max_write_len = (100U + 6 * MAX_NUMNODES), 475 .private = FILE_MEMLIST, 476 }, 477 478 { 479 .name = "effective_cpus", 480 .seq_show = cpuset_common_seq_show, 481 .private = FILE_EFFECTIVE_CPULIST, 482 }, 483 484 { 485 .name = "effective_mems", 486 .seq_show = cpuset_common_seq_show, 487 .private = FILE_EFFECTIVE_MEMLIST, 488 }, 489 490 { 491 .name = "cpu_exclusive", 492 .read_u64 = cpuset_read_u64, 493 .write_u64 = cpuset_write_u64, 494 .private = FILE_CPU_EXCLUSIVE, 495 }, 496 497 { 498 .name = "mem_exclusive", 499 .read_u64 = cpuset_read_u64, 500 .write_u64 = cpuset_write_u64, 501 .private = FILE_MEM_EXCLUSIVE, 502 }, 503 504 { 505 .name = "mem_hardwall", 506 .read_u64 = cpuset_read_u64, 507 .write_u64 = cpuset_write_u64, 508 .private = FILE_MEM_HARDWALL, 509 }, 510 511 { 512 .name = "sched_load_balance", 513 .read_u64 = cpuset_read_u64, 514 .write_u64 = cpuset_write_u64, 515 .private = FILE_SCHED_LOAD_BALANCE, 516 }, 517 518 { 519 .name = "sched_relax_domain_level", 520 .read_s64 = cpuset_read_s64, 521 .write_s64 = cpuset_write_s64, 522 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, 523 }, 524 525 { 526 .name = "memory_migrate", 527 .read_u64 = cpuset_read_u64, 528 .write_u64 = cpuset_write_u64, 529 .private = FILE_MEMORY_MIGRATE, 530 }, 531 532 { 533 .name = "memory_pressure", 534 .read_u64 = cpuset_read_u64, 535 .private = FILE_MEMORY_PRESSURE, 536 }, 537 538 { 539 .name = "memory_spread_page", 540 .read_u64 = cpuset_read_u64, 541 .write_u64 = cpuset_write_u64, 542 .private = FILE_SPREAD_PAGE, 543 }, 544 545 { 546 /* obsolete, may be removed in the future */ 547 .name = "memory_spread_slab", 548 .read_u64 = cpuset_read_u64, 549 .write_u64 = cpuset_write_u64, 550 .private = FILE_SPREAD_SLAB, 551 }, 552 553 { 554 .name = "memory_pressure_enabled", 555 .flags = CFTYPE_ONLY_ON_ROOT, 556 .read_u64 = cpuset_read_u64, 557 .write_u64 = cpuset_write_u64, 558 .private = FILE_MEMORY_PRESSURE_ENABLED, 559 }, 560 561 { } /* terminate */ 562 }; 563