/* SPDX-License-Identifier: GPL-2.0 */
/*
 * linux/cgroup-defs.h - basic definitions for cgroup
 *
 * This file provides basic type and interface.  Include this file directly
 * only if necessary to avoid cyclic dependencies.
 */
#ifndef _LINUX_CGROUP_DEFS_H
#define _LINUX_CGROUP_DEFS_H

#include <linux/limits.h>
#include <linux/list.h>
#include <linux/idr.h>
#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
#include <linux/sched.h>
#include <linux/u64_stats_sync.h>
#include <linux/workqueue.h>
#include <linux/bpf-cgroup-defs.h>
#include <linux/psi_types.h>

#ifdef CONFIG_CGROUPS

struct cgroup;
struct cgroup_root;
struct cgroup_subsys;
struct cgroup_taskset;
struct kernfs_node;
struct kernfs_ops;
struct kernfs_open_file;
struct seq_file;
struct poll_table_struct;

#define MAX_CGROUP_TYPE_NAMELEN 32
#define MAX_CGROUP_ROOT_NAMELEN 64
#define MAX_CFTYPE_NAME		64

/* define the enumeration of all cgroup subsystems */
#define SUBSYS(_x) _x ## _cgrp_id,
enum cgroup_subsys_id {
#include <linux/cgroup_subsys.h>
	CGROUP_SUBSYS_COUNT,
};
#undef SUBSYS

/* bits in struct cgroup_subsys_state flags field */
enum {
	CSS_NO_REF	= (1 << 0), /* no reference counting for this css */
	CSS_ONLINE	= (1 << 1), /* between ->css_online() and ->css_offline() */
	CSS_RELEASED	= (1 << 2), /* refcnt reached zero, released */
	CSS_VISIBLE	= (1 << 3), /* css is visible to userland */
	CSS_DYING	= (1 << 4), /* css is dying */
};

/* bits in struct cgroup flags field */
enum {
	/* Control Group requires release notifications to userspace */
	CGRP_NOTIFY_ON_RELEASE,
	/*
	 * Clone the parent's configuration when creating a new child
	 * cpuset cgroup.  For historical reasons, this option can be
	 * specified at mount time and thus is implemented here.
	 */
	CGRP_CPUSET_CLONE_CHILDREN,

	/* Control group has to be frozen. */
	CGRP_FREEZE,

	/* Cgroup is frozen. */
	CGRP_FROZEN,
};

/* cgroup_root->flags */
enum {
	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */

	/*
	 * Consider namespaces as delegation boundaries.  If this flag is
	 * set, controller specific interface files in a namespace root
	 * aren't writeable from inside the namespace.
	 */
	CGRP_ROOT_NS_DELEGATE	= (1 << 3),

	/*
	 * Reduce latencies on dynamic cgroup modifications such as task
	 * migrations and controller on/offs by disabling percpu operation on
	 * cgroup_threadgroup_rwsem.  This makes hot path operations such as
	 * forks and exits into the slow path and more expensive.
	 *
	 * Alleviate the contention between fork, exec, exit operations and
	 * writing to cgroup.procs by taking a per threadgroup rwsem instead of
	 * the global cgroup_threadgroup_rwsem.  Fork and other operations
	 * from threads in different thread groups no longer contend with
	 * writing to cgroup.procs.
	 *
	 * The static usage pattern of creating a cgroup, enabling controllers,
	 * and then seeding it with CLONE_INTO_CGROUP doesn't require write
	 * locking cgroup_threadgroup_rwsem and thus doesn't benefit from
	 * favordynmod.
	 */
	CGRP_ROOT_FAVOR_DYNMODS = (1 << 4),

	/*
	 * Enable cpuset controller in v1 cgroup to use v2 behavior.
	 */
	CGRP_ROOT_CPUSET_V2_MODE = (1 << 16),

	/*
	 * Enable legacy local memory.events.
	 */
	CGRP_ROOT_MEMORY_LOCAL_EVENTS = (1 << 17),

	/*
	 * Enable recursive subtree protection
	 */
	CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18),

	/*
	 * Enable hugetlb accounting for the memory controller.
	 */
	CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),

	/*
	 * Enable legacy local pids.events.
	 */
	CGRP_ROOT_PIDS_LOCAL_EVENTS = (1 << 20),
};

/* cftype->flags */
enum {
	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cgrp */
	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cgrp */
	CFTYPE_NS_DELEGATABLE	= (1 << 2),	/* writeable beyond delegation boundaries */

	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
	CFTYPE_WORLD_WRITABLE	= (1 << 4),	/* (DON'T USE FOR NEW FILES) S_IWUGO */
	CFTYPE_DEBUG		= (1 << 5),	/* create when cgroup_debug */

	/* internal flags, do not use outside cgroup core proper */
	__CFTYPE_ONLY_ON_DFL	= (1 << 16),	/* only on default hierarchy */
	__CFTYPE_NOT_ON_DFL	= (1 << 17),	/* not on default hierarchy */
	__CFTYPE_ADDED		= (1 << 18),
};

enum cgroup_attach_lock_mode {
	/* Default */
	CGRP_ATTACH_LOCK_GLOBAL,

	/* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */
	CGRP_ATTACH_LOCK_NONE,

	/* When favordynmods is on, see comments above CGRP_ROOT_FAVOR_DYNMODS */
	CGRP_ATTACH_LOCK_PER_THREADGROUP,
};

/*
 * cgroup_file is the handle for a file instance created in a cgroup which
 * is used, for example, to generate file changed notifications.  This can
 * be obtained by setting cftype->file_offset.
 */
struct cgroup_file {
	/* do not access any fields from outside cgroup core */
	struct kernfs_node *kn;
	unsigned long notified_at;
	struct timer_list notify_timer;
	spinlock_t lock;
};

/*
 * Per-subsystem/per-cgroup state maintained by the system.  This is the
 * fundamental structural building block that controllers deal with.
 *
 * Fields marked with "PI:" are public and immutable and may be accessed
 * directly without synchronization.
 */
struct cgroup_subsys_state {
	/* PI: the cgroup that this css is attached to */
	struct cgroup *cgroup;

	/* PI: the cgroup subsystem that this css is attached to */
	struct cgroup_subsys *ss;

	/* reference count - access via css_[try]get() and css_put() */
	struct percpu_ref refcnt;

	/*
	 * Depending on the context, this field is initialized
	 * via css_rstat_init() at different places:
	 *
	 * when css is associated with cgroup::self
	 *   when css->cgroup is the root cgroup
	 *     performed in cgroup_init()
	 *   when css->cgroup is not the root cgroup
	 *     performed in cgroup_create()
	 * when css is associated with a subsystem
	 *   when css->cgroup is the root cgroup
	 *     performed in cgroup_init_subsys() in the non-early path
	 *   when css->cgroup is not the root cgroup
	 *     performed in css_create()
	 */
	struct css_rstat_cpu __percpu *rstat_cpu;

	/*
	 * siblings list anchored at the parent's ->children
	 *
	 * linkage is protected by cgroup_mutex or RCU
	 */
	struct list_head sibling;
	struct list_head children;

	/*
	 * PI: Subsys-unique ID.  0 is unused and root is always 1.  The
	 * matching css can be looked up using css_from_id().
	 */
	int id;

	unsigned int flags;

	/*
	 * Monotonically increasing unique serial number which defines a
	 * uniform order among all csses.  It's guaranteed that all
	 * ->children lists are in the ascending order of ->serial_nr and
	 * used to allow interrupting and resuming iterations.
	 */
	u64 serial_nr;

	/*
	 * Incremented by online self and children.  Used to guarantee that
	 * parents are not offlined before their children.
	 */
	atomic_t online_cnt;

	/* percpu_ref killing and RCU release */
	struct work_struct destroy_work;
	struct rcu_work destroy_rwork;

	/*
	 * PI: the parent css.  Placed here for cache proximity to following
	 * fields of the containing structure.
	 */
	struct cgroup_subsys_state *parent;

	/*
	 * Keep track of total numbers of visible descendant CSSes.
	 * The total number of dying CSSes is tracked in
	 * css->cgroup->nr_dying_subsys[ssid].
	 * Protected by cgroup_mutex.
	 */
	int nr_descendants;

	/*
	 * A singly-linked list of css structures to be rstat flushed.
	 * This is a scratch field to be used exclusively by
	 * css_rstat_flush().
	 *
	 * Protected by rstat_base_lock when css is cgroup::self.
	 * Protected by css->ss->rstat_ss_lock otherwise.
	 */
	struct cgroup_subsys_state *rstat_flush_next;
};

/*
 * A css_set is a structure holding pointers to a set of
 * cgroup_subsys_state objects.  This saves space in the task struct
 * object and speeds up fork()/exit(), since a single inc/dec and a
 * list_add()/del() can bump the reference count on the entire cgroup
 * set for a task.
 */
struct css_set {
	/*
	 * Set of subsystem states, one for each subsystem.  This array is
	 * immutable after creation apart from the init_css_set during
	 * subsystem registration (at boot time).
	 */
	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];

	/* reference count */
	refcount_t refcount;

	/*
	 * For a domain cgroup, the following points to self.  If threaded,
	 * to the matching cset of the nearest domain ancestor.  The
	 * dom_cset provides access to the domain cgroup and its csses to
	 * which domain level resource consumptions should be charged.
	 */
	struct css_set *dom_cset;

	/* the default cgroup associated with this css_set */
	struct cgroup *dfl_cgrp;

	/* internal task count, protected by css_set_lock */
	int nr_tasks;

	/*
	 * Lists running through all tasks using this cgroup group.
	 * mg_tasks lists tasks which belong to this cset but are in the
	 * process of being migrated out or in.  Protected by
	 * css_set_lock, but, during migration, once tasks are moved to
	 * mg_tasks, it can be read safely while holding cgroup_mutex.
	 */
	struct list_head tasks;
	struct list_head mg_tasks;
	struct list_head dying_tasks;

	/* all css_task_iters currently walking this cset */
	struct list_head task_iters;

	/*
	 * On the default hierarchy, ->subsys[ssid] may point to a css
	 * attached to an ancestor instead of the cgroup this css_set is
	 * associated with.  The following node is anchored at
	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
	 * iterate through all css's attached to a given cgroup.
	 */
	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];

	/* all threaded csets whose ->dom_cset points to this cset */
	struct list_head threaded_csets;
	struct list_head threaded_csets_node;

	/*
	 * List running through all cgroup groups in the same hash
	 * slot.  Protected by css_set_lock
	 */
	struct hlist_node hlist;

	/*
	 * List of cgrp_cset_links pointing at cgroups referenced from this
	 * css_set.  Protected by css_set_lock.
	 */
	struct list_head cgrp_links;

	/*
	 * List of csets participating in the on-going migration either as
	 * source or destination.  Protected by cgroup_mutex.
	 */
	struct list_head mg_src_preload_node;
	struct list_head mg_dst_preload_node;
	struct list_head mg_node;

	/*
	 * If this cset is acting as the source of migration the following
	 * two fields are set.  mg_src_cgrp and mg_dst_cgrp are
	 * respectively the source and destination cgroups of the on-going
	 * migration.  mg_dst_cset is the destination cset the target tasks
	 * on this cset should be migrated to.  Protected by cgroup_mutex.
	 */
	struct cgroup *mg_src_cgrp;
	struct cgroup *mg_dst_cgrp;
	struct css_set *mg_dst_cset;

	/* dead and being drained, ignore for migration */
	bool dead;

	/* For RCU-protected deletion */
	struct rcu_head rcu_head;
};

struct cgroup_base_stat {
	struct task_cputime cputime;

#ifdef CONFIG_SCHED_CORE
	u64 forceidle_sum;
#endif
	u64 ntime;
};

/*
 * rstat - cgroup scalable recursive statistics.  Accounting is done
 * per-cpu in css_rstat_cpu which is then lazily propagated up the
 * hierarchy on reads.
 *
 * When a stat gets updated, the css_rstat_cpu and its ancestors are
 * linked into the updated tree.  On the following read, propagation only
 * considers and consumes the updated tree.  This makes reading O(the
 * number of descendants which have been active since last read) instead of
 * O(the total number of descendants).
 *
 * This is important because there can be a lot of (draining) cgroups which
 * aren't active and stat may be read frequently.  The combination can
 * become very expensive.  By propagating selectively, increasing reading
 * frequency decreases the cost of each read.
 *
 * This struct hosts both the fields which implement the above -
 * updated_children and updated_next.
 */
struct css_rstat_cpu {
	/*
	 * Child cgroups with stat updates on this cpu since the last read
	 * are linked on the parent's ->updated_children through
	 * ->updated_next.  updated_children is terminated by its container css.
	 */
	struct cgroup_subsys_state *updated_children;
	struct cgroup_subsys_state *updated_next;	/* NULL if not on the list */

	struct llist_node lnode;		/* lockless list for update */
	struct cgroup_subsys_state *owner;	/* back pointer */
};

/*
 * This struct hosts the fields which track basic resource statistics on
 * top of it - bsync, bstat and last_bstat.
 */
struct cgroup_rstat_base_cpu {
	/*
	 * ->bsync protects ->bstat.  These are the only fields which get
	 * updated in the hot path.
	 */
	struct u64_stats_sync bsync;
	struct cgroup_base_stat bstat;

	/*
	 * Snapshots at the last reading.  These are used to calculate the
	 * deltas to propagate to the global counters.
	 */
	struct cgroup_base_stat last_bstat;

	/*
	 * This field is used to record the cumulative per-cpu time of
	 * the cgroup and its descendants.  Currently it can be read via
	 * eBPF/drgn etc, and we are still trying to determine how to
	 * expose it in the cgroupfs interface.
	 */
	struct cgroup_base_stat subtree_bstat;

	/*
	 * Snapshots at the last reading.  These are used to calculate the
	 * deltas to propagate to the per-cpu subtree_bstat.
	 */
	struct cgroup_base_stat last_subtree_bstat;
};

struct cgroup_freezer_state {
	/* Should the cgroup and its descendants be frozen. */
	bool freeze;

	/* Should the cgroup actually be frozen? */
	bool e_freeze;

	/* Fields below are protected by css_set_lock */

	/* Number of frozen descendant cgroups */
	int nr_frozen_descendants;

	/*
	 * Number of tasks, which are counted as frozen:
	 * frozen, SIGSTOPped, and PTRACEd.
	 */
	int nr_frozen_tasks;

	/* Freeze time data consistency protection */
	seqcount_spinlock_t freeze_seq;

	/*
	 * Most recent time the cgroup was requested to freeze.
	 * Accesses guarded by freeze_seq counter.  Writes serialized
	 * by css_set_lock.
	 */
	u64 freeze_start_nsec;

	/*
	 * Total duration the cgroup has spent freezing.
	 * Accesses guarded by freeze_seq counter.  Writes serialized
	 * by css_set_lock.
	 */
	u64 frozen_nsec;
};

struct cgroup {
	/* self css with NULL ->ss, points back to this cgroup */
	struct cgroup_subsys_state self;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * The depth this cgroup is at.  The root is at depth zero and each
	 * step down the hierarchy increments the level.  This along with
	 * ancestors[] can determine whether a given cgroup is a
	 * descendant of another without traversing the hierarchy.
	 */
	int level;

	/* Maximum allowed descent tree depth */
	int max_depth;

	/*
	 * Keep track of total numbers of visible and dying descent cgroups.
	 * Dying cgroups are cgroups which were deleted by a user,
	 * but are still existing because someone else is holding a reference.
	 * max_descendants is a maximum allowed number of descent cgroups.
	 *
	 * nr_descendants and nr_dying_descendants are protected
	 * by cgroup_mutex and css_set_lock.  It's fine to read them holding
	 * any of cgroup_mutex and css_set_lock; for writing both locks
	 * should be held.
	 */
	int nr_descendants;
	int nr_dying_descendants;
	int max_descendants;

	/*
	 * Each non-empty css_set associated with this cgroup contributes
	 * one to nr_populated_csets.  The counter is zero iff this cgroup
	 * doesn't have any tasks.
	 *
	 * All children which have non-zero nr_populated_csets and/or
	 * nr_populated_children of their own contribute one to either
	 * nr_populated_domain_children or nr_populated_threaded_children
	 * depending on their type.  Each counter is zero iff all cgroups
	 * of the type in the subtree proper don't have any tasks.
	 */
	int nr_populated_csets;
	int nr_populated_domain_children;
	int nr_populated_threaded_children;

	int nr_threaded_children;	/* # of live threaded child cgroups */

	/* sequence number for cgroup.kill, serialized by css_set_lock */
	unsigned int kill_seq;

	struct kernfs_node *kn;		/* cgroup kernfs entry */
	struct cgroup_file procs_file;	/* handle for "cgroup.procs" */
	struct cgroup_file events_file;	/* handle for "cgroup.events" */

	/* handles for "{cpu,memory,io,irq}.pressure" */
	struct cgroup_file psi_files[NR_PSI_RESOURCES];

	/*
	 * The bitmask of subsystems enabled on the child cgroups.
	 * ->subtree_control is the one configured through
	 * "cgroup.subtree_control" while ->subtree_ss_mask is the effective
	 * one which may have more subsystems enabled.  Controller knobs
	 * are made available iff it's enabled in ->subtree_control.
	 */
	u32 subtree_control;
	u32 subtree_ss_mask;
	u32 old_subtree_control;
	u32 old_subtree_ss_mask;

	/* Private pointers for each registered subsystem */
	struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];

	/*
	 * Keep track of total number of dying CSSes at and below this cgroup.
	 * Protected by cgroup_mutex.
	 */
	int nr_dying_subsys[CGROUP_SUBSYS_COUNT];

	struct cgroup_root *root;

	/*
	 * List of cgrp_cset_links pointing at css_sets with tasks in this
	 * cgroup.  Protected by css_set_lock.
	 */
	struct list_head cset_links;

	/*
	 * On the default hierarchy, a css_set for a cgroup with some
	 * subsys disabled will point to css's which are associated with
	 * the closest ancestor which has the subsys enabled.  The
	 * following lists all css_sets which point to this cgroup's css
	 * for the given subsystem.
	 */
	struct list_head e_csets[CGROUP_SUBSYS_COUNT];

	/*
	 * If !threaded, self.  If threaded, it points to the nearest
	 * domain ancestor.  Inside a threaded subtree, cgroups are exempt
	 * from process granularity and no-internal-task constraint.
	 * Domain level resource consumptions which aren't tied to a
	 * specific task are charged to the dom_cgrp.
	 */
	struct cgroup *dom_cgrp;
	struct cgroup *old_dom_cgrp;	/* used while enabling threaded */

	/*
	 * Depending on the context, this field is initialized via
	 * css_rstat_init() at different places:
	 *
	 * when cgroup is the root cgroup
	 *   performed in cgroup_setup_root()
	 * otherwise
	 *   performed in cgroup_create()
	 */
	struct cgroup_rstat_base_cpu __percpu *rstat_base_cpu;

	/*
	 * Add padding to keep the read mostly rstat per-cpu pointer on a
	 * different cacheline than the following *bstat fields which can have
	 * frequent updates.
	 */
	CACHELINE_PADDING(_pad_);

	/* cgroup basic resource statistics */
	struct cgroup_base_stat last_bstat;
	struct cgroup_base_stat bstat;
	struct prev_cputime prev_cputime;	/* for printing out cputime */

	/*
	 * list of pidlists, up to two for each namespace (one for procs, one
	 * for tasks); created on demand.
	 */
	struct list_head pidlists;
	struct mutex pidlist_mutex;

	/* used to wait for offlining of csses */
	wait_queue_head_t offline_waitq;

	/* used by cgroup_rmdir() to wait for dying tasks to leave */
	wait_queue_head_t dying_populated_waitq;

	/* used to schedule release agent */
	struct work_struct release_agent_work;

	/* used to track pressure stalls */
	struct psi_group *psi;

	/* used to store eBPF programs */
	struct cgroup_bpf bpf;

	/* Used to store internal freezer state */
	struct cgroup_freezer_state freezer;

#ifdef CONFIG_BPF_SYSCALL
	struct bpf_local_storage __rcu *bpf_cgrp_storage;
#endif
#ifdef CONFIG_EXT_SUB_SCHED
	/*
	 * NOTE(review): config symbol looks unusual — sched_ext cgroup
	 * support upstream is gated on CONFIG_EXT_GROUP_SCHED; confirm
	 * this guard matches the Kconfig symbol actually defined.
	 */
	struct scx_sched __rcu *scx_sched;
#endif

	/* All ancestors including self */
	union {
		DECLARE_FLEX_ARRAY(struct cgroup *, ancestors);
		struct {
			struct cgroup *_root_ancestor;
			DECLARE_FLEX_ARRAY(struct cgroup *, _low_ancestors);
		};
	};
};

/*
 * A cgroup_root represents the root of a cgroup hierarchy, and may be
 * associated with a kernfs_root to form an active hierarchy.  This is
 * internal to cgroup core.  Don't access directly from controllers.
 */
struct cgroup_root {
	struct kernfs_root *kf_root;

	/* The bitmask of subsystems attached to this hierarchy */
	unsigned int subsys_mask;

	/* Unique id for this hierarchy. */
	int hierarchy_id;

	/* A list running through the active hierarchies */
	struct list_head root_list;
	struct rcu_head rcu;	/* Must be near the top */

	/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
	atomic_t nr_cgrps;

	/* Hierarchy-specific flags */
	unsigned int flags;

	/* The path to use for release notifications. */
	char release_agent_path[PATH_MAX];

	/* The name for this hierarchy - may be empty */
	char name[MAX_CGROUP_ROOT_NAMELEN];

	/*
	 * The root cgroup.  The containing cgroup_root will be destroyed on its
	 * release.  This must be embedded last due to flexible array at the end
	 * of struct cgroup.
	 */
	struct cgroup cgrp;
};

/*
 * struct cftype: handler definitions for cgroup control files
 *
 * When reading/writing to a file:
 *	- the cgroup to use is file->f_path.dentry->d_parent->d_fsdata
 *	- the 'cftype' of the file is file->f_path.dentry->d_fsdata
 */
struct cftype {
	/*
	 * Name of the subsystem is prepended in cgroup_file_name().
	 * Zero length string indicates end of cftype array.
	 */
	char name[MAX_CFTYPE_NAME];
	unsigned long private;

	/*
	 * The maximum length of string, excluding trailing nul, that can
	 * be passed to write.  If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed.
	 */
	size_t max_write_len;

	/* CFTYPE_* flags */
	unsigned int flags;

	/*
	 * If non-zero, should contain the offset from the start of css to
	 * a struct cgroup_file field.  cgroup will record the handle of
	 * the created file into it.  The recorded handle can be used as
	 * long as the containing css remains accessible.
	 */
	unsigned int file_offset;

	/*
	 * Fields used for internal bookkeeping.  Initialized automatically
	 * during registration.
	 */
	struct cgroup_subsys *ss;	/* NULL for cgroup core files */
	struct list_head node;		/* anchored at ss->cfts */
	struct kernfs_ops *kf_ops;

	int (*open)(struct kernfs_open_file *of);
	void (*release)(struct kernfs_open_file *of);

	/*
	 * read_u64() is a shortcut for the common case of returning a
	 * single integer.  Use it in place of read()
	 */
	u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
	/*
	 * read_s64() is a signed version of read_u64()
	 */
	s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);

	/* generic seq_file read interface */
	int (*seq_show)(struct seq_file *sf, void *v);

	/* optional ops, implement all or none */
	void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
	void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
	void (*seq_stop)(struct seq_file *sf, void *v);

	/*
	 * write_u64() is a shortcut for the common case of accepting
	 * a single integer (as parsed by simple_strtoull) from
	 * userspace.  Use in place of write(); return 0 or error.
	 */
	int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
			 u64 val);
	/*
	 * write_s64() is a signed version of write_u64()
	 */
	int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
			 s64 val);

	/*
	 * write() is the generic write callback which maps directly to
	 * kernfs write operation and overrides all other operations.
	 * Maximum write size is determined by ->max_write_len.  Use
	 * of_css/cft() to access the associated css and cft.
	 */
	ssize_t (*write)(struct kernfs_open_file *of,
			 char *buf, size_t nbytes, loff_t off);

	__poll_t (*poll)(struct kernfs_open_file *of,
			 struct poll_table_struct *pt);

	struct lock_class_key lockdep_key;
};

/*
 * Control Group subsystem type.
 * See Documentation/admin-guide/cgroup-v1/cgroups.rst for details
 */
struct cgroup_subsys {
	struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
	int (*css_online)(struct cgroup_subsys_state *css);
	void (*css_offline)(struct cgroup_subsys_state *css);
	void (*css_released)(struct cgroup_subsys_state *css);
	void (*css_free)(struct cgroup_subsys_state *css);
	void (*css_reset)(struct cgroup_subsys_state *css);
	void (*css_killed)(struct cgroup_subsys_state *css);
	void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu);
	int (*css_extra_stat_show)(struct seq_file *seq,
				   struct cgroup_subsys_state *css);
	int (*css_local_stat_show)(struct seq_file *seq,
				   struct cgroup_subsys_state *css);

	int (*can_attach)(struct cgroup_taskset *tset);
	void (*cancel_attach)(struct cgroup_taskset *tset);
	void (*attach)(struct cgroup_taskset *tset);
	int (*can_fork)(struct task_struct *task,
			struct css_set *cset);
	void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
	void (*fork)(struct task_struct *task);
	void (*exit)(struct task_struct *task);
	void (*release)(struct task_struct *task);
	void (*bind)(struct cgroup_subsys_state *root_css);

	bool early_init:1;

	/*
	 * If %true, the controller, on the default hierarchy, doesn't show
	 * up in "cgroup.controllers" or "cgroup.subtree_control", is
	 * implicitly enabled on all cgroups on the default hierarchy, and
	 * bypasses the "no internal process" constraint.  This is for
	 * utility type controllers which are transparent to userland.
	 *
	 * An implicit controller can be stolen from the default hierarchy
	 * anytime and thus must be okay with offline csses from previous
	 * hierarchies coexisting with csses for the current one.
	 */
	bool implicit_on_dfl:1;

	/*
	 * If %true, the controller supports threaded mode on the default
	 * hierarchy.  In a threaded subtree, both process granularity and
	 * no-internal-process constraint are ignored and threaded
	 * controllers should be able to handle that.
	 *
	 * Note that as an implicit controller is automatically enabled on
	 * all cgroups on the default hierarchy, it should also be
	 * threaded.  implicit && !threaded is not supported.
	 */
	bool threaded:1;

	/* the following two fields are initialized automatically during boot */
	int id;
	const char *name;

	/* optional, initialized automatically during boot if not set */
	const char *legacy_name;

	/* link to parent, protected by cgroup_lock() */
	struct cgroup_root *root;

	/* idr for css->id */
	struct idr css_idr;

	/*
	 * List of cftypes.  Each entry is the first entry of an array
	 * terminated by zero length name.
	 */
	struct list_head cfts;

	/*
	 * Base cftypes which are automatically registered.  The two can
	 * point to the same array.
	 */
	struct cftype *dfl_cftypes;	/* for the default hierarchy */
	struct cftype *legacy_cftypes;	/* for the legacy hierarchies */

	/*
	 * A subsystem may depend on other subsystems.  When such subsystem
	 * is enabled on a cgroup, the depended-upon subsystems are enabled
	 * together if available.  Subsystems enabled due to dependency are
	 * not visible to userland until explicitly enabled.  The following
	 * specifies the mask of subsystems that this one depends on.
	 */
	unsigned int depends_on;

	spinlock_t rstat_ss_lock;
	struct llist_head __percpu *lhead;	/* lockless update list head */
};

extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
extern bool cgroup_enable_per_threadgroup_rwsem;

struct cgroup_of_peak {
	unsigned long value;
	struct list_head list;
};

/**
 * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
 * @tsk: target task
 *
 * Allows cgroup operations to synchronize against threadgroup changes
 * using a global percpu_rw_semaphore and a per threadgroup rw_semaphore when
 * favordynmods is on.  See the comment above CGRP_ROOT_FAVOR_DYNMODS
 * definition.
 */
static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
{
	percpu_down_read(&cgroup_threadgroup_rwsem);
	if (cgroup_enable_per_threadgroup_rwsem)
		down_read(&tsk->signal->cgroup_threadgroup_rwsem);
}

/**
 * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
 * @tsk: target task
 *
 * Counterpart of cgroup_threadgroup_change_begin().
 */
static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
{
	if (cgroup_enable_per_threadgroup_rwsem)
		up_read(&tsk->signal->cgroup_threadgroup_rwsem);
	percpu_up_read(&cgroup_threadgroup_rwsem);
}

#else	/* CONFIG_CGROUPS */

#define CGROUP_SUBSYS_COUNT 0

static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
{
	might_sleep();
}

static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}

#endif	/* CONFIG_CGROUPS */

#ifdef CONFIG_SOCK_CGROUP_DATA

/*
 * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
 * per-socket cgroup information except for memcg association.
 *
 * On legacy hierarchies, net_prio and net_cls controllers directly
 * set attributes on each sock which can then be tested by the network
 * layer.  On the default hierarchy, each sock is associated with the
 * cgroup it was created in and the networking layer can match the
 * cgroup directly.
 */
struct sock_cgroup_data {
	struct cgroup	*cgroup; /* v2 */
#ifdef CONFIG_CGROUP_NET_CLASSID
	u32		classid; /* v1 */
#endif
#ifdef CONFIG_CGROUP_NET_PRIO
	u16		prioidx; /* v1 */
#endif
};

static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
{
#ifdef CONFIG_CGROUP_NET_PRIO
	return READ_ONCE(skcd->prioidx);
#else
	return 1;
#endif
}

#ifdef CONFIG_CGROUP_NET_CLASSID
static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
{
	return READ_ONCE(skcd->classid);
}
#endif

static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
					   u16 prioidx)
{
#ifdef CONFIG_CGROUP_NET_PRIO
	WRITE_ONCE(skcd->prioidx, prioidx);
#endif
}

#ifdef CONFIG_CGROUP_NET_CLASSID
static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
					   u32 classid)
{
	WRITE_ONCE(skcd->classid, classid);
}
#endif

#else	/* CONFIG_SOCK_CGROUP_DATA */

struct sock_cgroup_data {
};

#endif	/* CONFIG_SOCK_CGROUP_DATA */

#endif	/* _LINUX_CGROUP_DEFS_H */