1 // SPDX-License-Identifier: GPL-2.0-only 2 #include "cgroup-internal.h" 3 4 #include <linux/cpumask.h> 5 #include <linux/sched/cputime.h> 6 7 #include <linux/bpf.h> 8 #include <linux/btf.h> 9 #include <linux/btf_ids.h> 10 11 #include <trace/events/cgroup.h> 12 13 static DEFINE_SPINLOCK(rstat_base_lock); 14 static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list); 15 16 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); 17 18 /* 19 * Determines whether a given css can participate in rstat. 20 * css's that are cgroup::self use rstat for base stats. 21 * Other css's associated with a subsystem use rstat only when 22 * they define the ss->css_rstat_flush callback. 23 */ 24 static inline bool css_uses_rstat(struct cgroup_subsys_state *css) 25 { 26 return css_is_self(css) || css->ss->css_rstat_flush != NULL; 27 } 28 29 static struct css_rstat_cpu *css_rstat_cpu( 30 struct cgroup_subsys_state *css, int cpu) 31 { 32 return per_cpu_ptr(css->rstat_cpu, cpu); 33 } 34 35 static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu( 36 struct cgroup *cgrp, int cpu) 37 { 38 return per_cpu_ptr(cgrp->rstat_base_cpu, cpu); 39 } 40 41 static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss) 42 { 43 if (ss) 44 return &ss->rstat_ss_lock; 45 46 return &rstat_base_lock; 47 } 48 49 static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) 50 { 51 if (ss) 52 return per_cpu_ptr(ss->lhead, cpu); 53 return per_cpu_ptr(&rstat_backlog_list, cpu); 54 } 55 56 /** 57 * __css_rstat_updated - keep track of updated rstat_cpu 58 * @css: target cgroup subsystem state 59 * @cpu: cpu on which rstat_cpu was updated 60 * 61 * Atomically inserts the css in the ss's llist for the given cpu. This is 62 * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist 63 * will be processed at the flush time to create the update tree. 64 * 65 * NOTE: if the user needs the guarantee that the updater either add itself in 66 * the lockless list or the concurrent flusher flushes its updated stats, a 67 * memory barrier is needed before the call to __css_rstat_updated() i.e. a 68 * barrier after updating the per-cpu stats and before calling 69 * __css_rstat_updated(). 70 */ 71 void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu) 72 { 73 struct llist_head *lhead; 74 struct css_rstat_cpu *rstatc; 75 struct llist_node *self; 76 77 /* Prevent access to uninitialized rstat pointers. */ 78 if (!css_uses_rstat(css)) 79 return; 80 81 lockdep_assert_preemption_disabled(); 82 83 /* 84 * The lockless insertion below relies on NMI-safe cmpxchg; 85 * bail out in NMI on archs that don't provide it. 86 */ 87 if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && in_nmi()) 88 return; 89 90 rstatc = css_rstat_cpu(css, cpu); 91 /* 92 * If already on list return. This check is racy and smp_mb() is needed 93 * to pair it with the smp_mb() in css_process_update_tree() if the 94 * guarantee that the updated stats are visible to concurrent flusher is 95 * needed. 96 */ 97 if (llist_on_list(&rstatc->lnode)) 98 return; 99 100 /* 101 * This function can be renentered by irqs and nmis for the same cgroup 102 * and may try to insert the same per-cpu lnode into the llist. Note 103 * that llist_add() does not protect against such scenarios. In addition 104 * this same per-cpu lnode can be modified through init_llist_node() 105 * from css_rstat_flush() running on a different CPU. 106 * 107 * To protect against such stacked contexts of irqs/nmis, we use the 108 * fact that lnode points to itself when not on a list and then use 109 * try_cmpxchg() to atomically set to NULL to select the winner 110 * which will call llist_add(). The losers can assume the insertion is 111 * successful and the winner will eventually add the per-cpu lnode to 112 * the llist. 113 * 114 * Please note that we can not use this_cpu_cmpxchg() here as on some 115 * archs it is not safe against modifications from multiple CPUs. 116 */ 117 self = &rstatc->lnode; 118 if (!try_cmpxchg(&rstatc->lnode.next, &self, NULL)) 119 return; 120 121 lhead = ss_lhead_cpu(css->ss, cpu); 122 llist_add(&rstatc->lnode, lhead); 123 } 124 125 /* 126 * BPF-facing wrapper for __css_rstat_updated(). Validate the caller-provided 127 * CPU before passing it to the internal rstat updater. 128 */ 129 __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) 130 { 131 if (unlikely(cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu))) 132 return; 133 134 __css_rstat_updated(css, cpu); 135 } 136 137 static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu) 138 { 139 /* put @css and all ancestors on the corresponding updated lists */ 140 while (true) { 141 struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); 142 struct cgroup_subsys_state *parent = css->parent; 143 struct css_rstat_cpu *prstatc; 144 145 /* 146 * Both additions and removals are bottom-up. If a cgroup 147 * is already in the tree, all ancestors are. 148 */ 149 if (rstatc->updated_next) 150 break; 151 152 /* Root has no parent to link it to, but mark it busy */ 153 if (!parent) { 154 rstatc->updated_next = css; 155 break; 156 } 157 158 prstatc = css_rstat_cpu(parent, cpu); 159 rstatc->updated_next = prstatc->updated_children; 160 prstatc->updated_children = css; 161 162 css = parent; 163 } 164 } 165 166 static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) 167 { 168 struct llist_head *lhead = ss_lhead_cpu(ss, cpu); 169 struct llist_node *lnode; 170 171 while ((lnode = llist_del_first_init(lhead))) { 172 struct css_rstat_cpu *rstatc; 173 174 /* 175 * smp_mb() is needed here (more specifically in between 176 * init_llist_node() and per-cpu stats flushing) if the 177 * guarantee is required by a rstat user where etiher the 178 * updater should add itself on the lockless list or the 179 * flusher flush the stats updated by the updater who have 180 * observed that they are already on the list. The 181 * corresponding barrier pair for this one should be before 182 * __css_rstat_updated() by the user. 183 * 184 * For now, there aren't any such user, so not adding the 185 * barrier here but if such a use-case arise, please add 186 * smp_mb() here. 187 */ 188 189 rstatc = container_of(lnode, struct css_rstat_cpu, lnode); 190 __css_process_update_tree(rstatc->owner, cpu); 191 } 192 } 193 194 /** 195 * css_rstat_push_children - push children css's into the given list 196 * @head: current head of the list (= subtree root) 197 * @child: first child of the root 198 * @cpu: target cpu 199 * Return: A new singly linked list of css's to be flushed 200 * 201 * Iteratively traverse down the css_rstat_cpu updated tree level by 202 * level and push all the parents first before their next level children 203 * into a singly linked list via the rstat_flush_next pointer built from the 204 * tail backward like "pushing" css's into a stack. The root is pushed by 205 * the caller. 206 */ 207 static struct cgroup_subsys_state *css_rstat_push_children( 208 struct cgroup_subsys_state *head, 209 struct cgroup_subsys_state *child, int cpu) 210 { 211 struct cgroup_subsys_state *cnext = child; /* Next head of child css level */ 212 struct cgroup_subsys_state *ghead = NULL; /* Head of grandchild css level */ 213 struct cgroup_subsys_state *parent, *grandchild; 214 struct css_rstat_cpu *crstatc; 215 216 child->rstat_flush_next = NULL; 217 218 /* 219 * The subsystem rstat lock must be held for the whole duration from 220 * here as the rstat_flush_next list is being constructed to when 221 * it is consumed later in css_rstat_flush(). 222 */ 223 lockdep_assert_held(ss_rstat_lock(head->ss)); 224 225 /* 226 * Notation: -> updated_next pointer 227 * => rstat_flush_next pointer 228 * 229 * Assuming the following sample updated_children lists: 230 * P: C1 -> C2 -> P 231 * C1: G11 -> G12 -> C1 232 * C2: G21 -> G22 -> C2 233 * 234 * After 1st iteration: 235 * head => C2 => C1 => NULL 236 * ghead => G21 => G11 => NULL 237 * 238 * After 2nd iteration: 239 * head => G12 => G11 => G22 => G21 => C2 => C1 => NULL 240 */ 241 next_level: 242 while (cnext) { 243 child = cnext; 244 cnext = child->rstat_flush_next; 245 parent = child->parent; 246 247 /* updated_next is parent cgroup terminated if !NULL */ 248 while (child != parent) { 249 child->rstat_flush_next = head; 250 head = child; 251 crstatc = css_rstat_cpu(child, cpu); 252 grandchild = crstatc->updated_children; 253 if (grandchild != child) { 254 /* Push the grand child to the next level */ 255 crstatc->updated_children = child; 256 grandchild->rstat_flush_next = ghead; 257 ghead = grandchild; 258 } 259 child = crstatc->updated_next; 260 crstatc->updated_next = NULL; 261 } 262 } 263 264 if (ghead) { 265 cnext = ghead; 266 ghead = NULL; 267 goto next_level; 268 } 269 return head; 270 } 271 272 /** 273 * css_rstat_updated_list - build a list of updated css's to be flushed 274 * @root: root of the css subtree to traverse 275 * @cpu: target cpu 276 * Return: A singly linked list of css's to be flushed 277 * 278 * Walks the updated rstat_cpu tree on @cpu from @root. During traversal, 279 * each returned css is unlinked from the updated tree. 280 * 281 * The only ordering guarantee is that, for a parent and a child pair 282 * covered by a given traversal, the child is before its parent in 283 * the list. 284 * 285 * Note that updated_children is self terminated and points to a list of 286 * child css's if not empty. Whereas updated_next is like a sibling link 287 * within the children list and terminated by the parent css. An exception 288 * here is the css root whose updated_next can be self terminated. 289 */ 290 static struct cgroup_subsys_state *css_rstat_updated_list( 291 struct cgroup_subsys_state *root, int cpu) 292 { 293 struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); 294 struct cgroup_subsys_state *head = NULL, *parent, *child; 295 296 css_process_update_tree(root->ss, cpu); 297 298 /* Return NULL if this subtree is not on-list */ 299 if (!rstatc->updated_next) 300 return NULL; 301 302 /* 303 * Unlink @root from its parent. As the updated_children list is 304 * singly linked, we have to walk it to find the removal point. 305 */ 306 parent = root->parent; 307 if (parent) { 308 struct css_rstat_cpu *prstatc; 309 struct cgroup_subsys_state **nextp; 310 311 prstatc = css_rstat_cpu(parent, cpu); 312 nextp = &prstatc->updated_children; 313 while (*nextp != root) { 314 struct css_rstat_cpu *nrstatc; 315 316 nrstatc = css_rstat_cpu(*nextp, cpu); 317 WARN_ON_ONCE(*nextp == parent); 318 nextp = &nrstatc->updated_next; 319 } 320 *nextp = rstatc->updated_next; 321 } 322 323 rstatc->updated_next = NULL; 324 325 /* Push @root to the list first before pushing the children */ 326 head = root; 327 root->rstat_flush_next = NULL; 328 child = rstatc->updated_children; 329 rstatc->updated_children = root; 330 if (child != root) 331 head = css_rstat_push_children(head, child, cpu); 332 333 return head; 334 } 335 336 /* 337 * A hook for bpf stat collectors to attach to and flush their stats. 338 * Together with providing bpf kfuncs for css_rstat_updated() and 339 * css_rstat_flush(), this enables a complete workflow where bpf progs that 340 * collect cgroup stats can integrate with rstat for efficient flushing. 341 * 342 * A static noinline declaration here could cause the compiler to optimize away 343 * the function. A global noinline declaration will keep the definition, but may 344 * optimize away the callsite. Therefore, __weak is needed to ensure that the 345 * call is still emitted, by telling the compiler that we don't know what the 346 * function might eventually be. 347 */ 348 349 __bpf_hook_start(); 350 351 __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, 352 struct cgroup *parent, int cpu) 353 { 354 } 355 356 __bpf_hook_end(); 357 358 /* 359 * Helper functions for locking. 360 * 361 * This makes it easier to diagnose locking issues and contention in 362 * production environments. The parameter @cpu_in_loop indicate lock 363 * was released and re-taken when collection data from the CPUs. The 364 * value -1 is used when obtaining the main lock else this is the CPU 365 * number processed last. 366 */ 367 static inline void __css_rstat_lock(struct cgroup_subsys_state *css, 368 int cpu_in_loop) 369 __acquires(ss_rstat_lock(css->ss)) 370 { 371 struct cgroup *cgrp = css->cgroup; 372 spinlock_t *lock; 373 bool contended; 374 375 lock = ss_rstat_lock(css->ss); 376 contended = !spin_trylock_irq(lock); 377 if (contended) { 378 trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended); 379 spin_lock_irq(lock); 380 } 381 trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); 382 } 383 384 static inline void __css_rstat_unlock(struct cgroup_subsys_state *css, 385 int cpu_in_loop) 386 __releases(ss_rstat_lock(css->ss)) 387 { 388 struct cgroup *cgrp = css->cgroup; 389 spinlock_t *lock; 390 391 lock = ss_rstat_lock(css->ss); 392 trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); 393 spin_unlock_irq(lock); 394 } 395 396 /** 397 * css_rstat_flush - flush stats in @css's rstat subtree 398 * @css: target cgroup subsystem state 399 * 400 * Collect all per-cpu stats in @css's subtree into the global counters 401 * and propagate them upwards. After this function returns, all rstat 402 * nodes in the subtree have up-to-date ->stat. 403 * 404 * This also gets all rstat nodes in the subtree including @css off the 405 * ->updated_children lists. 406 * 407 * This function may block. 408 */ 409 __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css) 410 { 411 int cpu; 412 bool is_self = css_is_self(css); 413 414 /* 415 * Since bpf programs can call this function, prevent access to 416 * uninitialized rstat pointers. 417 */ 418 if (!css_uses_rstat(css)) 419 return; 420 421 might_sleep(); 422 for_each_possible_cpu(cpu) { 423 struct cgroup_subsys_state *pos; 424 425 /* Reacquire for each CPU to avoid disabling IRQs too long */ 426 __css_rstat_lock(css, cpu); 427 pos = css_rstat_updated_list(css, cpu); 428 for (; pos; pos = pos->rstat_flush_next) { 429 if (is_self) { 430 cgroup_base_stat_flush(pos->cgroup, cpu); 431 bpf_rstat_flush(pos->cgroup, 432 cgroup_parent(pos->cgroup), cpu); 433 } else 434 pos->ss->css_rstat_flush(pos, cpu); 435 } 436 __css_rstat_unlock(css, cpu); 437 if (!cond_resched()) 438 cpu_relax(); 439 } 440 } 441 442 int css_rstat_init(struct cgroup_subsys_state *css) 443 { 444 struct cgroup *cgrp = css->cgroup; 445 int cpu; 446 bool is_self = css_is_self(css); 447 448 if (is_self) { 449 /* the root cgrp has rstat_base_cpu preallocated */ 450 if (!cgrp->rstat_base_cpu) { 451 cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu); 452 if (!cgrp->rstat_base_cpu) 453 return -ENOMEM; 454 } 455 } else if (css->ss->css_rstat_flush == NULL) 456 return 0; 457 458 /* the root cgrp's self css has rstat_cpu preallocated */ 459 if (!css->rstat_cpu) { 460 css->rstat_cpu = alloc_percpu(struct css_rstat_cpu); 461 if (!css->rstat_cpu) { 462 if (is_self) 463 free_percpu(cgrp->rstat_base_cpu); 464 465 return -ENOMEM; 466 } 467 } 468 469 /* ->updated_children list is self terminated */ 470 for_each_possible_cpu(cpu) { 471 struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); 472 473 rstatc->owner = rstatc->updated_children = css; 474 init_llist_node(&rstatc->lnode); 475 476 if (is_self) { 477 struct cgroup_rstat_base_cpu *rstatbc; 478 479 rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); 480 u64_stats_init(&rstatbc->bsync); 481 } 482 } 483 484 return 0; 485 } 486 487 void css_rstat_exit(struct cgroup_subsys_state *css) 488 { 489 int cpu; 490 491 if (!css_uses_rstat(css)) 492 return; 493 494 if (!css->rstat_cpu) 495 return; 496 497 css_rstat_flush(css); 498 499 /* sanity check */ 500 for_each_possible_cpu(cpu) { 501 struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); 502 503 if (WARN_ON_ONCE(rstatc->updated_children != css) || 504 WARN_ON_ONCE(rstatc->updated_next)) 505 return; 506 } 507 508 if (css_is_self(css)) { 509 struct cgroup *cgrp = css->cgroup; 510 511 free_percpu(cgrp->rstat_base_cpu); 512 cgrp->rstat_base_cpu = NULL; 513 } 514 515 free_percpu(css->rstat_cpu); 516 css->rstat_cpu = NULL; 517 } 518 519 /** 520 * ss_rstat_init - subsystem-specific rstat initialization 521 * @ss: target subsystem 522 * 523 * If @ss is NULL, the static locks associated with the base stats 524 * are initialized. If @ss is non-NULL, the subsystem-specific locks 525 * are initialized. 526 */ 527 int __init ss_rstat_init(struct cgroup_subsys *ss) 528 { 529 int cpu; 530 531 if (ss) { 532 ss->lhead = alloc_percpu(struct llist_head); 533 if (!ss->lhead) 534 return -ENOMEM; 535 } 536 537 spin_lock_init(ss_rstat_lock(ss)); 538 for_each_possible_cpu(cpu) 539 init_llist_head(ss_lhead_cpu(ss, cpu)); 540 541 return 0; 542 } 543 544 /* 545 * Functions for cgroup basic resource statistics implemented on top of 546 * rstat. 547 */ 548 static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, 549 struct cgroup_base_stat *src_bstat) 550 { 551 dst_bstat->cputime.utime += src_bstat->cputime.utime; 552 dst_bstat->cputime.stime += src_bstat->cputime.stime; 553 dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; 554 #ifdef CONFIG_SCHED_CORE 555 dst_bstat->forceidle_sum += src_bstat->forceidle_sum; 556 #endif 557 dst_bstat->ntime += src_bstat->ntime; 558 } 559 560 static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, 561 struct cgroup_base_stat *src_bstat) 562 { 563 dst_bstat->cputime.utime -= src_bstat->cputime.utime; 564 dst_bstat->cputime.stime -= src_bstat->cputime.stime; 565 dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; 566 #ifdef CONFIG_SCHED_CORE 567 dst_bstat->forceidle_sum -= src_bstat->forceidle_sum; 568 #endif 569 dst_bstat->ntime -= src_bstat->ntime; 570 } 571 572 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) 573 { 574 struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); 575 struct cgroup *parent = cgroup_parent(cgrp); 576 struct cgroup_rstat_base_cpu *prstatbc; 577 struct cgroup_base_stat delta; 578 unsigned seq; 579 580 /* Root-level stats are sourced from system-wide CPU stats */ 581 if (!parent) 582 return; 583 584 /* fetch the current per-cpu values */ 585 do { 586 seq = __u64_stats_fetch_begin(&rstatbc->bsync); 587 delta = rstatbc->bstat; 588 } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq)); 589 590 /* propagate per-cpu delta to cgroup and per-cpu global statistics */ 591 cgroup_base_stat_sub(&delta, &rstatbc->last_bstat); 592 cgroup_base_stat_add(&cgrp->bstat, &delta); 593 cgroup_base_stat_add(&rstatbc->last_bstat, &delta); 594 cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta); 595 596 /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ 597 if (cgroup_parent(parent)) { 598 delta = cgrp->bstat; 599 cgroup_base_stat_sub(&delta, &cgrp->last_bstat); 600 cgroup_base_stat_add(&parent->bstat, &delta); 601 cgroup_base_stat_add(&cgrp->last_bstat, &delta); 602 603 delta = rstatbc->subtree_bstat; 604 prstatbc = cgroup_rstat_base_cpu(parent, cpu); 605 cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat); 606 cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta); 607 cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta); 608 } 609 } 610 611 static struct cgroup_rstat_base_cpu * 612 cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) 613 { 614 struct cgroup_rstat_base_cpu *rstatbc; 615 616 rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu); 617 *flags = u64_stats_update_begin_irqsave(&rstatbc->bsync); 618 return rstatbc; 619 } 620 621 static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, 622 struct cgroup_rstat_base_cpu *rstatbc, 623 unsigned long flags) 624 { 625 u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); 626 __css_rstat_updated(&cgrp->self, smp_processor_id()); 627 put_cpu_ptr(rstatbc); 628 } 629 630 void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) 631 { 632 struct cgroup_rstat_base_cpu *rstatbc; 633 unsigned long flags; 634 635 rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); 636 rstatbc->bstat.cputime.sum_exec_runtime += delta_exec; 637 cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); 638 } 639 640 void __cgroup_account_cputime_field(struct cgroup *cgrp, 641 enum cpu_usage_stat index, u64 delta_exec) 642 { 643 struct cgroup_rstat_base_cpu *rstatbc; 644 unsigned long flags; 645 646 rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); 647 648 switch (index) { 649 case CPUTIME_NICE: 650 rstatbc->bstat.ntime += delta_exec; 651 fallthrough; 652 case CPUTIME_USER: 653 rstatbc->bstat.cputime.utime += delta_exec; 654 break; 655 case CPUTIME_SYSTEM: 656 case CPUTIME_IRQ: 657 case CPUTIME_SOFTIRQ: 658 rstatbc->bstat.cputime.stime += delta_exec; 659 break; 660 #ifdef CONFIG_SCHED_CORE 661 case CPUTIME_FORCEIDLE: 662 rstatbc->bstat.forceidle_sum += delta_exec; 663 break; 664 #endif 665 default: 666 break; 667 } 668 669 cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); 670 } 671 672 /* 673 * compute the cputime for the root cgroup by getting the per cpu data 674 * at a global level, then categorizing the fields in a manner consistent 675 * with how it is done by __cgroup_account_cputime_field for each bit of 676 * cpu time attributed to a cgroup. 677 */ 678 static void root_cgroup_cputime(struct cgroup_base_stat *bstat) 679 { 680 struct task_cputime *cputime = &bstat->cputime; 681 int i; 682 683 memset(bstat, 0, sizeof(*bstat)); 684 for_each_possible_cpu(i) { 685 struct kernel_cpustat kcpustat; 686 u64 *cpustat = kcpustat.cpustat; 687 u64 user = 0; 688 u64 sys = 0; 689 690 kcpustat_cpu_fetch(&kcpustat, i); 691 692 user += cpustat[CPUTIME_USER]; 693 user += cpustat[CPUTIME_NICE]; 694 cputime->utime += user; 695 696 sys += cpustat[CPUTIME_SYSTEM]; 697 sys += cpustat[CPUTIME_IRQ]; 698 sys += cpustat[CPUTIME_SOFTIRQ]; 699 cputime->stime += sys; 700 701 cputime->sum_exec_runtime += user; 702 cputime->sum_exec_runtime += sys; 703 704 #ifdef CONFIG_SCHED_CORE 705 bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; 706 #endif 707 bstat->ntime += cpustat[CPUTIME_NICE]; 708 } 709 } 710 711 712 static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat) 713 { 714 #ifdef CONFIG_SCHED_CORE 715 u64 forceidle_time = bstat->forceidle_sum; 716 717 do_div(forceidle_time, NSEC_PER_USEC); 718 seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); 719 #endif 720 } 721 722 void cgroup_base_stat_cputime_show(struct seq_file *seq) 723 { 724 struct cgroup *cgrp = seq_css(seq)->cgroup; 725 struct cgroup_base_stat bstat; 726 727 if (cgroup_parent(cgrp)) { 728 css_rstat_flush(&cgrp->self); 729 __css_rstat_lock(&cgrp->self, -1); 730 bstat = cgrp->bstat; 731 cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, 732 &bstat.cputime.utime, &bstat.cputime.stime); 733 __css_rstat_unlock(&cgrp->self, -1); 734 } else { 735 root_cgroup_cputime(&bstat); 736 } 737 738 do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC); 739 do_div(bstat.cputime.utime, NSEC_PER_USEC); 740 do_div(bstat.cputime.stime, NSEC_PER_USEC); 741 do_div(bstat.ntime, NSEC_PER_USEC); 742 743 seq_printf(seq, "usage_usec %llu\n" 744 "user_usec %llu\n" 745 "system_usec %llu\n" 746 "nice_usec %llu\n", 747 bstat.cputime.sum_exec_runtime, 748 bstat.cputime.utime, 749 bstat.cputime.stime, 750 bstat.ntime); 751 752 cgroup_force_idle_show(seq, &bstat); 753 } 754 755 /* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */ 756 BTF_KFUNCS_START(bpf_rstat_kfunc_ids) 757 BTF_ID_FLAGS(func, css_rstat_updated) 758 BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE) 759 BTF_KFUNCS_END(bpf_rstat_kfunc_ids) 760 761 static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { 762 .owner = THIS_MODULE, 763 .set = &bpf_rstat_kfunc_ids, 764 }; 765 766 static int __init bpf_rstat_kfunc_init(void) 767 { 768 return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, 769 &bpf_rstat_kfunc_set); 770 } 771 late_initcall(bpf_rstat_kfunc_init); 772