1 // SPDX-License-Identifier: GPL-2.0-only 2 #include "cgroup-internal.h" 3 4 #include <linux/sched/cputime.h> 5 6 static DEFINE_SPINLOCK(cgroup_rstat_lock); 7 static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); 8 9 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); 10 11 static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) 12 { 13 return per_cpu_ptr(cgrp->rstat_cpu, cpu); 14 } 15 16 /** 17 * cgroup_rstat_updated - keep track of updated rstat_cpu 18 * @cgrp: target cgroup 19 * @cpu: cpu on which rstat_cpu was updated 20 * 21 * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching 22 * rstat_cpu->updated_children list. See the comment on top of 23 * cgroup_rstat_cpu definition for details. 24 */ 25 void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) 26 { 27 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); 28 unsigned long flags; 29 30 /* 31 * Speculative already-on-list test. This may race leading to 32 * temporary inaccuracies, which is fine. 33 * 34 * Because @parent's updated_children is terminated with @parent 35 * instead of NULL, we can tell whether @cgrp is on the list by 36 * testing the next pointer for NULL. 37 */ 38 if (cgroup_rstat_cpu(cgrp, cpu)->updated_next) 39 return; 40 41 raw_spin_lock_irqsave(cpu_lock, flags); 42 43 /* put @cgrp and all ancestors on the corresponding updated lists */ 44 while (true) { 45 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); 46 struct cgroup *parent = cgroup_parent(cgrp); 47 struct cgroup_rstat_cpu *prstatc; 48 49 /* 50 * Both additions and removals are bottom-up. If a cgroup 51 * is already in the tree, all ancestors are. 52 */ 53 if (rstatc->updated_next) 54 break; 55 56 /* Root has no parent to link it to, but mark it busy */ 57 if (!parent) { 58 rstatc->updated_next = cgrp; 59 break; 60 } 61 62 prstatc = cgroup_rstat_cpu(parent, cpu); 63 rstatc->updated_next = prstatc->updated_children; 64 prstatc->updated_children = cgrp; 65 66 cgrp = parent; 67 } 68 69 raw_spin_unlock_irqrestore(cpu_lock, flags); 70 } 71 72 /** 73 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree 74 * @pos: current position 75 * @root: root of the tree to traversal 76 * @cpu: target cpu 77 * 78 * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts 79 * the traversal and %NULL return indicates the end. During traversal, 80 * each returned cgroup is unlinked from the tree. Must be called with the 81 * matching cgroup_rstat_cpu_lock held. 82 * 83 * The only ordering guarantee is that, for a parent and a child pair 84 * covered by a given traversal, if a child is visited, its parent is 85 * guaranteed to be visited afterwards. 86 */ 87 static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, 88 struct cgroup *root, int cpu) 89 { 90 struct cgroup_rstat_cpu *rstatc; 91 92 if (pos == root) 93 return NULL; 94 95 /* 96 * We're gonna walk down to the first leaf and visit/remove it. We 97 * can pick whatever unvisited node as the starting point. 98 */ 99 if (!pos) 100 pos = root; 101 else 102 pos = cgroup_parent(pos); 103 104 /* walk down to the first leaf */ 105 while (true) { 106 rstatc = cgroup_rstat_cpu(pos, cpu); 107 if (rstatc->updated_children == pos) 108 break; 109 pos = rstatc->updated_children; 110 } 111 112 /* 113 * Unlink @pos from the tree. As the updated_children list is 114 * singly linked, we have to walk it to find the removal point. 115 * However, due to the way we traverse, @pos will be the first 116 * child in most cases. The only exception is @root. 117 */ 118 if (rstatc->updated_next) { 119 struct cgroup *parent = cgroup_parent(pos); 120 121 if (parent) { 122 struct cgroup_rstat_cpu *prstatc; 123 struct cgroup **nextp; 124 125 prstatc = cgroup_rstat_cpu(parent, cpu); 126 nextp = &prstatc->updated_children; 127 while (true) { 128 struct cgroup_rstat_cpu *nrstatc; 129 130 nrstatc = cgroup_rstat_cpu(*nextp, cpu); 131 if (*nextp == pos) 132 break; 133 WARN_ON_ONCE(*nextp == parent); 134 nextp = &nrstatc->updated_next; 135 } 136 *nextp = rstatc->updated_next; 137 } 138 139 rstatc->updated_next = NULL; 140 return pos; 141 } 142 143 /* only happens for @root */ 144 return NULL; 145 } 146 147 /* see cgroup_rstat_flush() */ 148 static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) 149 __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) 150 { 151 int cpu; 152 153 lockdep_assert_held(&cgroup_rstat_lock); 154 155 for_each_possible_cpu(cpu) { 156 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, 157 cpu); 158 struct cgroup *pos = NULL; 159 160 raw_spin_lock(cpu_lock); 161 while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { 162 struct cgroup_subsys_state *css; 163 164 cgroup_base_stat_flush(pos, cpu); 165 166 rcu_read_lock(); 167 list_for_each_entry_rcu(css, &pos->rstat_css_list, 168 rstat_css_node) 169 css->ss->css_rstat_flush(css, cpu); 170 rcu_read_unlock(); 171 } 172 raw_spin_unlock(cpu_lock); 173 174 /* if @may_sleep, play nice and yield if necessary */ 175 if (may_sleep && (need_resched() || 176 spin_needbreak(&cgroup_rstat_lock))) { 177 spin_unlock_irq(&cgroup_rstat_lock); 178 if (!cond_resched()) 179 cpu_relax(); 180 spin_lock_irq(&cgroup_rstat_lock); 181 } 182 } 183 } 184 185 /** 186 * cgroup_rstat_flush - flush stats in @cgrp's subtree 187 * @cgrp: target cgroup 188 * 189 * Collect all per-cpu stats in @cgrp's subtree into the global counters 190 * and propagate them upwards. After this function returns, all cgroups in 191 * the subtree have up-to-date ->stat. 192 * 193 * This also gets all cgroups in the subtree including @cgrp off the 194 * ->updated_children lists. 195 * 196 * This function may block. 197 */ 198 void cgroup_rstat_flush(struct cgroup *cgrp) 199 { 200 might_sleep(); 201 202 spin_lock_irq(&cgroup_rstat_lock); 203 cgroup_rstat_flush_locked(cgrp, true); 204 spin_unlock_irq(&cgroup_rstat_lock); 205 } 206 207 /** 208 * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush() 209 * @cgrp: target cgroup 210 * 211 * This function can be called from any context. 212 */ 213 void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp) 214 { 215 unsigned long flags; 216 217 spin_lock_irqsave(&cgroup_rstat_lock, flags); 218 cgroup_rstat_flush_locked(cgrp, false); 219 spin_unlock_irqrestore(&cgroup_rstat_lock, flags); 220 } 221 222 /** 223 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold 224 * @cgrp: target cgroup 225 * 226 * Flush stats in @cgrp's subtree and prevent further flushes. Must be 227 * paired with cgroup_rstat_flush_release(). 228 * 229 * This function may block. 230 */ 231 void cgroup_rstat_flush_hold(struct cgroup *cgrp) 232 __acquires(&cgroup_rstat_lock) 233 { 234 might_sleep(); 235 spin_lock_irq(&cgroup_rstat_lock); 236 cgroup_rstat_flush_locked(cgrp, true); 237 } 238 239 /** 240 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() 241 */ 242 void cgroup_rstat_flush_release(void) 243 __releases(&cgroup_rstat_lock) 244 { 245 spin_unlock_irq(&cgroup_rstat_lock); 246 } 247 248 int cgroup_rstat_init(struct cgroup *cgrp) 249 { 250 int cpu; 251 252 /* the root cgrp has rstat_cpu preallocated */ 253 if (!cgrp->rstat_cpu) { 254 cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); 255 if (!cgrp->rstat_cpu) 256 return -ENOMEM; 257 } 258 259 /* ->updated_children list is self terminated */ 260 for_each_possible_cpu(cpu) { 261 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); 262 263 rstatc->updated_children = cgrp; 264 u64_stats_init(&rstatc->bsync); 265 } 266 267 return 0; 268 } 269 270 void cgroup_rstat_exit(struct cgroup *cgrp) 271 { 272 int cpu; 273 274 cgroup_rstat_flush(cgrp); 275 276 /* sanity check */ 277 for_each_possible_cpu(cpu) { 278 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); 279 280 if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || 281 WARN_ON_ONCE(rstatc->updated_next)) 282 return; 283 } 284 285 free_percpu(cgrp->rstat_cpu); 286 cgrp->rstat_cpu = NULL; 287 } 288 289 void __init cgroup_rstat_boot(void) 290 { 291 int cpu; 292 293 for_each_possible_cpu(cpu) 294 raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); 295 } 296 297 /* 298 * Functions for cgroup basic resource statistics implemented on top of 299 * rstat. 300 */ 301 static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, 302 struct cgroup_base_stat *src_bstat) 303 { 304 dst_bstat->cputime.utime += src_bstat->cputime.utime; 305 dst_bstat->cputime.stime += src_bstat->cputime.stime; 306 dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; 307 } 308 309 static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, 310 struct cgroup_base_stat *src_bstat) 311 { 312 dst_bstat->cputime.utime -= src_bstat->cputime.utime; 313 dst_bstat->cputime.stime -= src_bstat->cputime.stime; 314 dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; 315 } 316 317 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) 318 { 319 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); 320 struct cgroup *parent = cgroup_parent(cgrp); 321 struct cgroup_base_stat cur, delta; 322 unsigned seq; 323 324 /* Root-level stats are sourced from system-wide CPU stats */ 325 if (!parent) 326 return; 327 328 /* fetch the current per-cpu values */ 329 do { 330 seq = __u64_stats_fetch_begin(&rstatc->bsync); 331 cur.cputime = rstatc->bstat.cputime; 332 } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); 333 334 /* propagate percpu delta to global */ 335 delta = cur; 336 cgroup_base_stat_sub(&delta, &rstatc->last_bstat); 337 cgroup_base_stat_add(&cgrp->bstat, &delta); 338 cgroup_base_stat_add(&rstatc->last_bstat, &delta); 339 340 /* propagate global delta to parent (unless that's root) */ 341 if (cgroup_parent(parent)) { 342 delta = cgrp->bstat; 343 cgroup_base_stat_sub(&delta, &cgrp->last_bstat); 344 cgroup_base_stat_add(&parent->bstat, &delta); 345 cgroup_base_stat_add(&cgrp->last_bstat, &delta); 346 } 347 } 348 349 static struct cgroup_rstat_cpu * 350 cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) 351 { 352 struct cgroup_rstat_cpu *rstatc; 353 354 rstatc = get_cpu_ptr(cgrp->rstat_cpu); 355 *flags = u64_stats_update_begin_irqsave(&rstatc->bsync); 356 return rstatc; 357 } 358 359 static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, 360 struct cgroup_rstat_cpu *rstatc, 361 unsigned long flags) 362 { 363 u64_stats_update_end_irqrestore(&rstatc->bsync, flags); 364 cgroup_rstat_updated(cgrp, smp_processor_id()); 365 put_cpu_ptr(rstatc); 366 } 367 368 void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) 369 { 370 struct cgroup_rstat_cpu *rstatc; 371 unsigned long flags; 372 373 rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); 374 rstatc->bstat.cputime.sum_exec_runtime += delta_exec; 375 cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); 376 } 377 378 void __cgroup_account_cputime_field(struct cgroup *cgrp, 379 enum cpu_usage_stat index, u64 delta_exec) 380 { 381 struct cgroup_rstat_cpu *rstatc; 382 unsigned long flags; 383 384 rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); 385 386 switch (index) { 387 case CPUTIME_USER: 388 case CPUTIME_NICE: 389 rstatc->bstat.cputime.utime += delta_exec; 390 break; 391 case CPUTIME_SYSTEM: 392 case CPUTIME_IRQ: 393 case CPUTIME_SOFTIRQ: 394 rstatc->bstat.cputime.stime += delta_exec; 395 break; 396 default: 397 break; 398 } 399 400 cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); 401 } 402 403 /* 404 * compute the cputime for the root cgroup by getting the per cpu data 405 * at a global level, then categorizing the fields in a manner consistent 406 * with how it is done by __cgroup_account_cputime_field for each bit of 407 * cpu time attributed to a cgroup. 408 */ 409 static void root_cgroup_cputime(struct task_cputime *cputime) 410 { 411 int i; 412 413 cputime->stime = 0; 414 cputime->utime = 0; 415 cputime->sum_exec_runtime = 0; 416 for_each_possible_cpu(i) { 417 struct kernel_cpustat kcpustat; 418 u64 *cpustat = kcpustat.cpustat; 419 u64 user = 0; 420 u64 sys = 0; 421 422 kcpustat_cpu_fetch(&kcpustat, i); 423 424 user += cpustat[CPUTIME_USER]; 425 user += cpustat[CPUTIME_NICE]; 426 cputime->utime += user; 427 428 sys += cpustat[CPUTIME_SYSTEM]; 429 sys += cpustat[CPUTIME_IRQ]; 430 sys += cpustat[CPUTIME_SOFTIRQ]; 431 cputime->stime += sys; 432 433 cputime->sum_exec_runtime += user; 434 cputime->sum_exec_runtime += sys; 435 cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; 436 cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST]; 437 cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE]; 438 } 439 } 440 441 void cgroup_base_stat_cputime_show(struct seq_file *seq) 442 { 443 struct cgroup *cgrp = seq_css(seq)->cgroup; 444 u64 usage, utime, stime; 445 struct task_cputime cputime; 446 447 if (cgroup_parent(cgrp)) { 448 cgroup_rstat_flush_hold(cgrp); 449 usage = cgrp->bstat.cputime.sum_exec_runtime; 450 cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, 451 &utime, &stime); 452 cgroup_rstat_flush_release(); 453 } else { 454 root_cgroup_cputime(&cputime); 455 usage = cputime.sum_exec_runtime; 456 utime = cputime.utime; 457 stime = cputime.stime; 458 } 459 460 do_div(usage, NSEC_PER_USEC); 461 do_div(utime, NSEC_PER_USEC); 462 do_div(stime, NSEC_PER_USEC); 463 464 seq_printf(seq, "usage_usec %llu\n" 465 "user_usec %llu\n" 466 "system_usec %llu\n", 467 usage, utime, stime); 468 } 469