1 // SPDX-License-Identifier: GPL-2.0-only 2 #include "cgroup-internal.h" 3 4 #include <linux/sched/cputime.h> 5 6 static DEFINE_SPINLOCK(cgroup_rstat_lock); 7 static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); 8 9 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); 10 11 static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) 12 { 13 return per_cpu_ptr(cgrp->rstat_cpu, cpu); 14 } 15 16 /** 17 * cgroup_rstat_updated - keep track of updated rstat_cpu 18 * @cgrp: target cgroup 19 * @cpu: cpu on which rstat_cpu was updated 20 * 21 * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching 22 * rstat_cpu->updated_children list. See the comment on top of 23 * cgroup_rstat_cpu definition for details. 24 */ 25 void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) 26 { 27 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); 28 struct cgroup *parent; 29 unsigned long flags; 30 31 /* nothing to do for root */ 32 if (!cgroup_parent(cgrp)) 33 return; 34 35 /* 36 * Paired with the one in cgroup_rstat_cpu_pop_upated(). Either we 37 * see NULL updated_next or they see our updated stat. 38 */ 39 smp_mb(); 40 41 /* 42 * Because @parent's updated_children is terminated with @parent 43 * instead of NULL, we can tell whether @cgrp is on the list by 44 * testing the next pointer for NULL. 45 */ 46 if (cgroup_rstat_cpu(cgrp, cpu)->updated_next) 47 return; 48 49 raw_spin_lock_irqsave(cpu_lock, flags); 50 51 /* put @cgrp and all ancestors on the corresponding updated lists */ 52 for (parent = cgroup_parent(cgrp); parent; 53 cgrp = parent, parent = cgroup_parent(cgrp)) { 54 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); 55 struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); 56 57 /* 58 * Both additions and removals are bottom-up. If a cgroup 59 * is already in the tree, all ancestors are. 60 */ 61 if (rstatc->updated_next) 62 break; 63 64 rstatc->updated_next = prstatc->updated_children; 65 prstatc->updated_children = cgrp; 66 } 67 68 raw_spin_unlock_irqrestore(cpu_lock, flags); 69 } 70 EXPORT_SYMBOL_GPL(cgroup_rstat_updated); 71 72 /** 73 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree 74 * @pos: current position 75 * @root: root of the tree to traversal 76 * @cpu: target cpu 77 * 78 * Walks the udpated rstat_cpu tree on @cpu from @root. %NULL @pos starts 79 * the traversal and %NULL return indicates the end. During traversal, 80 * each returned cgroup is unlinked from the tree. Must be called with the 81 * matching cgroup_rstat_cpu_lock held. 82 * 83 * The only ordering guarantee is that, for a parent and a child pair 84 * covered by a given traversal, if a child is visited, its parent is 85 * guaranteed to be visited afterwards. 86 */ 87 static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, 88 struct cgroup *root, int cpu) 89 { 90 struct cgroup_rstat_cpu *rstatc; 91 92 if (pos == root) 93 return NULL; 94 95 /* 96 * We're gonna walk down to the first leaf and visit/remove it. We 97 * can pick whatever unvisited node as the starting point. 98 */ 99 if (!pos) 100 pos = root; 101 else 102 pos = cgroup_parent(pos); 103 104 /* walk down to the first leaf */ 105 while (true) { 106 rstatc = cgroup_rstat_cpu(pos, cpu); 107 if (rstatc->updated_children == pos) 108 break; 109 pos = rstatc->updated_children; 110 } 111 112 /* 113 * Unlink @pos from the tree. As the updated_children list is 114 * singly linked, we have to walk it to find the removal point. 115 * However, due to the way we traverse, @pos will be the first 116 * child in most cases. The only exception is @root. 117 */ 118 if (rstatc->updated_next) { 119 struct cgroup *parent = cgroup_parent(pos); 120 struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); 121 struct cgroup_rstat_cpu *nrstatc; 122 struct cgroup **nextp; 123 124 nextp = &prstatc->updated_children; 125 while (true) { 126 nrstatc = cgroup_rstat_cpu(*nextp, cpu); 127 if (*nextp == pos) 128 break; 129 130 WARN_ON_ONCE(*nextp == parent); 131 nextp = &nrstatc->updated_next; 132 } 133 134 *nextp = rstatc->updated_next; 135 rstatc->updated_next = NULL; 136 137 /* 138 * Paired with the one in cgroup_rstat_cpu_updated(). 139 * Either they see NULL updated_next or we see their 140 * updated stat. 141 */ 142 smp_mb(); 143 144 return pos; 145 } 146 147 /* only happens for @root */ 148 return NULL; 149 } 150 151 /* see cgroup_rstat_flush() */ 152 static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) 153 __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) 154 { 155 int cpu; 156 157 lockdep_assert_held(&cgroup_rstat_lock); 158 159 for_each_possible_cpu(cpu) { 160 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, 161 cpu); 162 struct cgroup *pos = NULL; 163 164 raw_spin_lock(cpu_lock); 165 while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { 166 struct cgroup_subsys_state *css; 167 168 cgroup_base_stat_flush(pos, cpu); 169 170 rcu_read_lock(); 171 list_for_each_entry_rcu(css, &pos->rstat_css_list, 172 rstat_css_node) 173 css->ss->css_rstat_flush(css, cpu); 174 rcu_read_unlock(); 175 } 176 raw_spin_unlock(cpu_lock); 177 178 /* if @may_sleep, play nice and yield if necessary */ 179 if (may_sleep && (need_resched() || 180 spin_needbreak(&cgroup_rstat_lock))) { 181 spin_unlock_irq(&cgroup_rstat_lock); 182 if (!cond_resched()) 183 cpu_relax(); 184 spin_lock_irq(&cgroup_rstat_lock); 185 } 186 } 187 } 188 189 /** 190 * cgroup_rstat_flush - flush stats in @cgrp's subtree 191 * @cgrp: target cgroup 192 * 193 * Collect all per-cpu stats in @cgrp's subtree into the global counters 194 * and propagate them upwards. After this function returns, all cgroups in 195 * the subtree have up-to-date ->stat. 196 * 197 * This also gets all cgroups in the subtree including @cgrp off the 198 * ->updated_children lists. 199 * 200 * This function may block. 201 */ 202 void cgroup_rstat_flush(struct cgroup *cgrp) 203 { 204 might_sleep(); 205 206 spin_lock_irq(&cgroup_rstat_lock); 207 cgroup_rstat_flush_locked(cgrp, true); 208 spin_unlock_irq(&cgroup_rstat_lock); 209 } 210 211 /** 212 * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush() 213 * @cgrp: target cgroup 214 * 215 * This function can be called from any context. 216 */ 217 void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp) 218 { 219 unsigned long flags; 220 221 spin_lock_irqsave(&cgroup_rstat_lock, flags); 222 cgroup_rstat_flush_locked(cgrp, false); 223 spin_unlock_irqrestore(&cgroup_rstat_lock, flags); 224 } 225 226 /** 227 * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold 228 * @cgrp: target cgroup 229 * 230 * Flush stats in @cgrp's subtree and prevent further flushes. Must be 231 * paired with cgroup_rstat_flush_release(). 232 * 233 * This function may block. 234 */ 235 void cgroup_rstat_flush_hold(struct cgroup *cgrp) 236 __acquires(&cgroup_rstat_lock) 237 { 238 might_sleep(); 239 spin_lock_irq(&cgroup_rstat_lock); 240 cgroup_rstat_flush_locked(cgrp, true); 241 } 242 243 /** 244 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() 245 */ 246 void cgroup_rstat_flush_release(void) 247 __releases(&cgroup_rstat_lock) 248 { 249 spin_unlock_irq(&cgroup_rstat_lock); 250 } 251 252 int cgroup_rstat_init(struct cgroup *cgrp) 253 { 254 int cpu; 255 256 /* the root cgrp has rstat_cpu preallocated */ 257 if (!cgrp->rstat_cpu) { 258 cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); 259 if (!cgrp->rstat_cpu) 260 return -ENOMEM; 261 } 262 263 /* ->updated_children list is self terminated */ 264 for_each_possible_cpu(cpu) { 265 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); 266 267 rstatc->updated_children = cgrp; 268 u64_stats_init(&rstatc->bsync); 269 } 270 271 return 0; 272 } 273 274 void cgroup_rstat_exit(struct cgroup *cgrp) 275 { 276 int cpu; 277 278 cgroup_rstat_flush(cgrp); 279 280 /* sanity check */ 281 for_each_possible_cpu(cpu) { 282 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); 283 284 if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || 285 WARN_ON_ONCE(rstatc->updated_next)) 286 return; 287 } 288 289 free_percpu(cgrp->rstat_cpu); 290 cgrp->rstat_cpu = NULL; 291 } 292 293 void __init cgroup_rstat_boot(void) 294 { 295 int cpu; 296 297 for_each_possible_cpu(cpu) 298 raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); 299 300 BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp)); 301 } 302 303 /* 304 * Functions for cgroup basic resource statistics implemented on top of 305 * rstat. 306 */ 307 static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat, 308 struct cgroup_base_stat *src_bstat) 309 { 310 dst_bstat->cputime.utime += src_bstat->cputime.utime; 311 dst_bstat->cputime.stime += src_bstat->cputime.stime; 312 dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; 313 } 314 315 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) 316 { 317 struct cgroup *parent = cgroup_parent(cgrp); 318 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); 319 struct task_cputime *last_cputime = &rstatc->last_bstat.cputime; 320 struct task_cputime cputime; 321 struct cgroup_base_stat delta; 322 unsigned seq; 323 324 /* fetch the current per-cpu values */ 325 do { 326 seq = __u64_stats_fetch_begin(&rstatc->bsync); 327 cputime = rstatc->bstat.cputime; 328 } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); 329 330 /* calculate the delta to propgate */ 331 delta.cputime.utime = cputime.utime - last_cputime->utime; 332 delta.cputime.stime = cputime.stime - last_cputime->stime; 333 delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - 334 last_cputime->sum_exec_runtime; 335 *last_cputime = cputime; 336 337 /* transfer the pending stat into delta */ 338 cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat); 339 memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat)); 340 341 /* propagate delta into the global stat and the parent's pending */ 342 cgroup_base_stat_accumulate(&cgrp->bstat, &delta); 343 if (parent) 344 cgroup_base_stat_accumulate(&parent->pending_bstat, &delta); 345 } 346 347 static struct cgroup_rstat_cpu * 348 cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp) 349 { 350 struct cgroup_rstat_cpu *rstatc; 351 352 rstatc = get_cpu_ptr(cgrp->rstat_cpu); 353 u64_stats_update_begin(&rstatc->bsync); 354 return rstatc; 355 } 356 357 static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, 358 struct cgroup_rstat_cpu *rstatc) 359 { 360 u64_stats_update_end(&rstatc->bsync); 361 cgroup_rstat_updated(cgrp, smp_processor_id()); 362 put_cpu_ptr(rstatc); 363 } 364 365 void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) 366 { 367 struct cgroup_rstat_cpu *rstatc; 368 369 rstatc = cgroup_base_stat_cputime_account_begin(cgrp); 370 rstatc->bstat.cputime.sum_exec_runtime += delta_exec; 371 cgroup_base_stat_cputime_account_end(cgrp, rstatc); 372 } 373 374 void __cgroup_account_cputime_field(struct cgroup *cgrp, 375 enum cpu_usage_stat index, u64 delta_exec) 376 { 377 struct cgroup_rstat_cpu *rstatc; 378 379 rstatc = cgroup_base_stat_cputime_account_begin(cgrp); 380 381 switch (index) { 382 case CPUTIME_USER: 383 case CPUTIME_NICE: 384 rstatc->bstat.cputime.utime += delta_exec; 385 break; 386 case CPUTIME_SYSTEM: 387 case CPUTIME_IRQ: 388 case CPUTIME_SOFTIRQ: 389 rstatc->bstat.cputime.stime += delta_exec; 390 break; 391 default: 392 break; 393 } 394 395 cgroup_base_stat_cputime_account_end(cgrp, rstatc); 396 } 397 398 void cgroup_base_stat_cputime_show(struct seq_file *seq) 399 { 400 struct cgroup *cgrp = seq_css(seq)->cgroup; 401 u64 usage, utime, stime; 402 403 if (!cgroup_parent(cgrp)) 404 return; 405 406 cgroup_rstat_flush_hold(cgrp); 407 usage = cgrp->bstat.cputime.sum_exec_runtime; 408 cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); 409 cgroup_rstat_flush_release(); 410 411 do_div(usage, NSEC_PER_USEC); 412 do_div(utime, NSEC_PER_USEC); 413 do_div(stime, NSEC_PER_USEC); 414 415 seq_printf(seq, "usage_usec %llu\n" 416 "user_usec %llu\n" 417 "system_usec %llu\n", 418 usage, utime, stime); 419 } 420