#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
 * rstat_cpu->updated_children list.  See the comment on top of
 * cgroup_rstat_cpu definition for details.
 */
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
	struct cgroup *parent;
	unsigned long flags;

	/* nothing to do for root */
	if (!cgroup_parent(cgrp))
		return;

	/*
	 * Paired with the one in cgroup_rstat_cpu_pop_updated().  Either we
	 * see NULL updated_next or they see our updated stat.
	 */
	smp_mb();

	/*
	 * Because @parent's updated_children is terminated with @parent
	 * instead of NULL, we can tell whether @cgrp is on the list by
	 * testing the next pointer for NULL.
	 */
	if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* put @cgrp and all ancestors on the corresponding updated lists */
	for (parent = cgroup_parent(cgrp); parent;
	     cgrp = parent, parent = cgroup_parent(cgrp)) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
		struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);

		/*
		 * Both additions and removals are bottom-up.  If a cgroup
		 * is already in the tree, all ancestors are.
		 */
		if (rstatc->updated_next)
			break;

		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = cgrp;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}
EXPORT_SYMBOL_GPL(cgroup_rstat_updated);
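
/*
 * Illustrative sketch only -- my_ss_charge(), my_css() and struct
 * my_ss_cpu_state are hypothetical and not part of this file.  A
 * controller that keeps a per-cpu counter in its subsystem state would
 * bump the counter and then call cgroup_rstat_updated() so that the next
 * flush knows this cgroup has pending per-cpu data on this CPU:
 *
 *	void my_ss_charge(struct cgroup_subsys_state *css, u64 amount)
 *	{
 *		struct my_ss_cpu_state *state;
 *
 *		state = get_cpu_ptr(my_css(css)->cpu_state);
 *		state->my_counter += amount;
 *		cgroup_rstat_updated(css->cgroup, smp_processor_id());
 *		put_cpu_ptr(state);
 *	}
 *
 * The call stays cheap on hot paths: once the cgroup is already on the
 * per-cpu updated tree, the early ->updated_next check above returns
 * without taking the per-cpu lock.
 */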

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
 * the traversal and %NULL return indicates the end.  During traversal,
 * each returned cgroup is unlinked from the tree.  Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
						   struct cgroup *root, int cpu)
{
	struct cgroup_rstat_cpu *rstatc;
	struct cgroup *parent;

	if (pos == root)
		return NULL;

	/*
	 * We're gonna walk down to the first leaf and visit/remove it.  We
	 * can pick whatever unvisited node as the starting point.
	 */
	if (!pos)
		pos = root;
	else
		pos = cgroup_parent(pos);

	/* walk down to the first leaf */
	while (true) {
		rstatc = cgroup_rstat_cpu(pos, cpu);
		if (rstatc->updated_children == pos)
			break;
		pos = rstatc->updated_children;
	}

	/*
	 * Unlink @pos from the tree.  As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 * However, due to the way we traverse, @pos will be the first
	 * child in most cases.  The only exception is @root.
	 */
	parent = cgroup_parent(pos);
	if (parent && rstatc->updated_next) {
		struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
		struct cgroup_rstat_cpu *nrstatc;
		struct cgroup **nextp;

		nextp = &prstatc->updated_children;
		while (true) {
			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
			if (*nextp == pos)
				break;

			WARN_ON_ONCE(*nextp == parent);
			nextp = &nrstatc->updated_next;
		}

		*nextp = rstatc->updated_next;
		rstatc->updated_next = NULL;

		/*
		 * Paired with the one in cgroup_rstat_updated().
		 * Either they see NULL updated_next or we see their
		 * updated stat.
		 */
		smp_mb();
	}

	return pos;
}

/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
	int cpu;

	lockdep_assert_held(&cgroup_rstat_lock);

	for_each_possible_cpu(cpu) {
		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
						       cpu);
		struct cgroup *pos = NULL;

		raw_spin_lock(cpu_lock);
		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
			struct cgroup_subsys_state *css;

			cgroup_base_stat_flush(pos, cpu);

			rcu_read_lock();
			list_for_each_entry_rcu(css, &pos->rstat_css_list,
						rstat_css_node)
				css->ss->css_rstat_flush(css, cpu);
			rcu_read_unlock();
		}
		raw_spin_unlock(cpu_lock);

		/* if @may_sleep, play nice and yield if necessary */
		if (may_sleep && (need_resched() ||
				  spin_needbreak(&cgroup_rstat_lock))) {
			spin_unlock_irq(&cgroup_rstat_lock);
			if (!cond_resched())
				cpu_relax();
			spin_lock_irq(&cgroup_rstat_lock);
		}
	}
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.  After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
void cgroup_rstat_flush(struct cgroup *cgrp)
{
	might_sleep();

	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp, true);
	spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
 * @cgrp: target cgroup
 *
 * This function can be called from any context.
 */
void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
{
	unsigned long flags;

	spin_lock_irqsave(&cgroup_rstat_lock, flags);
	cgroup_rstat_flush_locked(cgrp, false);
	spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
}
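
/*
 * Illustrative sketch only -- the my_* names are hypothetical.  The
 * producer side shown after cgroup_rstat_updated() above is typically
 * paired with a ->css_rstat_flush() callback that folds the per-cpu
 * delta into a global counter, plus a reader that flushes before
 * reporting:
 *
 *	static void my_ss_css_rstat_flush(struct cgroup_subsys_state *css,
 *					  int cpu)
 *	{
 *		struct my_ss_cpu_state *state =
 *			per_cpu_ptr(my_css(css)->cpu_state, cpu);
 *		u64 delta = state->my_counter - state->my_counter_last;
 *
 *		state->my_counter_last = state->my_counter;
 *		my_css(css)->total += delta;
 *	}
 *
 *	static int my_ss_stat_show(struct seq_file *seq, void *v)
 *	{
 *		struct cgroup_subsys_state *css = seq_css(seq);
 *
 *		cgroup_rstat_flush(css->cgroup);
 *		seq_printf(seq, "total %llu\n", my_css(css)->total);
 *		return 0;
 *	}
 *
 * cgroup_rstat_flush_irqsafe() exists for callers which cannot sleep; it
 * passes may_sleep=false so cgroup_rstat_flush_locked() skips the
 * cond_resched() yielding between CPUs.
 */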

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
	__acquires(&cgroup_rstat_lock)
{
	might_sleep();
	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp, true);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 */
void cgroup_rstat_flush_release(void)
	__releases(&cgroup_rstat_lock)
{
	spin_unlock_irq(&cgroup_rstat_lock);
}

int cgroup_rstat_init(struct cgroup *cgrp)
{
	int cpu;

	/* the root cgrp has rstat_cpu preallocated */
	if (!cgrp->rstat_cpu) {
		cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
		if (!cgrp->rstat_cpu)
			return -ENOMEM;
	}

	/* ->updated_children list is self terminated */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		rstatc->updated_children = cgrp;
		u64_stats_init(&rstatc->bsync);
	}

	return 0;
}

void cgroup_rstat_exit(struct cgroup *cgrp)
{
	int cpu;

	cgroup_rstat_flush(cgrp);

	/* sanity check */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
		    WARN_ON_ONCE(rstatc->updated_next))
			return;
	}

	free_percpu(cgrp->rstat_cpu);
	cgrp->rstat_cpu = NULL;
}

void __init cgroup_rstat_boot(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));

	BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat,
					struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	struct task_cputime *last_cputime = &rstatc->last_bstat.cputime;
	struct task_cputime cputime;
	struct cgroup_base_stat delta;
	unsigned seq;

	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&rstatc->bsync);
		cputime = rstatc->bstat.cputime;
	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

	/* calculate the delta to propagate */
	delta.cputime.utime = cputime.utime - last_cputime->utime;
	delta.cputime.stime = cputime.stime - last_cputime->stime;
	delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
					 last_cputime->sum_exec_runtime;
	*last_cputime = cputime;

	/* transfer the pending stat into delta */
	cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat);
	memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat));

	/* propagate delta into the global stat and the parent's pending */
	cgroup_base_stat_accumulate(&cgrp->bstat, &delta);
	if (parent)
		cgroup_base_stat_accumulate(&parent->pending_bstat, &delta);
}
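
/*
 * A small worked example of the propagation above (the numbers are made
 * up): suppose a child cgroup C has accumulated bstat.cputime.utime of
 * 700 on CPU1 while its last_bstat snapshot holds 500 from the previous
 * flush.  Flushing C for CPU1 computes delta.utime of 200, updates the
 * snapshot to 700, then adds 200 both to C->bstat (C's own global
 * counter) and to the parent P's pending_bstat.  Because
 * cgroup_rstat_cpu_pop_updated() visits children before their parents,
 * by the time P is flushed in the same walk the 200 is already sitting
 * in P->pending_bstat, gets folded into P's delta, and keeps propagating
 * toward the root.
 */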

static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
	u64_stats_update_begin(&rstatc->bsync);
	return rstatc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
						 struct cgroup_rstat_cpu *rstatc)
{
	u64_stats_update_end(&rstatc->bsync);
	cgroup_rstat_updated(cgrp, smp_processor_id());
	put_cpu_ptr(rstatc);
}

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
	cgroup_base_stat_cputime_account_end(cgrp, rstatc);
}

void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp);

	switch (index) {
	case CPUTIME_USER:
	case CPUTIME_NICE:
		rstatc->bstat.cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		rstatc->bstat.cputime.stime += delta_exec;
		break;
	default:
		break;
	}

	cgroup_base_stat_cputime_account_end(cgrp, rstatc);
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	u64 usage, utime, stime;

	if (!cgroup_parent(cgrp))
		return;

	cgroup_rstat_flush_hold(cgrp);
	usage = cgrp->bstat.cputime.sum_exec_runtime;
	cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime);
	cgroup_rstat_flush_release();

	do_div(usage, NSEC_PER_USEC);
	do_div(utime, NSEC_PER_USEC);
	do_div(stime, NSEC_PER_USEC);

	seq_printf(seq, "usage_usec %llu\n"
		   "user_usec %llu\n"
		   "system_usec %llu\n",
		   usage, utime, stime);
}
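
/*
 * Example output (the values are made up for illustration):
 * cgroup_base_stat_cputime_show() produces the microsecond-granularity
 * base stat lines seen in cgroup2 "cpu.stat", e.g.:
 *
 *	usage_usec 153000
 *	user_usec 101000
 *	system_usec 52000
 *
 * Note the flush_hold/release pair: the three fields are read and run
 * through cputime_adjust() while cgroup_rstat_lock is held, so they form
 * a mutually consistent snapshot.
 */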