xref: /linux/kernel/cgroup/rstat.c (revision e5e95a7639ed5f7dc3e404858ad7910de5fa2057)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include "cgroup-internal.h"
3 
4 #include <linux/sched/cputime.h>
5 
/*
 * Global lock held across a flush; the flush entry points in this file
 * acquire it with interrupts disabled.
 */
6 static DEFINE_SPINLOCK(cgroup_rstat_lock);
/* Per-cpu lock taken around changes to that cpu's updated_children/updated_next links. */
7 static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
8 
/* Defined further down in this file; called from cgroup_rstat_flush_locked(). */
9 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
10 
11 static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
12 {
13 	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
14 }
15 
16 /**
17  * cgroup_rstat_updated - keep track of updated rstat_cpu
18  * @cgrp: target cgroup
19  * @cpu: cpu on which rstat_cpu was updated
20  *
21  * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
22  * rstat_cpu->updated_children list.  See the comment on top of
23  * cgroup_rstat_cpu definition for details.
24  */
25 void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
26 {
27 	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
28 	struct cgroup *parent;
29 	unsigned long flags;
30 
31 	/* nothing to do for root */
32 	if (!cgroup_parent(cgrp))
33 		return;
34 
35 	/*
36 	 * Speculative already-on-list test. This may race leading to
37 	 * temporary inaccuracies, which is fine.
38 	 *
39 	 * Because @parent's updated_children is terminated with @parent
40 	 * instead of NULL, we can tell whether @cgrp is on the list by
41 	 * testing the next pointer for NULL.
42 	 */
43 	if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
44 		return;
45 
46 	raw_spin_lock_irqsave(cpu_lock, flags);
47 
48 	/* put @cgrp and all ancestors on the corresponding updated lists */
49 	for (parent = cgroup_parent(cgrp); parent;
50 	     cgrp = parent, parent = cgroup_parent(cgrp)) {
51 		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
52 		struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
53 
54 		/*
55 		 * Both additions and removals are bottom-up.  If a cgroup
56 		 * is already in the tree, all ancestors are.
57 		 */
58 		if (rstatc->updated_next)
59 			break;
60 
61 		rstatc->updated_next = prstatc->updated_children;
62 		prstatc->updated_children = cgrp;
63 	}
64 
65 	raw_spin_unlock_irqrestore(cpu_lock, flags);
66 }
67 
68 /**
69  * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
70  * @pos: current position
71  * @root: root of the tree to traversal
72  * @cpu: target cpu
73  *
74  * Walks the udpated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
75  * the traversal and %NULL return indicates the end.  During traversal,
76  * each returned cgroup is unlinked from the tree.  Must be called with the
77  * matching cgroup_rstat_cpu_lock held.
78  *
79  * The only ordering guarantee is that, for a parent and a child pair
80  * covered by a given traversal, if a child is visited, its parent is
81  * guaranteed to be visited afterwards.
82  */
83 static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
84 						   struct cgroup *root, int cpu)
85 {
86 	struct cgroup_rstat_cpu *rstatc;
87 
88 	if (pos == root)
89 		return NULL;
90 
91 	/*
92 	 * We're gonna walk down to the first leaf and visit/remove it.  We
93 	 * can pick whatever unvisited node as the starting point.
94 	 */
95 	if (!pos)
96 		pos = root;
97 	else
98 		pos = cgroup_parent(pos);
99 
100 	/* walk down to the first leaf */
101 	while (true) {
102 		rstatc = cgroup_rstat_cpu(pos, cpu);
103 		if (rstatc->updated_children == pos)
104 			break;
105 		pos = rstatc->updated_children;
106 	}
107 
108 	/*
109 	 * Unlink @pos from the tree.  As the updated_children list is
110 	 * singly linked, we have to walk it to find the removal point.
111 	 * However, due to the way we traverse, @pos will be the first
112 	 * child in most cases. The only exception is @root.
113 	 */
114 	if (rstatc->updated_next) {
115 		struct cgroup *parent = cgroup_parent(pos);
116 		struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
117 		struct cgroup_rstat_cpu *nrstatc;
118 		struct cgroup **nextp;
119 
120 		nextp = &prstatc->updated_children;
121 		while (true) {
122 			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
123 			if (*nextp == pos)
124 				break;
125 
126 			WARN_ON_ONCE(*nextp == parent);
127 			nextp = &nrstatc->updated_next;
128 		}
129 
130 		*nextp = rstatc->updated_next;
131 		rstatc->updated_next = NULL;
132 
133 		return pos;
134 	}
135 
136 	/* only happens for @root */
137 	return NULL;
138 }
139 
140 /* see cgroup_rstat_flush() */
141 static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
142 	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
143 {
144 	int cpu;
145 
146 	lockdep_assert_held(&cgroup_rstat_lock);
147 
148 	for_each_possible_cpu(cpu) {
149 		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
150 						       cpu);
151 		struct cgroup *pos = NULL;
152 
153 		raw_spin_lock(cpu_lock);
154 		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
155 			struct cgroup_subsys_state *css;
156 
157 			cgroup_base_stat_flush(pos, cpu);
158 
159 			rcu_read_lock();
160 			list_for_each_entry_rcu(css, &pos->rstat_css_list,
161 						rstat_css_node)
162 				css->ss->css_rstat_flush(css, cpu);
163 			rcu_read_unlock();
164 		}
165 		raw_spin_unlock(cpu_lock);
166 
167 		/* if @may_sleep, play nice and yield if necessary */
168 		if (may_sleep && (need_resched() ||
169 				  spin_needbreak(&cgroup_rstat_lock))) {
170 			spin_unlock_irq(&cgroup_rstat_lock);
171 			if (!cond_resched())
172 				cpu_relax();
173 			spin_lock_irq(&cgroup_rstat_lock);
174 		}
175 	}
176 }
177 
178 /**
179  * cgroup_rstat_flush - flush stats in @cgrp's subtree
180  * @cgrp: target cgroup
181  *
182  * Collect all per-cpu stats in @cgrp's subtree into the global counters
183  * and propagate them upwards.  After this function returns, all cgroups in
184  * the subtree have up-to-date ->stat.
185  *
186  * This also gets all cgroups in the subtree including @cgrp off the
187  * ->updated_children lists.
188  *
189  * This function may block.
190  */
191 void cgroup_rstat_flush(struct cgroup *cgrp)
192 {
193 	might_sleep();
194 
195 	spin_lock_irq(&cgroup_rstat_lock);
196 	cgroup_rstat_flush_locked(cgrp, true);
197 	spin_unlock_irq(&cgroup_rstat_lock);
198 }
199 
200 /**
201  * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
202  * @cgrp: target cgroup
203  *
204  * This function can be called from any context.
205  */
206 void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
207 {
208 	unsigned long flags;
209 
210 	spin_lock_irqsave(&cgroup_rstat_lock, flags);
211 	cgroup_rstat_flush_locked(cgrp, false);
212 	spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
213 }
214 
215 /**
216  * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold
217  * @cgrp: target cgroup
218  *
219  * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
220  * paired with cgroup_rstat_flush_release().
221  *
222  * This function may block.
223  */
224 void cgroup_rstat_flush_hold(struct cgroup *cgrp)
225 	__acquires(&cgroup_rstat_lock)
226 {
227 	might_sleep();
228 	spin_lock_irq(&cgroup_rstat_lock);
229 	cgroup_rstat_flush_locked(cgrp, true);
230 }
231 
232 /**
233  * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
234  */
235 void cgroup_rstat_flush_release(void)
236 	__releases(&cgroup_rstat_lock)
237 {
238 	spin_unlock_irq(&cgroup_rstat_lock);
239 }
240 
241 int cgroup_rstat_init(struct cgroup *cgrp)
242 {
243 	int cpu;
244 
245 	/* the root cgrp has rstat_cpu preallocated */
246 	if (!cgrp->rstat_cpu) {
247 		cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
248 		if (!cgrp->rstat_cpu)
249 			return -ENOMEM;
250 	}
251 
252 	/* ->updated_children list is self terminated */
253 	for_each_possible_cpu(cpu) {
254 		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
255 
256 		rstatc->updated_children = cgrp;
257 		u64_stats_init(&rstatc->bsync);
258 	}
259 
260 	return 0;
261 }
262 
263 void cgroup_rstat_exit(struct cgroup *cgrp)
264 {
265 	int cpu;
266 
267 	cgroup_rstat_flush(cgrp);
268 
269 	/* sanity check */
270 	for_each_possible_cpu(cpu) {
271 		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
272 
273 		if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
274 		    WARN_ON_ONCE(rstatc->updated_next))
275 			return;
276 	}
277 
278 	free_percpu(cgrp->rstat_cpu);
279 	cgrp->rstat_cpu = NULL;
280 }
281 
282 void __init cgroup_rstat_boot(void)
283 {
284 	int cpu;
285 
286 	for_each_possible_cpu(cpu)
287 		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
288 
289 	BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
290 }
291 
292 /*
293  * Functions for cgroup basic resource statistics implemented on top of
294  * rstat.
295  */
296 static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
297 				 struct cgroup_base_stat *src_bstat)
298 {
299 	dst_bstat->cputime.utime += src_bstat->cputime.utime;
300 	dst_bstat->cputime.stime += src_bstat->cputime.stime;
301 	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
302 }
303 
304 static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
305 				 struct cgroup_base_stat *src_bstat)
306 {
307 	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
308 	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
309 	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
310 }
311 
312 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
313 {
314 	struct cgroup *parent = cgroup_parent(cgrp);
315 	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
316 	struct cgroup_base_stat cur, delta;
317 	unsigned seq;
318 
319 	/* fetch the current per-cpu values */
320 	do {
321 		seq = __u64_stats_fetch_begin(&rstatc->bsync);
322 		cur.cputime = rstatc->bstat.cputime;
323 	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
324 
325 	/* propagate percpu delta to global */
326 	delta = cur;
327 	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
328 	cgroup_base_stat_add(&cgrp->bstat, &delta);
329 	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
330 
331 	/* propagate global delta to parent */
332 	if (parent) {
333 		delta = cgrp->bstat;
334 		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
335 		cgroup_base_stat_add(&parent->bstat, &delta);
336 		cgroup_base_stat_add(&cgrp->last_bstat, &delta);
337 	}
338 }
339 
340 static struct cgroup_rstat_cpu *
341 cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
342 {
343 	struct cgroup_rstat_cpu *rstatc;
344 
345 	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
346 	u64_stats_update_begin(&rstatc->bsync);
347 	return rstatc;
348 }
349 
350 static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
351 						 struct cgroup_rstat_cpu *rstatc)
352 {
353 	u64_stats_update_end(&rstatc->bsync);
354 	cgroup_rstat_updated(cgrp, smp_processor_id());
355 	put_cpu_ptr(rstatc);
356 }
357 
358 void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
359 {
360 	struct cgroup_rstat_cpu *rstatc;
361 
362 	rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
363 	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
364 	cgroup_base_stat_cputime_account_end(cgrp, rstatc);
365 }
366 
367 void __cgroup_account_cputime_field(struct cgroup *cgrp,
368 				    enum cpu_usage_stat index, u64 delta_exec)
369 {
370 	struct cgroup_rstat_cpu *rstatc;
371 
372 	rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
373 
374 	switch (index) {
375 	case CPUTIME_USER:
376 	case CPUTIME_NICE:
377 		rstatc->bstat.cputime.utime += delta_exec;
378 		break;
379 	case CPUTIME_SYSTEM:
380 	case CPUTIME_IRQ:
381 	case CPUTIME_SOFTIRQ:
382 		rstatc->bstat.cputime.stime += delta_exec;
383 		break;
384 	default:
385 		break;
386 	}
387 
388 	cgroup_base_stat_cputime_account_end(cgrp, rstatc);
389 }
390 
391 /*
392  * compute the cputime for the root cgroup by getting the per cpu data
393  * at a global level, then categorizing the fields in a manner consistent
394  * with how it is done by __cgroup_account_cputime_field for each bit of
395  * cpu time attributed to a cgroup.
396  */
397 static void root_cgroup_cputime(struct task_cputime *cputime)
398 {
399 	int i;
400 
401 	cputime->stime = 0;
402 	cputime->utime = 0;
403 	cputime->sum_exec_runtime = 0;
404 	for_each_possible_cpu(i) {
405 		struct kernel_cpustat kcpustat;
406 		u64 *cpustat = kcpustat.cpustat;
407 		u64 user = 0;
408 		u64 sys = 0;
409 
410 		kcpustat_cpu_fetch(&kcpustat, i);
411 
412 		user += cpustat[CPUTIME_USER];
413 		user += cpustat[CPUTIME_NICE];
414 		cputime->utime += user;
415 
416 		sys += cpustat[CPUTIME_SYSTEM];
417 		sys += cpustat[CPUTIME_IRQ];
418 		sys += cpustat[CPUTIME_SOFTIRQ];
419 		cputime->stime += sys;
420 
421 		cputime->sum_exec_runtime += user;
422 		cputime->sum_exec_runtime += sys;
423 		cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
424 		cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST];
425 		cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE];
426 	}
427 }
428 
429 void cgroup_base_stat_cputime_show(struct seq_file *seq)
430 {
431 	struct cgroup *cgrp = seq_css(seq)->cgroup;
432 	u64 usage, utime, stime;
433 	struct task_cputime cputime;
434 
435 	if (cgroup_parent(cgrp)) {
436 		cgroup_rstat_flush_hold(cgrp);
437 		usage = cgrp->bstat.cputime.sum_exec_runtime;
438 		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
439 			       &utime, &stime);
440 		cgroup_rstat_flush_release();
441 	} else {
442 		root_cgroup_cputime(&cputime);
443 		usage = cputime.sum_exec_runtime;
444 		utime = cputime.utime;
445 		stime = cputime.stime;
446 	}
447 
448 	do_div(usage, NSEC_PER_USEC);
449 	do_div(utime, NSEC_PER_USEC);
450 	do_div(stime, NSEC_PER_USEC);
451 
452 	seq_printf(seq, "usage_usec %llu\n"
453 		   "user_usec %llu\n"
454 		   "system_usec %llu\n",
455 		   usage, utime, stime);
456 }
457