xref: /linux/kernel/cgroup/rdma.c (revision 0d25e3865841ea5edfedb5af42bf15cef075192e)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * RDMA resource limiting controller for cgroups.
4  *
5  * Used to allow a cgroup hierarchy to stop processes from consuming
6  * additional RDMA resources after a certain limit is reached.
7  *
8  * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
9  */
10 
11 #include <linux/bitops.h>
12 #include <linux/limits.h>
13 #include <linux/slab.h>
14 #include <linux/seq_file.h>
15 #include <linux/cgroup.h>
16 #include <linux/parser.h>
17 #include <linux/cgroup_rdma.h>
18 
19 #define RDMACG_MAX_STR "max"
20 
/*
 * Token ids produced when parsing a write to the "max" file.
 * A *_VAL token carries a numeric limit, a *_MAX token is the literal
 * "max" keyword. NR_RDMACG_LIMIT_TOKENS doubles as the id of the
 * terminating (NULL pattern) match table entry.
 */
enum rdmacg_limit_tokens {
	RDMACG_HCA_HANDLE_VAL	= 0,
	RDMACG_HCA_HANDLE_MAX	= 1,
	RDMACG_HCA_OBJECT_VAL	= 2,
	RDMACG_HCA_OBJECT_MAX	= 3,
	NR_RDMACG_LIMIT_TOKENS	= 4,
};
28 
29 static const match_table_t rdmacg_limit_tokens = {
30 	{ RDMACG_HCA_HANDLE_VAL,	"hca_handle=%d"	},
31 	{ RDMACG_HCA_HANDLE_MAX,	"hca_handle=max"	},
32 	{ RDMACG_HCA_OBJECT_VAL,	"hca_object=%d"	},
33 	{ RDMACG_HCA_OBJECT_MAX,	"hca_object=max"	},
34 	{ NR_RDMACG_LIMIT_TOKENS,	NULL			},
35 };
36 
/* All rdmacg devices currently registered with the controller. */
static LIST_HEAD(rdmacg_devices);

/*
 * Serializes access to rdmacg_devices and to the resource pool lists
 * hanging off every rdma cgroup and every rdmacg device.
 */
static DEFINE_MUTEX(rdmacg_mutex);
43 
/*
 * Selects which per-resource value a cgroup file reports. The value is
 * stored in cftype->private and consumed by print_rpool_values().
 */
enum rdmacg_file_type {
	RDMACG_RESOURCE_TYPE_MAX	= 0,	/* configured limit */
	RDMACG_RESOURCE_TYPE_STAT	= 1,	/* current usage */
	RDMACG_RESOURCE_TYPE_PEAK	= 2,	/* highest usage recorded */
};
49 
50 /*
51  * resource table definition as to be seen by the user.
52  * Need to add entries to it when more resources are
53  * added/defined at IB verb/core layer.
54  */
55 static char const *rdmacg_resource_names[] = {
56 	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
57 	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
58 };
59 
/**
 * struct rdmacg_resource - accounting state of one resource type
 * @max:   configured limit; S32_MAX means no limit ("max")
 * @usage: number of charges currently held
 * @peak:  highest @usage recorded after a successful charge
 */
struct rdmacg_resource {
	int max;
	int usage;
	int peak;
};
66 
67 /*
68  * resource pool object which represents per cgroup, per device
69  * resources. There are multiple instances of this object per cgroup,
70  * therefore it cannot be embedded within rdma_cgroup structure. It
71  * is maintained as list.
72  */
73 struct rdmacg_resource_pool {
74 	struct rdmacg_device	*device;
75 	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];
76 
77 	struct list_head	cg_node;
78 	struct list_head	dev_node;
79 
80 	/* count active user tasks of this pool */
81 	u64			usage_sum;
82 	/* total number counts which are set to max */
83 	int			num_max_cnt;
84 
85 	/* per-resource event counters */
86 	u64			events_max[RDMACG_RESOURCE_MAX];
87 	u64			events_alloc_fail[RDMACG_RESOURCE_MAX];
88 	u64			events_local_max[RDMACG_RESOURCE_MAX];
89 	u64			events_local_alloc_fail[RDMACG_RESOURCE_MAX];
90 };
91 
92 static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
93 {
94 	return container_of(css, struct rdma_cgroup, css);
95 }
96 
97 static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
98 {
99 	return css_rdmacg(cg->css.parent);
100 }
101 
102 static inline struct rdma_cgroup *get_current_rdmacg(void)
103 {
104 	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
105 }
106 
107 static void set_resource_limit(struct rdmacg_resource_pool *rpool,
108 			       int index, int new_max)
109 {
110 	if (new_max == S32_MAX) {
111 		if (rpool->resources[index].max != S32_MAX)
112 			rpool->num_max_cnt++;
113 	} else {
114 		if (rpool->resources[index].max == S32_MAX)
115 			rpool->num_max_cnt--;
116 	}
117 	rpool->resources[index].max = new_max;
118 }
119 
120 static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
121 {
122 	int i;
123 
124 	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
125 		set_resource_limit(rpool, i, S32_MAX);
126 }
127 
128 static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
129 {
130 	lockdep_assert_held(&rdmacg_mutex);
131 
132 	list_del(&rpool->cg_node);
133 	list_del(&rpool->dev_node);
134 	kfree(rpool);
135 }
136 
137 static bool rpool_has_persistent_state(struct rdmacg_resource_pool *rpool)
138 {
139 	int i;
140 
141 	/*
142 	 * Keep the rpool alive if any peak value is non-zero,
143 	 * so that rdma.peak persists as a historical high-
144 	 * watermark even after all resources are freed.
145 	 */
146 	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
147 		if (rpool->resources[i].peak ||
148 		    rpool->events_max[i] ||
149 		    rpool->events_local_max[i] ||
150 		    rpool->events_alloc_fail[i] ||
151 		    rpool->events_local_alloc_fail[i])
152 			return true;
153 	}
154 	return false;
155 }
156 
157 static struct rdmacg_resource_pool *
158 find_cg_rpool_locked(struct rdma_cgroup *cg,
159 		     struct rdmacg_device *device)
160 
161 {
162 	struct rdmacg_resource_pool *pool;
163 
164 	lockdep_assert_held(&rdmacg_mutex);
165 
166 	list_for_each_entry(pool, &cg->rpools, cg_node)
167 		if (pool->device == device)
168 			return pool;
169 
170 	return NULL;
171 }
172 
173 static struct rdmacg_resource_pool *
174 get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
175 {
176 	struct rdmacg_resource_pool *rpool;
177 
178 	rpool = find_cg_rpool_locked(cg, device);
179 	if (rpool)
180 		return rpool;
181 
182 	rpool = kzalloc_obj(*rpool);
183 	if (!rpool)
184 		return ERR_PTR(-ENOMEM);
185 
186 	rpool->device = device;
187 	set_all_resource_max_limit(rpool);
188 
189 	INIT_LIST_HEAD(&rpool->cg_node);
190 	INIT_LIST_HEAD(&rpool->dev_node);
191 	list_add_tail(&rpool->cg_node, &cg->rpools);
192 	list_add_tail(&rpool->dev_node, &device->rpools);
193 	return rpool;
194 }
195 
196 /**
197  * uncharge_cg_locked - uncharge resource for rdma cgroup
198  * @cg: pointer to cg to uncharge and all parents in hierarchy
199  * @device: pointer to rdmacg device
200  * @index: index of the resource to uncharge in cg (resource pool)
201  *
202  * It also frees the resource pool which was created as part of
203  * charging operation when there are no resources attached to
204  * resource pool.
205  */
206 static void
207 uncharge_cg_locked(struct rdma_cgroup *cg,
208 		   struct rdmacg_device *device,
209 		   enum rdmacg_resource_type index)
210 {
211 	struct rdmacg_resource_pool *rpool;
212 
213 	rpool = find_cg_rpool_locked(cg, device);
214 
215 	/*
216 	 * rpool cannot be null at this stage. Let kernel operate in case
217 	 * if there a bug in IB stack or rdma controller, instead of crashing
218 	 * the system.
219 	 */
220 	if (unlikely(!rpool)) {
221 		pr_warn("Invalid device %p or rdma cgroup %p\n", device, cg);
222 		return;
223 	}
224 
225 	rpool->resources[index].usage--;
226 
227 	/*
228 	 * A negative count (or overflow) is invalid,
229 	 * it indicates a bug in the rdma controller.
230 	 */
231 	WARN_ON_ONCE(rpool->resources[index].usage < 0);
232 	rpool->usage_sum--;
233 	if (rpool->usage_sum == 0 &&
234 	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
235 		if (!rpool_has_persistent_state(rpool)) {
236 			/*
237 			 * No user of the rpool and all entries are set to max, so
238 			 * safe to delete this rpool.
239 			 */
240 			free_cg_rpool_locked(rpool);
241 		}
242 	}
243 }
244 
245 /**
246  * rdmacg_event_locked - fire event when resource allocation exceeds limit
247  * @cg: requesting cgroup
248  * @over_cg: cgroup whose limit was exceeded
249  * @device: rdma device
250  * @index: resource type index
251  *
252  * Must be called under rdmacg_mutex. Updates event counters in the
253  * resource pools of @cg and @over_cg, propagates hierarchical max
254  * events from @over_cg (including itself) upward, and notifies
255  * userspace via cgroup_file_notify().
256  */
257 static void rdmacg_event_locked(struct rdma_cgroup *cg,
258 				struct rdma_cgroup *over_cg,
259 				struct rdmacg_device *device,
260 				enum rdmacg_resource_type index)
261 {
262 	struct rdmacg_resource_pool *rpool;
263 	struct rdma_cgroup *p;
264 
265 	lockdep_assert_held(&rdmacg_mutex);
266 
267 	/* Increment local alloc_fail in requesting cgroup */
268 	rpool = find_cg_rpool_locked(cg, device);
269 	if (rpool) {
270 		rpool->events_local_alloc_fail[index]++;
271 		cgroup_file_notify(&cg->events_local_file);
272 	}
273 
274 	/* Increment local max in the over-limit cgroup */
275 	rpool = find_cg_rpool_locked(over_cg, device);
276 	if (rpool) {
277 		rpool->events_local_max[index]++;
278 		cgroup_file_notify(&over_cg->events_local_file);
279 	}
280 
281 	/* Propagate hierarchical max events upward */
282 	for (p = over_cg; parent_rdmacg(p); p = parent_rdmacg(p)) {
283 		rpool = get_cg_rpool_locked(p, device);
284 		if (!IS_ERR(rpool)) {
285 			rpool->events_max[index]++;
286 			cgroup_file_notify(&p->events_file);
287 		}
288 	}
289 	/* Propagate hierarchical alloc_fail from requesting cgroup upward */
290 	for (p = cg; parent_rdmacg(p); p = parent_rdmacg(p)) {
291 		rpool = get_cg_rpool_locked(p, device);
292 		if (!IS_ERR(rpool)) {
293 			rpool->events_alloc_fail[index]++;
294 			cgroup_file_notify(&p->events_file);
295 		}
296 	}
297 }
298 
299 /**
300  * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
301  * @cg: pointer to cg to uncharge and all parents in hierarchy
302  * @device: pointer to rdmacg device
303  * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
304  *           stop uncharging
305  * @index: index of the resource to uncharge in cg in given resource pool
306  */
307 static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
308 				     struct rdmacg_device *device,
309 				     struct rdma_cgroup *stop_cg,
310 				     enum rdmacg_resource_type index)
311 {
312 	struct rdma_cgroup *p;
313 
314 	mutex_lock(&rdmacg_mutex);
315 
316 	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
317 		uncharge_cg_locked(p, device, index);
318 
319 	mutex_unlock(&rdmacg_mutex);
320 
321 	css_put(&cg->css);
322 }
323 
324 /**
325  * rdmacg_uncharge - hierarchically uncharge rdma resource count
326  * @cg: pointer to cg to uncharge and all parents in hierarchy
327  * @device: pointer to rdmacg device
328  * @index: index of the resource to uncharge in cgroup in given resource pool
329  */
330 void rdmacg_uncharge(struct rdma_cgroup *cg,
331 		     struct rdmacg_device *device,
332 		     enum rdmacg_resource_type index)
333 {
334 	if (index >= RDMACG_RESOURCE_MAX)
335 		return;
336 
337 	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
338 }
339 EXPORT_SYMBOL(rdmacg_uncharge);
340 
341 /**
342  * rdmacg_try_charge - hierarchically try to charge the rdma resource
343  * @rdmacg: pointer to rdma cgroup which will own this resource
344  * @device: pointer to rdmacg device
345  * @index: index of the resource to charge in cgroup (resource pool)
346  *
347  * This function follows charging resource in hierarchical way.
348  * It will fail if the charge would cause the new value to exceed the
349  * hierarchical limit.
350  * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
351  * Returns pointer to rdmacg for this resource when charging is successful.
352  *
353  * Charger needs to account resources on two criteria.
354  * (a) per cgroup & (b) per device resource usage.
355  * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
356  * the configured limits. Per device provides granular configuration
357  * in multi device usage. It allocates resource pool in the hierarchy
358  * for each parent it come across for first resource. Later on resource
359  * pool will be available. Therefore it will be much faster thereon
360  * to charge/uncharge.
361  */
362 int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
363 		      struct rdmacg_device *device,
364 		      enum rdmacg_resource_type index)
365 {
366 	struct rdma_cgroup *cg, *p;
367 	struct rdmacg_resource_pool *rpool;
368 	s64 new;
369 	int ret = 0;
370 
371 	if (index >= RDMACG_RESOURCE_MAX)
372 		return -EINVAL;
373 
374 	/*
375 	 * hold on to css, as cgroup can be removed but resource
376 	 * accounting happens on css.
377 	 */
378 	cg = get_current_rdmacg();
379 
380 	mutex_lock(&rdmacg_mutex);
381 	for (p = cg; p; p = parent_rdmacg(p)) {
382 		rpool = get_cg_rpool_locked(p, device);
383 		if (IS_ERR(rpool)) {
384 			ret = PTR_ERR(rpool);
385 			goto err;
386 		} else {
387 			new = (s64)rpool->resources[index].usage + 1;
388 			if (new > rpool->resources[index].max) {
389 				ret = -EAGAIN;
390 				goto err;
391 			} else {
392 				rpool->resources[index].usage = new;
393 				rpool->usage_sum++;
394 			}
395 		}
396 	}
397 	/* Update peak only after all charges succeed */
398 	for (p = cg; p; p = parent_rdmacg(p)) {
399 		rpool = find_cg_rpool_locked(p, device);
400 		if (rpool && rpool->resources[index].usage > rpool->resources[index].peak)
401 			rpool->resources[index].peak = rpool->resources[index].usage;
402 	}
403 	mutex_unlock(&rdmacg_mutex);
404 
405 	*rdmacg = cg;
406 	return 0;
407 
408 err:
409 	if (ret == -EAGAIN)
410 		rdmacg_event_locked(cg, p, device, index);
411 	mutex_unlock(&rdmacg_mutex);
412 	rdmacg_uncharge_hierarchy(cg, device, p, index);
413 	return ret;
414 }
415 EXPORT_SYMBOL(rdmacg_try_charge);
416 
417 /**
418  * rdmacg_register_device - register rdmacg device to rdma controller.
419  * @device: pointer to rdmacg device whose resources need to be accounted.
420  *
421  * If IB stack wish a device to participate in rdma cgroup resource
422  * tracking, it must invoke this API to register with rdma cgroup before
423  * any user space application can start using the RDMA resources.
424  */
425 void rdmacg_register_device(struct rdmacg_device *device)
426 {
427 	INIT_LIST_HEAD(&device->dev_node);
428 	INIT_LIST_HEAD(&device->rpools);
429 
430 	mutex_lock(&rdmacg_mutex);
431 	list_add_tail(&device->dev_node, &rdmacg_devices);
432 	mutex_unlock(&rdmacg_mutex);
433 }
434 EXPORT_SYMBOL(rdmacg_register_device);
435 
436 /**
437  * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
438  * @device: pointer to rdmacg device which was previously registered with rdma
439  *          controller using rdmacg_register_device().
440  *
441  * IB stack must invoke this after all the resources of the IB device
442  * are destroyed and after ensuring that no more resources will be created
443  * when this API is invoked.
444  */
445 void rdmacg_unregister_device(struct rdmacg_device *device)
446 {
447 	struct rdmacg_resource_pool *rpool, *tmp;
448 
449 	/*
450 	 * Synchronize with any active resource settings,
451 	 * usage query happening via configfs.
452 	 */
453 	mutex_lock(&rdmacg_mutex);
454 	list_del_init(&device->dev_node);
455 
456 	/*
457 	 * Now that this device is off the cgroup list, its safe to free
458 	 * all the rpool resources.
459 	 */
460 	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
461 		free_cg_rpool_locked(rpool);
462 
463 	mutex_unlock(&rdmacg_mutex);
464 }
465 EXPORT_SYMBOL(rdmacg_unregister_device);
466 
467 static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
468 {
469 	struct rdmacg_device *device;
470 
471 	lockdep_assert_held(&rdmacg_mutex);
472 
473 	list_for_each_entry(device, &rdmacg_devices, dev_node)
474 		if (!strcmp(name, device->name))
475 			return device;
476 
477 	return NULL;
478 }
479 
480 static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
481 				       char *buf, size_t nbytes, loff_t off)
482 {
483 	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
484 	const char *dev_name;
485 	struct rdmacg_resource_pool *rpool;
486 	struct rdmacg_device *device;
487 	char *options = strstrip(buf);
488 	char *p;
489 	int *new_limits;
490 	unsigned long enables = 0;
491 	int i = 0, ret = 0;
492 
493 	/* extract the device name first */
494 	dev_name = strsep(&options, " ");
495 	if (!dev_name) {
496 		ret = -EINVAL;
497 		goto err;
498 	}
499 
500 	new_limits = kzalloc_objs(int, RDMACG_RESOURCE_MAX);
501 	if (!new_limits) {
502 		ret = -ENOMEM;
503 		goto err;
504 	}
505 
506 	/* parse resource limit tokens */
507 	while ((p = strsep(&options, " \t\n"))) {
508 		substring_t args[MAX_OPT_ARGS];
509 		int tok, intval;
510 
511 		if (!*p)
512 			continue;
513 
514 		tok = match_token(p, rdmacg_limit_tokens, args);
515 		switch (tok) {
516 		case RDMACG_HCA_HANDLE_VAL:
517 			if (match_int(&args[0], &intval) || intval < 0) {
518 				ret = -EINVAL;
519 				goto parse_err;
520 			}
521 			new_limits[RDMACG_RESOURCE_HCA_HANDLE] = intval;
522 			enables |= BIT(RDMACG_RESOURCE_HCA_HANDLE);
523 			break;
524 		case RDMACG_HCA_HANDLE_MAX:
525 			new_limits[RDMACG_RESOURCE_HCA_HANDLE] = S32_MAX;
526 			enables |= BIT(RDMACG_RESOURCE_HCA_HANDLE);
527 			break;
528 		case RDMACG_HCA_OBJECT_VAL:
529 			if (match_int(&args[0], &intval) || intval < 0) {
530 				ret = -EINVAL;
531 				goto parse_err;
532 			}
533 			new_limits[RDMACG_RESOURCE_HCA_OBJECT] = intval;
534 			enables |= BIT(RDMACG_RESOURCE_HCA_OBJECT);
535 			break;
536 		case RDMACG_HCA_OBJECT_MAX:
537 			new_limits[RDMACG_RESOURCE_HCA_OBJECT] = S32_MAX;
538 			enables |= BIT(RDMACG_RESOURCE_HCA_OBJECT);
539 			break;
540 		default:
541 			ret = -EINVAL;
542 			goto parse_err;
543 		}
544 	}
545 
546 	/* acquire lock to synchronize with hot plug devices */
547 	mutex_lock(&rdmacg_mutex);
548 
549 	device = rdmacg_get_device_locked(dev_name);
550 	if (!device) {
551 		ret = -ENODEV;
552 		goto dev_err;
553 	}
554 
555 	rpool = get_cg_rpool_locked(cg, device);
556 	if (IS_ERR(rpool)) {
557 		ret = PTR_ERR(rpool);
558 		goto dev_err;
559 	}
560 
561 	/* now set the new limits of the rpool */
562 	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
563 		set_resource_limit(rpool, i, new_limits[i]);
564 
565 	if (rpool->usage_sum == 0 &&
566 	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
567 		if (!rpool_has_persistent_state(rpool)) {
568 			/*
569 			 * No user of the rpool and all entries are set to max, so
570 			 * safe to delete this rpool.
571 			 */
572 			free_cg_rpool_locked(rpool);
573 		}
574 	}
575 
576 dev_err:
577 	mutex_unlock(&rdmacg_mutex);
578 
579 parse_err:
580 	kfree(new_limits);
581 
582 err:
583 	return ret ?: nbytes;
584 }
585 
586 static void print_rpool_values(struct seq_file *sf,
587 			       struct rdmacg_resource_pool *rpool)
588 {
589 	enum rdmacg_file_type sf_type;
590 	int i;
591 	u32 value;
592 
593 	sf_type = seq_cft(sf)->private;
594 
595 	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
596 		seq_puts(sf, rdmacg_resource_names[i]);
597 		seq_putc(sf, '=');
598 		if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
599 			if (rpool)
600 				value = rpool->resources[i].max;
601 			else
602 				value = S32_MAX;
603 		} else if (sf_type == RDMACG_RESOURCE_TYPE_PEAK) {
604 			value = rpool ? rpool->resources[i].peak : 0;
605 		} else {
606 			if (rpool)
607 				value = rpool->resources[i].usage;
608 			else
609 				value = 0;
610 		}
611 
612 		if (value == S32_MAX)
613 			seq_puts(sf, RDMACG_MAX_STR);
614 		else
615 			seq_printf(sf, "%d", value);
616 		seq_putc(sf, ' ');
617 	}
618 }
619 
620 static int rdmacg_resource_read(struct seq_file *sf, void *v)
621 {
622 	struct rdmacg_device *device;
623 	struct rdmacg_resource_pool *rpool;
624 	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
625 
626 	mutex_lock(&rdmacg_mutex);
627 
628 	list_for_each_entry(device, &rdmacg_devices, dev_node) {
629 		seq_printf(sf, "%s ", device->name);
630 
631 		rpool = find_cg_rpool_locked(cg, device);
632 		print_rpool_values(sf, rpool);
633 
634 		seq_putc(sf, '\n');
635 	}
636 
637 	mutex_unlock(&rdmacg_mutex);
638 	return 0;
639 }
640 
641 static int rdmacg_events_show(struct seq_file *sf, void *v)
642 {
643 	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
644 	struct rdmacg_resource_pool *rpool;
645 	struct rdmacg_device *device;
646 	int i;
647 
648 	mutex_lock(&rdmacg_mutex);
649 
650 	list_for_each_entry(device, &rdmacg_devices, dev_node) {
651 		rpool = find_cg_rpool_locked(cg, device);
652 
653 		seq_printf(sf, "%s ", device->name);
654 		for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
655 			seq_printf(sf, "%s.max=%llu %s.alloc_fail=%llu",
656 				   rdmacg_resource_names[i],
657 				   rpool ? rpool->events_max[i] : 0ULL,
658 				   rdmacg_resource_names[i],
659 				   rpool ? rpool->events_alloc_fail[i] : 0ULL);
660 			if (i < RDMACG_RESOURCE_MAX - 1)
661 				seq_putc(sf, ' ');
662 		}
663 		seq_putc(sf, '\n');
664 	}
665 
666 	mutex_unlock(&rdmacg_mutex);
667 	return 0;
668 }
669 
670 static int rdmacg_events_local_show(struct seq_file *sf, void *v)
671 {
672 	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
673 	struct rdmacg_resource_pool *rpool;
674 	struct rdmacg_device *device;
675 	int i;
676 
677 	mutex_lock(&rdmacg_mutex);
678 
679 	list_for_each_entry(device, &rdmacg_devices, dev_node) {
680 		rpool = find_cg_rpool_locked(cg, device);
681 
682 		seq_printf(sf, "%s ", device->name);
683 		for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
684 			seq_printf(sf, "%s.max=%llu %s.alloc_fail=%llu",
685 				   rdmacg_resource_names[i],
686 				   rpool ? rpool->events_local_max[i] : 0ULL,
687 				   rdmacg_resource_names[i],
688 				   rpool ? rpool->events_local_alloc_fail[i] : 0ULL);
689 			if (i < RDMACG_RESOURCE_MAX - 1)
690 				seq_putc(sf, ' ');
691 		}
692 		seq_putc(sf, '\n');
693 	}
694 
695 	mutex_unlock(&rdmacg_mutex);
696 	return 0;
697 }
698 
699 static struct cftype rdmacg_files[] = {
700 	{
701 		.name = "max",
702 		.write = rdmacg_resource_set_max,
703 		.seq_show = rdmacg_resource_read,
704 		.private = RDMACG_RESOURCE_TYPE_MAX,
705 		.flags = CFTYPE_NOT_ON_ROOT,
706 	},
707 	{
708 		.name = "current",
709 		.seq_show = rdmacg_resource_read,
710 		.private = RDMACG_RESOURCE_TYPE_STAT,
711 		.flags = CFTYPE_NOT_ON_ROOT,
712 	},
713 	{
714 		.name = "peak",
715 		.seq_show = rdmacg_resource_read,
716 		.private = RDMACG_RESOURCE_TYPE_PEAK,
717 		.flags = CFTYPE_NOT_ON_ROOT,
718 	},
719 	{
720 		.name = "events",
721 		.seq_show = rdmacg_events_show,
722 		.file_offset = offsetof(struct rdma_cgroup, events_file),
723 		.flags = CFTYPE_NOT_ON_ROOT,
724 	},
725 	{
726 		.name = "events.local",
727 		.seq_show = rdmacg_events_local_show,
728 		.file_offset = offsetof(struct rdma_cgroup, events_local_file),
729 		.flags = CFTYPE_NOT_ON_ROOT,
730 	},
731 	{ }	/* terminate */
732 };
733 
734 static struct cgroup_subsys_state *
735 rdmacg_css_alloc(struct cgroup_subsys_state *parent)
736 {
737 	struct rdma_cgroup *cg;
738 
739 	cg = kzalloc_obj(*cg);
740 	if (!cg)
741 		return ERR_PTR(-ENOMEM);
742 
743 	INIT_LIST_HEAD(&cg->rpools);
744 	return &cg->css;
745 }
746 
747 static void rdmacg_css_free(struct cgroup_subsys_state *css)
748 {
749 	struct rdma_cgroup *cg = css_rdmacg(css);
750 	struct rdmacg_resource_pool *rpool, *tmp;
751 
752 	/* Clean up rpools kept alive by non-zero peak values */
753 	mutex_lock(&rdmacg_mutex);
754 	list_for_each_entry_safe(rpool, tmp, &cg->rpools, cg_node)
755 		free_cg_rpool_locked(rpool);
756 	mutex_unlock(&rdmacg_mutex);
757 
758 	kfree(cg);
759 }
760 
761 /**
762  * rdmacg_css_offline - cgroup css_offline callback
763  * @css: css of interest
764  *
765  * This function is called when @css is about to go away and responsible
766  * for shooting down all rdmacg associated with @css. As part of that it
767  * marks all the resource pool entries to max value, so that when resources are
768  * uncharged, associated resource pool can be freed as well.
769  */
770 static void rdmacg_css_offline(struct cgroup_subsys_state *css)
771 {
772 	struct rdma_cgroup *cg = css_rdmacg(css);
773 	struct rdmacg_resource_pool *rpool;
774 
775 	mutex_lock(&rdmacg_mutex);
776 
777 	list_for_each_entry(rpool, &cg->rpools, cg_node)
778 		set_all_resource_max_limit(rpool);
779 
780 	mutex_unlock(&rdmacg_mutex);
781 }
782 
783 struct cgroup_subsys rdma_cgrp_subsys = {
784 	.css_alloc	= rdmacg_css_alloc,
785 	.css_free	= rdmacg_css_free,
786 	.css_offline	= rdmacg_css_offline,
787 	.legacy_cftypes	= rdmacg_files,
788 	.dfl_cftypes	= rdmacg_files,
789 };
790