xref: /linux/kernel/cgroup/rdma.c (revision 088e88be5a380cc4e81963a9a02815da465d144f)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * RDMA resource limiting controller for cgroups.
4  *
5  * Used to allow a cgroup hierarchy to stop processes from consuming
6  * additional RDMA resources after a certain limit is reached.
7  *
8  * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
9  */
10 
11 #include <linux/bitops.h>
12 #include <linux/slab.h>
13 #include <linux/seq_file.h>
14 #include <linux/cgroup.h>
15 #include <linux/parser.h>
16 #include <linux/cgroup_rdma.h>
17 
/*
 * Token that stands for an unlimited (S32_MAX) value, both when parsing
 * written limits and when printing resource values.
 */
#define RDMACG_MAX_STR "max"

/*
 * Protects list of resource pools maintained on per cgroup basis
 * and rdma device list.
 */
static DEFINE_MUTEX(rdmacg_mutex);
/* Devices added by rdmacg_register_device(); linked through dev_node. */
static LIST_HEAD(rdmacg_devices);
26 
/*
 * Kind of interface file being served. The value is stored in the
 * cftype's ->private field and read back in print_rpool_values() to
 * choose between printing limits and printing usage.
 */
enum rdmacg_file_type {
	RDMACG_RESOURCE_TYPE_MAX,	/* "max" file: configured limits */
	RDMACG_RESOURCE_TYPE_STAT,	/* "current" file: current usage */
};
31 
32 /*
33  * resource table definition as to be seen by the user.
34  * Need to add entries to it when more resources are
35  * added/defined at IB verb/core layer.
36  */
37 static char const *rdmacg_resource_names[] = {
38 	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
39 	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
40 };
41 
/* resource tracker for each resource of rdma cgroup */
struct rdmacg_resource {
	int max;	/* configured limit; S32_MAX means "max" (no limit) */
	int usage;	/* number of charges currently held */
};
47 
/*
 * resource pool object which represents per cgroup, per device
 * resources. There are multiple instances of this object per cgroup,
 * therefore it cannot be embedded within rdma_cgroup structure. It
 * is maintained as list.
 */
struct rdmacg_resource_pool {
	/* device whose resources this pool accounts */
	struct rdmacg_device	*device;
	/* limit and usage, indexed by enum rdmacg_resource_type */
	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];

	/* link in the owning cgroup's rpools list */
	struct list_head	cg_node;
	/* link in the device's rpools list */
	struct list_head	dev_node;

	/*
	 * total outstanding charges against this pool, summed over all
	 * resource types: incremented on every successful charge and
	 * decremented on every uncharge
	 */
	u64			usage_sum;
	/* number of resources[] entries whose limit is S32_MAX ("max") */
	int			num_max_cnt;
};
66 
67 static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
68 {
69 	return container_of(css, struct rdma_cgroup, css);
70 }
71 
72 static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
73 {
74 	return css_rdmacg(cg->css.parent);
75 }
76 
77 static inline struct rdma_cgroup *get_current_rdmacg(void)
78 {
79 	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
80 }
81 
82 static void set_resource_limit(struct rdmacg_resource_pool *rpool,
83 			       int index, int new_max)
84 {
85 	if (new_max == S32_MAX) {
86 		if (rpool->resources[index].max != S32_MAX)
87 			rpool->num_max_cnt++;
88 	} else {
89 		if (rpool->resources[index].max == S32_MAX)
90 			rpool->num_max_cnt--;
91 	}
92 	rpool->resources[index].max = new_max;
93 }
94 
95 static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
96 {
97 	int i;
98 
99 	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
100 		set_resource_limit(rpool, i, S32_MAX);
101 }
102 
103 static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
104 {
105 	lockdep_assert_held(&rdmacg_mutex);
106 
107 	list_del(&rpool->cg_node);
108 	list_del(&rpool->dev_node);
109 	kfree(rpool);
110 }
111 
112 static struct rdmacg_resource_pool *
113 find_cg_rpool_locked(struct rdma_cgroup *cg,
114 		     struct rdmacg_device *device)
115 
116 {
117 	struct rdmacg_resource_pool *pool;
118 
119 	lockdep_assert_held(&rdmacg_mutex);
120 
121 	list_for_each_entry(pool, &cg->rpools, cg_node)
122 		if (pool->device == device)
123 			return pool;
124 
125 	return NULL;
126 }
127 
128 static struct rdmacg_resource_pool *
129 get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
130 {
131 	struct rdmacg_resource_pool *rpool;
132 
133 	rpool = find_cg_rpool_locked(cg, device);
134 	if (rpool)
135 		return rpool;
136 
137 	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
138 	if (!rpool)
139 		return ERR_PTR(-ENOMEM);
140 
141 	rpool->device = device;
142 	set_all_resource_max_limit(rpool);
143 
144 	INIT_LIST_HEAD(&rpool->cg_node);
145 	INIT_LIST_HEAD(&rpool->dev_node);
146 	list_add_tail(&rpool->cg_node, &cg->rpools);
147 	list_add_tail(&rpool->dev_node, &device->rpools);
148 	return rpool;
149 }
150 
151 /**
152  * uncharge_cg_locked - uncharge resource for rdma cgroup
153  * @cg: pointer to cg to uncharge and all parents in hierarchy
154  * @device: pointer to rdmacg device
155  * @index: index of the resource to uncharge in cg (resource pool)
156  *
157  * It also frees the resource pool which was created as part of
158  * charging operation when there are no resources attached to
159  * resource pool.
160  */
161 static void
162 uncharge_cg_locked(struct rdma_cgroup *cg,
163 		   struct rdmacg_device *device,
164 		   enum rdmacg_resource_type index)
165 {
166 	struct rdmacg_resource_pool *rpool;
167 
168 	rpool = find_cg_rpool_locked(cg, device);
169 
170 	/*
171 	 * rpool cannot be null at this stage. Let kernel operate in case
172 	 * if there a bug in IB stack or rdma controller, instead of crashing
173 	 * the system.
174 	 */
175 	if (unlikely(!rpool)) {
176 		pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
177 		return;
178 	}
179 
180 	rpool->resources[index].usage--;
181 
182 	/*
183 	 * A negative count (or overflow) is invalid,
184 	 * it indicates a bug in the rdma controller.
185 	 */
186 	WARN_ON_ONCE(rpool->resources[index].usage < 0);
187 	rpool->usage_sum--;
188 	if (rpool->usage_sum == 0 &&
189 	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
190 		/*
191 		 * No user of the rpool and all entries are set to max, so
192 		 * safe to delete this rpool.
193 		 */
194 		free_cg_rpool_locked(rpool);
195 	}
196 }
197 
198 /**
199  * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
200  * @device: pointer to rdmacg device
201  * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
202  *           stop uncharging
203  * @index: index of the resource to uncharge in cg in given resource pool
204  */
205 static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
206 				     struct rdmacg_device *device,
207 				     struct rdma_cgroup *stop_cg,
208 				     enum rdmacg_resource_type index)
209 {
210 	struct rdma_cgroup *p;
211 
212 	mutex_lock(&rdmacg_mutex);
213 
214 	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
215 		uncharge_cg_locked(p, device, index);
216 
217 	mutex_unlock(&rdmacg_mutex);
218 
219 	css_put(&cg->css);
220 }
221 
222 /**
223  * rdmacg_uncharge - hierarchically uncharge rdma resource count
224  * @device: pointer to rdmacg device
225  * @index: index of the resource to uncharge in cgroup in given resource pool
226  */
227 void rdmacg_uncharge(struct rdma_cgroup *cg,
228 		     struct rdmacg_device *device,
229 		     enum rdmacg_resource_type index)
230 {
231 	if (index >= RDMACG_RESOURCE_MAX)
232 		return;
233 
234 	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
235 }
236 EXPORT_SYMBOL(rdmacg_uncharge);
237 
238 /**
239  * rdmacg_try_charge - hierarchically try to charge the rdma resource
240  * @rdmacg: pointer to rdma cgroup which will own this resource
241  * @device: pointer to rdmacg device
242  * @index: index of the resource to charge in cgroup (resource pool)
243  *
244  * This function follows charging resource in hierarchical way.
245  * It will fail if the charge would cause the new value to exceed the
246  * hierarchical limit.
247  * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
248  * Returns pointer to rdmacg for this resource when charging is successful.
249  *
250  * Charger needs to account resources on two criteria.
251  * (a) per cgroup & (b) per device resource usage.
252  * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
253  * the configured limits. Per device provides granular configuration
254  * in multi device usage. It allocates resource pool in the hierarchy
255  * for each parent it come across for first resource. Later on resource
256  * pool will be available. Therefore it will be much faster thereon
257  * to charge/uncharge.
258  */
259 int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
260 		      struct rdmacg_device *device,
261 		      enum rdmacg_resource_type index)
262 {
263 	struct rdma_cgroup *cg, *p;
264 	struct rdmacg_resource_pool *rpool;
265 	s64 new;
266 	int ret = 0;
267 
268 	if (index >= RDMACG_RESOURCE_MAX)
269 		return -EINVAL;
270 
271 	/*
272 	 * hold on to css, as cgroup can be removed but resource
273 	 * accounting happens on css.
274 	 */
275 	cg = get_current_rdmacg();
276 
277 	mutex_lock(&rdmacg_mutex);
278 	for (p = cg; p; p = parent_rdmacg(p)) {
279 		rpool = get_cg_rpool_locked(p, device);
280 		if (IS_ERR(rpool)) {
281 			ret = PTR_ERR(rpool);
282 			goto err;
283 		} else {
284 			new = rpool->resources[index].usage + 1;
285 			if (new > rpool->resources[index].max) {
286 				ret = -EAGAIN;
287 				goto err;
288 			} else {
289 				rpool->resources[index].usage = new;
290 				rpool->usage_sum++;
291 			}
292 		}
293 	}
294 	mutex_unlock(&rdmacg_mutex);
295 
296 	*rdmacg = cg;
297 	return 0;
298 
299 err:
300 	mutex_unlock(&rdmacg_mutex);
301 	rdmacg_uncharge_hierarchy(cg, device, p, index);
302 	return ret;
303 }
304 EXPORT_SYMBOL(rdmacg_try_charge);
305 
306 /**
307  * rdmacg_register_device - register rdmacg device to rdma controller.
308  * @device: pointer to rdmacg device whose resources need to be accounted.
309  *
310  * If IB stack wish a device to participate in rdma cgroup resource
311  * tracking, it must invoke this API to register with rdma cgroup before
312  * any user space application can start using the RDMA resources.
313  */
314 void rdmacg_register_device(struct rdmacg_device *device)
315 {
316 	INIT_LIST_HEAD(&device->dev_node);
317 	INIT_LIST_HEAD(&device->rpools);
318 
319 	mutex_lock(&rdmacg_mutex);
320 	list_add_tail(&device->dev_node, &rdmacg_devices);
321 	mutex_unlock(&rdmacg_mutex);
322 }
323 EXPORT_SYMBOL(rdmacg_register_device);
324 
325 /**
326  * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
327  * @device: pointer to rdmacg device which was previously registered with rdma
328  *          controller using rdmacg_register_device().
329  *
330  * IB stack must invoke this after all the resources of the IB device
331  * are destroyed and after ensuring that no more resources will be created
332  * when this API is invoked.
333  */
334 void rdmacg_unregister_device(struct rdmacg_device *device)
335 {
336 	struct rdmacg_resource_pool *rpool, *tmp;
337 
338 	/*
339 	 * Synchronize with any active resource settings,
340 	 * usage query happening via configfs.
341 	 */
342 	mutex_lock(&rdmacg_mutex);
343 	list_del_init(&device->dev_node);
344 
345 	/*
346 	 * Now that this device is off the cgroup list, its safe to free
347 	 * all the rpool resources.
348 	 */
349 	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
350 		free_cg_rpool_locked(rpool);
351 
352 	mutex_unlock(&rdmacg_mutex);
353 }
354 EXPORT_SYMBOL(rdmacg_unregister_device);
355 
356 static int parse_resource(char *c, int *intval)
357 {
358 	substring_t argstr;
359 	char *name, *value = c;
360 	size_t len;
361 	int ret, i;
362 
363 	name = strsep(&value, "=");
364 	if (!name || !value)
365 		return -EINVAL;
366 
367 	i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
368 	if (i < 0)
369 		return i;
370 
371 	len = strlen(value);
372 
373 	argstr.from = value;
374 	argstr.to = value + len;
375 
376 	ret = match_int(&argstr, intval);
377 	if (ret >= 0) {
378 		if (*intval < 0)
379 			return -EINVAL;
380 		return i;
381 	}
382 	if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
383 		*intval = S32_MAX;
384 		return i;
385 	}
386 	return -EINVAL;
387 }
388 
389 static int rdmacg_parse_limits(char *options,
390 			       int *new_limits, unsigned long *enables)
391 {
392 	char *c;
393 	int err = -EINVAL;
394 
395 	/* parse resource options */
396 	while ((c = strsep(&options, " ")) != NULL) {
397 		int index, intval;
398 
399 		index = parse_resource(c, &intval);
400 		if (index < 0)
401 			goto err;
402 
403 		new_limits[index] = intval;
404 		*enables |= BIT(index);
405 	}
406 	return 0;
407 
408 err:
409 	return err;
410 }
411 
412 static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
413 {
414 	struct rdmacg_device *device;
415 
416 	lockdep_assert_held(&rdmacg_mutex);
417 
418 	list_for_each_entry(device, &rdmacg_devices, dev_node)
419 		if (!strcmp(name, device->name))
420 			return device;
421 
422 	return NULL;
423 }
424 
425 static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
426 				       char *buf, size_t nbytes, loff_t off)
427 {
428 	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
429 	const char *dev_name;
430 	struct rdmacg_resource_pool *rpool;
431 	struct rdmacg_device *device;
432 	char *options = strstrip(buf);
433 	int *new_limits;
434 	unsigned long enables = 0;
435 	int i = 0, ret = 0;
436 
437 	/* extract the device name first */
438 	dev_name = strsep(&options, " ");
439 	if (!dev_name) {
440 		ret = -EINVAL;
441 		goto err;
442 	}
443 
444 	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
445 	if (!new_limits) {
446 		ret = -ENOMEM;
447 		goto err;
448 	}
449 
450 	ret = rdmacg_parse_limits(options, new_limits, &enables);
451 	if (ret)
452 		goto parse_err;
453 
454 	/* acquire lock to synchronize with hot plug devices */
455 	mutex_lock(&rdmacg_mutex);
456 
457 	device = rdmacg_get_device_locked(dev_name);
458 	if (!device) {
459 		ret = -ENODEV;
460 		goto dev_err;
461 	}
462 
463 	rpool = get_cg_rpool_locked(cg, device);
464 	if (IS_ERR(rpool)) {
465 		ret = PTR_ERR(rpool);
466 		goto dev_err;
467 	}
468 
469 	/* now set the new limits of the rpool */
470 	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
471 		set_resource_limit(rpool, i, new_limits[i]);
472 
473 	if (rpool->usage_sum == 0 &&
474 	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
475 		/*
476 		 * No user of the rpool and all entries are set to max, so
477 		 * safe to delete this rpool.
478 		 */
479 		free_cg_rpool_locked(rpool);
480 	}
481 
482 dev_err:
483 	mutex_unlock(&rdmacg_mutex);
484 
485 parse_err:
486 	kfree(new_limits);
487 
488 err:
489 	return ret ?: nbytes;
490 }
491 
492 static void print_rpool_values(struct seq_file *sf,
493 			       struct rdmacg_resource_pool *rpool)
494 {
495 	enum rdmacg_file_type sf_type;
496 	int i;
497 	u32 value;
498 
499 	sf_type = seq_cft(sf)->private;
500 
501 	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
502 		seq_puts(sf, rdmacg_resource_names[i]);
503 		seq_putc(sf, '=');
504 		if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
505 			if (rpool)
506 				value = rpool->resources[i].max;
507 			else
508 				value = S32_MAX;
509 		} else {
510 			if (rpool)
511 				value = rpool->resources[i].usage;
512 			else
513 				value = 0;
514 		}
515 
516 		if (value == S32_MAX)
517 			seq_puts(sf, RDMACG_MAX_STR);
518 		else
519 			seq_printf(sf, "%d", value);
520 		seq_putc(sf, ' ');
521 	}
522 }
523 
524 static int rdmacg_resource_read(struct seq_file *sf, void *v)
525 {
526 	struct rdmacg_device *device;
527 	struct rdmacg_resource_pool *rpool;
528 	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
529 
530 	mutex_lock(&rdmacg_mutex);
531 
532 	list_for_each_entry(device, &rdmacg_devices, dev_node) {
533 		seq_printf(sf, "%s ", device->name);
534 
535 		rpool = find_cg_rpool_locked(cg, device);
536 		print_rpool_values(sf, rpool);
537 
538 		seq_putc(sf, '\n');
539 	}
540 
541 	mutex_unlock(&rdmacg_mutex);
542 	return 0;
543 }
544 
/*
 * Interface files of the controller. Both files are printed by
 * rdmacg_resource_read(); ->private tells it whether to show limits or
 * usage. Only "max" has a write handler.
 * NOTE(review): CFTYPE_NOT_ON_ROOT presumably keeps these files out of
 * the root cgroup - confirm against the cgroup core.
 */
static struct cftype rdmacg_files[] = {
	{
		.name = "max",
		.write = rdmacg_resource_set_max,
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_MAX,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "current",
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_STAT,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};
561 
562 static struct cgroup_subsys_state *
563 rdmacg_css_alloc(struct cgroup_subsys_state *parent)
564 {
565 	struct rdma_cgroup *cg;
566 
567 	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
568 	if (!cg)
569 		return ERR_PTR(-ENOMEM);
570 
571 	INIT_LIST_HEAD(&cg->rpools);
572 	return &cg->css;
573 }
574 
/* css_free callback: release the rdma_cgroup that embeds @css. */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_rdmacg(css));
}
581 
582 /**
583  * rdmacg_css_offline - cgroup css_offline callback
584  * @css: css of interest
585  *
586  * This function is called when @css is about to go away and responsible
587  * for shooting down all rdmacg associated with @css. As part of that it
588  * marks all the resource pool entries to max value, so that when resources are
589  * uncharged, associated resource pool can be freed as well.
590  */
591 static void rdmacg_css_offline(struct cgroup_subsys_state *css)
592 {
593 	struct rdma_cgroup *cg = css_rdmacg(css);
594 	struct rdmacg_resource_pool *rpool;
595 
596 	mutex_lock(&rdmacg_mutex);
597 
598 	list_for_each_entry(rpool, &cg->rpools, cg_node)
599 		set_all_resource_max_limit(rpool);
600 
601 	mutex_unlock(&rdmacg_mutex);
602 }
603 
/*
 * Controller descriptor handed to the cgroup core: the css lifecycle
 * callbacks defined in this file plus the interface files. The same
 * rdmacg_files table is installed as both legacy_cftypes and
 * dfl_cftypes.
 */
struct cgroup_subsys rdma_cgrp_subsys = {
	.css_alloc	= rdmacg_css_alloc,
	.css_free	= rdmacg_css_free,
	.css_offline	= rdmacg_css_offline,
	.legacy_cftypes	= rdmacg_files,
	.dfl_cftypes	= rdmacg_files,
};
610 };
611