xref: /linux/kernel/cgroup/rdma.c (revision 7aacf86b75bc5523d20fd9127104384fce51ce9c)
1 /*
2  * RDMA resource limiting controller for cgroups.
3  *
4  * Used to allow a cgroup hierarchy to stop processes from consuming
5  * additional RDMA resources after a certain limit is reached.
6  *
7  * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
8  *
9  * This file is subject to the terms and conditions of version 2 of the GNU
10  * General Public License. See the file COPYING in the main directory of the
11  * Linux distribution for more details.
12  */
13 
14 #include <linux/bitops.h>
15 #include <linux/slab.h>
16 #include <linux/seq_file.h>
17 #include <linux/cgroup.h>
18 #include <linux/parser.h>
19 #include <linux/cgroup_rdma.h>
20 
/* Keyword that stands for an unlimited (S32_MAX) value in the cgroup files. */
#define RDMACG_MAX_STR "max"

/* All rdma devices currently registered with the controller. */
static LIST_HEAD(rdmacg_devices);

/*
 * Serializes access to rdmacg_devices and to the resource pool lists
 * that are linked into each cgroup and each device.
 */
static DEFINE_MUTEX(rdmacg_mutex);
29 
/* Selects what a cgroup file shows; stored in the cftype's private field. */
enum rdmacg_file_type {
	RDMACG_RESOURCE_TYPE_MAX = 0,	/* configured limits */
	RDMACG_RESOURCE_TYPE_STAT = 1,	/* current usage */
};
34 
35 /*
36  * resource table definition as to be seen by the user.
37  * Need to add entries to it when more resources are
38  * added/defined at IB verb/core layer.
39  */
40 static char const *rdmacg_resource_names[] = {
41 	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
42 	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
43 };
44 
/* Limit and current charge count of a single resource type in a pool. */
struct rdmacg_resource {
	int max;	/* limit; S32_MAX means unlimited */
	int usage;	/* number of charges currently held */
};
50 
51 /*
52  * resource pool object which represents per cgroup, per device
53  * resources. There are multiple instances of this object per cgroup,
54  * therefore it cannot be embedded within rdma_cgroup structure. It
55  * is maintained as list.
56  */
57 struct rdmacg_resource_pool {
58 	struct rdmacg_device	*device;
59 	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];
60 
61 	struct list_head	cg_node;
62 	struct list_head	dev_node;
63 
64 	/* count active user tasks of this pool */
65 	u64			usage_sum;
66 	/* total number counts which are set to max */
67 	int			num_max_cnt;
68 };
69 
70 static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
71 {
72 	return container_of(css, struct rdma_cgroup, css);
73 }
74 
75 static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
76 {
77 	return css_rdmacg(cg->css.parent);
78 }
79 
80 static inline struct rdma_cgroup *get_current_rdmacg(void)
81 {
82 	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
83 }
84 
85 static void set_resource_limit(struct rdmacg_resource_pool *rpool,
86 			       int index, int new_max)
87 {
88 	if (new_max == S32_MAX) {
89 		if (rpool->resources[index].max != S32_MAX)
90 			rpool->num_max_cnt++;
91 	} else {
92 		if (rpool->resources[index].max == S32_MAX)
93 			rpool->num_max_cnt--;
94 	}
95 	rpool->resources[index].max = new_max;
96 }
97 
98 static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
99 {
100 	int i;
101 
102 	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
103 		set_resource_limit(rpool, i, S32_MAX);
104 }
105 
106 static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
107 {
108 	lockdep_assert_held(&rdmacg_mutex);
109 
110 	list_del(&rpool->cg_node);
111 	list_del(&rpool->dev_node);
112 	kfree(rpool);
113 }
114 
115 static struct rdmacg_resource_pool *
116 find_cg_rpool_locked(struct rdma_cgroup *cg,
117 		     struct rdmacg_device *device)
118 
119 {
120 	struct rdmacg_resource_pool *pool;
121 
122 	lockdep_assert_held(&rdmacg_mutex);
123 
124 	list_for_each_entry(pool, &cg->rpools, cg_node)
125 		if (pool->device == device)
126 			return pool;
127 
128 	return NULL;
129 }
130 
131 static struct rdmacg_resource_pool *
132 get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
133 {
134 	struct rdmacg_resource_pool *rpool;
135 
136 	rpool = find_cg_rpool_locked(cg, device);
137 	if (rpool)
138 		return rpool;
139 
140 	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
141 	if (!rpool)
142 		return ERR_PTR(-ENOMEM);
143 
144 	rpool->device = device;
145 	set_all_resource_max_limit(rpool);
146 
147 	INIT_LIST_HEAD(&rpool->cg_node);
148 	INIT_LIST_HEAD(&rpool->dev_node);
149 	list_add_tail(&rpool->cg_node, &cg->rpools);
150 	list_add_tail(&rpool->dev_node, &device->rpools);
151 	return rpool;
152 }
153 
154 /**
155  * uncharge_cg_locked - uncharge resource for rdma cgroup
156  * @cg: pointer to cg to uncharge and all parents in hierarchy
157  * @device: pointer to rdmacg device
158  * @index: index of the resource to uncharge in cg (resource pool)
159  *
160  * It also frees the resource pool which was created as part of
161  * charging operation when there are no resources attached to
162  * resource pool.
163  */
164 static void
165 uncharge_cg_locked(struct rdma_cgroup *cg,
166 		   struct rdmacg_device *device,
167 		   enum rdmacg_resource_type index)
168 {
169 	struct rdmacg_resource_pool *rpool;
170 
171 	rpool = find_cg_rpool_locked(cg, device);
172 
173 	/*
174 	 * rpool cannot be null at this stage. Let kernel operate in case
175 	 * if there a bug in IB stack or rdma controller, instead of crashing
176 	 * the system.
177 	 */
178 	if (unlikely(!rpool)) {
179 		pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
180 		return;
181 	}
182 
183 	rpool->resources[index].usage--;
184 
185 	/*
186 	 * A negative count (or overflow) is invalid,
187 	 * it indicates a bug in the rdma controller.
188 	 */
189 	WARN_ON_ONCE(rpool->resources[index].usage < 0);
190 	rpool->usage_sum--;
191 	if (rpool->usage_sum == 0 &&
192 	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
193 		/*
194 		 * No user of the rpool and all entries are set to max, so
195 		 * safe to delete this rpool.
196 		 */
197 		free_cg_rpool_locked(rpool);
198 	}
199 }
200 
201 /**
202  * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
203  * @device: pointer to rdmacg device
204  * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
205  *           stop uncharging
206  * @index: index of the resource to uncharge in cg in given resource pool
207  */
208 static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
209 				     struct rdmacg_device *device,
210 				     struct rdma_cgroup *stop_cg,
211 				     enum rdmacg_resource_type index)
212 {
213 	struct rdma_cgroup *p;
214 
215 	mutex_lock(&rdmacg_mutex);
216 
217 	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
218 		uncharge_cg_locked(p, device, index);
219 
220 	mutex_unlock(&rdmacg_mutex);
221 
222 	css_put(&cg->css);
223 }
224 
225 /**
226  * rdmacg_uncharge - hierarchically uncharge rdma resource count
227  * @device: pointer to rdmacg device
228  * @index: index of the resource to uncharge in cgroup in given resource pool
229  */
230 void rdmacg_uncharge(struct rdma_cgroup *cg,
231 		     struct rdmacg_device *device,
232 		     enum rdmacg_resource_type index)
233 {
234 	if (index >= RDMACG_RESOURCE_MAX)
235 		return;
236 
237 	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
238 }
239 EXPORT_SYMBOL(rdmacg_uncharge);
240 
241 /**
242  * rdmacg_try_charge - hierarchically try to charge the rdma resource
243  * @rdmacg: pointer to rdma cgroup which will own this resource
244  * @device: pointer to rdmacg device
245  * @index: index of the resource to charge in cgroup (resource pool)
246  *
247  * This function follows charging resource in hierarchical way.
248  * It will fail if the charge would cause the new value to exceed the
249  * hierarchical limit.
250  * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
251  * Returns pointer to rdmacg for this resource when charging is successful.
252  *
253  * Charger needs to account resources on two criteria.
254  * (a) per cgroup & (b) per device resource usage.
255  * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
256  * the configured limits. Per device provides granular configuration
257  * in multi device usage. It allocates resource pool in the hierarchy
258  * for each parent it come across for first resource. Later on resource
259  * pool will be available. Therefore it will be much faster thereon
260  * to charge/uncharge.
261  */
262 int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
263 		      struct rdmacg_device *device,
264 		      enum rdmacg_resource_type index)
265 {
266 	struct rdma_cgroup *cg, *p;
267 	struct rdmacg_resource_pool *rpool;
268 	s64 new;
269 	int ret = 0;
270 
271 	if (index >= RDMACG_RESOURCE_MAX)
272 		return -EINVAL;
273 
274 	/*
275 	 * hold on to css, as cgroup can be removed but resource
276 	 * accounting happens on css.
277 	 */
278 	cg = get_current_rdmacg();
279 
280 	mutex_lock(&rdmacg_mutex);
281 	for (p = cg; p; p = parent_rdmacg(p)) {
282 		rpool = get_cg_rpool_locked(p, device);
283 		if (IS_ERR(rpool)) {
284 			ret = PTR_ERR(rpool);
285 			goto err;
286 		} else {
287 			new = rpool->resources[index].usage + 1;
288 			if (new > rpool->resources[index].max) {
289 				ret = -EAGAIN;
290 				goto err;
291 			} else {
292 				rpool->resources[index].usage = new;
293 				rpool->usage_sum++;
294 			}
295 		}
296 	}
297 	mutex_unlock(&rdmacg_mutex);
298 
299 	*rdmacg = cg;
300 	return 0;
301 
302 err:
303 	mutex_unlock(&rdmacg_mutex);
304 	rdmacg_uncharge_hierarchy(cg, device, p, index);
305 	return ret;
306 }
307 EXPORT_SYMBOL(rdmacg_try_charge);
308 
309 /**
310  * rdmacg_register_device - register rdmacg device to rdma controller.
311  * @device: pointer to rdmacg device whose resources need to be accounted.
312  *
313  * If IB stack wish a device to participate in rdma cgroup resource
314  * tracking, it must invoke this API to register with rdma cgroup before
315  * any user space application can start using the RDMA resources.
316  * Returns 0 on success or EINVAL when table length given is beyond
317  * supported size.
318  */
319 int rdmacg_register_device(struct rdmacg_device *device)
320 {
321 	INIT_LIST_HEAD(&device->dev_node);
322 	INIT_LIST_HEAD(&device->rpools);
323 
324 	mutex_lock(&rdmacg_mutex);
325 	list_add_tail(&device->dev_node, &rdmacg_devices);
326 	mutex_unlock(&rdmacg_mutex);
327 	return 0;
328 }
329 EXPORT_SYMBOL(rdmacg_register_device);
330 
331 /**
332  * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
333  * @device: pointer to rdmacg device which was previously registered with rdma
334  *          controller using rdmacg_register_device().
335  *
336  * IB stack must invoke this after all the resources of the IB device
337  * are destroyed and after ensuring that no more resources will be created
338  * when this API is invoked.
339  */
340 void rdmacg_unregister_device(struct rdmacg_device *device)
341 {
342 	struct rdmacg_resource_pool *rpool, *tmp;
343 
344 	/*
345 	 * Synchronize with any active resource settings,
346 	 * usage query happening via configfs.
347 	 */
348 	mutex_lock(&rdmacg_mutex);
349 	list_del_init(&device->dev_node);
350 
351 	/*
352 	 * Now that this device is off the cgroup list, its safe to free
353 	 * all the rpool resources.
354 	 */
355 	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
356 		free_cg_rpool_locked(rpool);
357 
358 	mutex_unlock(&rdmacg_mutex);
359 }
360 EXPORT_SYMBOL(rdmacg_unregister_device);
361 
362 static int parse_resource(char *c, int *intval)
363 {
364 	substring_t argstr;
365 	const char **table = &rdmacg_resource_names[0];
366 	char *name, *value = c;
367 	size_t len;
368 	int ret, i = 0;
369 
370 	name = strsep(&value, "=");
371 	if (!name || !value)
372 		return -EINVAL;
373 
374 	len = strlen(value);
375 
376 	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
377 		if (strcmp(table[i], name))
378 			continue;
379 
380 		argstr.from = value;
381 		argstr.to = value + len;
382 
383 		ret = match_int(&argstr, intval);
384 		if (ret >= 0) {
385 			if (*intval < 0)
386 				break;
387 			return i;
388 		}
389 		if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
390 			*intval = S32_MAX;
391 			return i;
392 		}
393 		break;
394 	}
395 	return -EINVAL;
396 }
397 
398 static int rdmacg_parse_limits(char *options,
399 			       int *new_limits, unsigned long *enables)
400 {
401 	char *c;
402 	int err = -EINVAL;
403 
404 	/* parse resource options */
405 	while ((c = strsep(&options, " ")) != NULL) {
406 		int index, intval;
407 
408 		index = parse_resource(c, &intval);
409 		if (index < 0)
410 			goto err;
411 
412 		new_limits[index] = intval;
413 		*enables |= BIT(index);
414 	}
415 	return 0;
416 
417 err:
418 	return err;
419 }
420 
421 static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
422 {
423 	struct rdmacg_device *device;
424 
425 	lockdep_assert_held(&rdmacg_mutex);
426 
427 	list_for_each_entry(device, &rdmacg_devices, dev_node)
428 		if (!strcmp(name, device->name))
429 			return device;
430 
431 	return NULL;
432 }
433 
434 static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
435 				       char *buf, size_t nbytes, loff_t off)
436 {
437 	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
438 	const char *dev_name;
439 	struct rdmacg_resource_pool *rpool;
440 	struct rdmacg_device *device;
441 	char *options = strstrip(buf);
442 	int *new_limits;
443 	unsigned long enables = 0;
444 	int i = 0, ret = 0;
445 
446 	/* extract the device name first */
447 	dev_name = strsep(&options, " ");
448 	if (!dev_name) {
449 		ret = -EINVAL;
450 		goto err;
451 	}
452 
453 	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
454 	if (!new_limits) {
455 		ret = -ENOMEM;
456 		goto err;
457 	}
458 
459 	ret = rdmacg_parse_limits(options, new_limits, &enables);
460 	if (ret)
461 		goto parse_err;
462 
463 	/* acquire lock to synchronize with hot plug devices */
464 	mutex_lock(&rdmacg_mutex);
465 
466 	device = rdmacg_get_device_locked(dev_name);
467 	if (!device) {
468 		ret = -ENODEV;
469 		goto dev_err;
470 	}
471 
472 	rpool = get_cg_rpool_locked(cg, device);
473 	if (IS_ERR(rpool)) {
474 		ret = PTR_ERR(rpool);
475 		goto dev_err;
476 	}
477 
478 	/* now set the new limits of the rpool */
479 	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
480 		set_resource_limit(rpool, i, new_limits[i]);
481 
482 	if (rpool->usage_sum == 0 &&
483 	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
484 		/*
485 		 * No user of the rpool and all entries are set to max, so
486 		 * safe to delete this rpool.
487 		 */
488 		free_cg_rpool_locked(rpool);
489 	}
490 
491 dev_err:
492 	mutex_unlock(&rdmacg_mutex);
493 
494 parse_err:
495 	kfree(new_limits);
496 
497 err:
498 	return ret ?: nbytes;
499 }
500 
501 static void print_rpool_values(struct seq_file *sf,
502 			       struct rdmacg_resource_pool *rpool)
503 {
504 	enum rdmacg_file_type sf_type;
505 	int i;
506 	u32 value;
507 
508 	sf_type = seq_cft(sf)->private;
509 
510 	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
511 		seq_puts(sf, rdmacg_resource_names[i]);
512 		seq_putc(sf, '=');
513 		if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
514 			if (rpool)
515 				value = rpool->resources[i].max;
516 			else
517 				value = S32_MAX;
518 		} else {
519 			if (rpool)
520 				value = rpool->resources[i].usage;
521 			else
522 				value = 0;
523 		}
524 
525 		if (value == S32_MAX)
526 			seq_puts(sf, RDMACG_MAX_STR);
527 		else
528 			seq_printf(sf, "%d", value);
529 		seq_putc(sf, ' ');
530 	}
531 }
532 
533 static int rdmacg_resource_read(struct seq_file *sf, void *v)
534 {
535 	struct rdmacg_device *device;
536 	struct rdmacg_resource_pool *rpool;
537 	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
538 
539 	mutex_lock(&rdmacg_mutex);
540 
541 	list_for_each_entry(device, &rdmacg_devices, dev_node) {
542 		seq_printf(sf, "%s ", device->name);
543 
544 		rpool = find_cg_rpool_locked(cg, device);
545 		print_rpool_values(sf, rpool);
546 
547 		seq_putc(sf, '\n');
548 	}
549 
550 	mutex_unlock(&rdmacg_mutex);
551 	return 0;
552 }
553 
554 static struct cftype rdmacg_files[] = {
555 	{
556 		.name = "max",
557 		.write = rdmacg_resource_set_max,
558 		.seq_show = rdmacg_resource_read,
559 		.private = RDMACG_RESOURCE_TYPE_MAX,
560 		.flags = CFTYPE_NOT_ON_ROOT,
561 	},
562 	{
563 		.name = "current",
564 		.seq_show = rdmacg_resource_read,
565 		.private = RDMACG_RESOURCE_TYPE_STAT,
566 		.flags = CFTYPE_NOT_ON_ROOT,
567 	},
568 	{ }	/* terminate */
569 };
570 
571 static struct cgroup_subsys_state *
572 rdmacg_css_alloc(struct cgroup_subsys_state *parent)
573 {
574 	struct rdma_cgroup *cg;
575 
576 	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
577 	if (!cg)
578 		return ERR_PTR(-ENOMEM);
579 
580 	INIT_LIST_HEAD(&cg->rpools);
581 	return &cg->css;
582 }
583 
/* css_free callback: release the rdma cgroup that embeds @css. */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_rdmacg(css));
}
590 
591 /**
592  * rdmacg_css_offline - cgroup css_offline callback
593  * @css: css of interest
594  *
595  * This function is called when @css is about to go away and responsible
596  * for shooting down all rdmacg associated with @css. As part of that it
597  * marks all the resource pool entries to max value, so that when resources are
598  * uncharged, associated resource pool can be freed as well.
599  */
600 static void rdmacg_css_offline(struct cgroup_subsys_state *css)
601 {
602 	struct rdma_cgroup *cg = css_rdmacg(css);
603 	struct rdmacg_resource_pool *rpool;
604 
605 	mutex_lock(&rdmacg_mutex);
606 
607 	list_for_each_entry(rpool, &cg->rpools, cg_node)
608 		set_all_resource_max_limit(rpool);
609 
610 	mutex_unlock(&rdmacg_mutex);
611 }
612 
613 struct cgroup_subsys rdma_cgrp_subsys = {
614 	.css_alloc	= rdmacg_css_alloc,
615 	.css_free	= rdmacg_css_free,
616 	.css_offline	= rdmacg_css_offline,
617 	.legacy_cftypes	= rdmacg_files,
618 	.dfl_cftypes	= rdmacg_files,
619 };
620