1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * RDMA resource limiting controller for cgroups. 4 * 5 * Used to allow a cgroup hierarchy to stop processes from consuming 6 * additional RDMA resources after a certain limit is reached. 7 * 8 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> 9 */ 10 11 #include <linux/bitops.h> 12 #include <linux/slab.h> 13 #include <linux/seq_file.h> 14 #include <linux/cgroup.h> 15 #include <linux/parser.h> 16 #include <linux/cgroup_rdma.h> 17 18 #define RDMACG_MAX_STR "max" 19 20 /* 21 * Protects list of resource pools maintained on per cgroup basis 22 * and rdma device list. 23 */ 24 static DEFINE_MUTEX(rdmacg_mutex); 25 static LIST_HEAD(rdmacg_devices); 26 27 enum rdmacg_file_type { 28 RDMACG_RESOURCE_TYPE_MAX, 29 RDMACG_RESOURCE_TYPE_STAT, 30 }; 31 32 /* 33 * resource table definition as to be seen by the user. 34 * Need to add entries to it when more resources are 35 * added/defined at IB verb/core layer. 36 */ 37 static char const *rdmacg_resource_names[] = { 38 [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle", 39 [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object", 40 }; 41 42 /* resource tracker for each resource of rdma cgroup */ 43 struct rdmacg_resource { 44 int max; 45 int usage; 46 }; 47 48 /* 49 * resource pool object which represents per cgroup, per device 50 * resources. There are multiple instances of this object per cgroup, 51 * therefore it cannot be embedded within rdma_cgroup structure. It 52 * is maintained as list. 53 */ 54 struct rdmacg_resource_pool { 55 struct rdmacg_device *device; 56 struct rdmacg_resource resources[RDMACG_RESOURCE_MAX]; 57 58 struct list_head cg_node; 59 struct list_head dev_node; 60 61 /* count active user tasks of this pool */ 62 u64 usage_sum; 63 /* total number counts which are set to max */ 64 int num_max_cnt; 65 }; 66 67 static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) 68 { 69 return container_of(css, struct rdma_cgroup, css); 70 } 71 72 static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg) 73 { 74 return css_rdmacg(cg->css.parent); 75 } 76 77 static inline struct rdma_cgroup *get_current_rdmacg(void) 78 { 79 return css_rdmacg(task_get_css(current, rdma_cgrp_id)); 80 } 81 82 static void set_resource_limit(struct rdmacg_resource_pool *rpool, 83 int index, int new_max) 84 { 85 if (new_max == S32_MAX) { 86 if (rpool->resources[index].max != S32_MAX) 87 rpool->num_max_cnt++; 88 } else { 89 if (rpool->resources[index].max == S32_MAX) 90 rpool->num_max_cnt--; 91 } 92 rpool->resources[index].max = new_max; 93 } 94 95 static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool) 96 { 97 int i; 98 99 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) 100 set_resource_limit(rpool, i, S32_MAX); 101 } 102 103 static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) 104 { 105 lockdep_assert_held(&rdmacg_mutex); 106 107 list_del(&rpool->cg_node); 108 list_del(&rpool->dev_node); 109 kfree(rpool); 110 } 111 112 static struct rdmacg_resource_pool * 113 find_cg_rpool_locked(struct rdma_cgroup *cg, 114 struct rdmacg_device *device) 115 116 { 117 struct rdmacg_resource_pool *pool; 118 119 lockdep_assert_held(&rdmacg_mutex); 120 121 list_for_each_entry(pool, &cg->rpools, cg_node) 122 if (pool->device == device) 123 return pool; 124 125 return NULL; 126 } 127 128 static struct rdmacg_resource_pool * 129 get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) 130 { 131 struct rdmacg_resource_pool *rpool; 132 133 rpool = find_cg_rpool_locked(cg, device); 134 if (rpool) 135 return rpool; 136 137 rpool = kzalloc(sizeof(*rpool), GFP_KERNEL); 138 if (!rpool) 139 return ERR_PTR(-ENOMEM); 140 141 rpool->device = device; 142 set_all_resource_max_limit(rpool); 143 144 INIT_LIST_HEAD(&rpool->cg_node); 145 INIT_LIST_HEAD(&rpool->dev_node); 146 list_add_tail(&rpool->cg_node, &cg->rpools); 147 list_add_tail(&rpool->dev_node, &device->rpools); 148 return rpool; 149 } 150 151 /** 152 * uncharge_cg_locked - uncharge resource for rdma cgroup 153 * @cg: pointer to cg to uncharge and all parents in hierarchy 154 * @device: pointer to rdmacg device 155 * @index: index of the resource to uncharge in cg (resource pool) 156 * 157 * It also frees the resource pool which was created as part of 158 * charging operation when there are no resources attached to 159 * resource pool. 160 */ 161 static void 162 uncharge_cg_locked(struct rdma_cgroup *cg, 163 struct rdmacg_device *device, 164 enum rdmacg_resource_type index) 165 { 166 struct rdmacg_resource_pool *rpool; 167 168 rpool = find_cg_rpool_locked(cg, device); 169 170 /* 171 * rpool cannot be null at this stage. Let kernel operate in case 172 * if there a bug in IB stack or rdma controller, instead of crashing 173 * the system. 174 */ 175 if (unlikely(!rpool)) { 176 pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device); 177 return; 178 } 179 180 rpool->resources[index].usage--; 181 182 /* 183 * A negative count (or overflow) is invalid, 184 * it indicates a bug in the rdma controller. 185 */ 186 WARN_ON_ONCE(rpool->resources[index].usage < 0); 187 rpool->usage_sum--; 188 if (rpool->usage_sum == 0 && 189 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { 190 /* 191 * No user of the rpool and all entries are set to max, so 192 * safe to delete this rpool. 193 */ 194 free_cg_rpool_locked(rpool); 195 } 196 } 197 198 /** 199 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count 200 * @device: pointer to rdmacg device 201 * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup 202 * stop uncharging 203 * @index: index of the resource to uncharge in cg in given resource pool 204 */ 205 static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, 206 struct rdmacg_device *device, 207 struct rdma_cgroup *stop_cg, 208 enum rdmacg_resource_type index) 209 { 210 struct rdma_cgroup *p; 211 212 mutex_lock(&rdmacg_mutex); 213 214 for (p = cg; p != stop_cg; p = parent_rdmacg(p)) 215 uncharge_cg_locked(p, device, index); 216 217 mutex_unlock(&rdmacg_mutex); 218 219 css_put(&cg->css); 220 } 221 222 /** 223 * rdmacg_uncharge - hierarchically uncharge rdma resource count 224 * @device: pointer to rdmacg device 225 * @index: index of the resource to uncharge in cgroup in given resource pool 226 */ 227 void rdmacg_uncharge(struct rdma_cgroup *cg, 228 struct rdmacg_device *device, 229 enum rdmacg_resource_type index) 230 { 231 if (index >= RDMACG_RESOURCE_MAX) 232 return; 233 234 rdmacg_uncharge_hierarchy(cg, device, NULL, index); 235 } 236 EXPORT_SYMBOL(rdmacg_uncharge); 237 238 /** 239 * rdmacg_try_charge - hierarchically try to charge the rdma resource 240 * @rdmacg: pointer to rdma cgroup which will own this resource 241 * @device: pointer to rdmacg device 242 * @index: index of the resource to charge in cgroup (resource pool) 243 * 244 * This function follows charging resource in hierarchical way. 245 * It will fail if the charge would cause the new value to exceed the 246 * hierarchical limit. 247 * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. 248 * Returns pointer to rdmacg for this resource when charging is successful. 249 * 250 * Charger needs to account resources on two criteria. 251 * (a) per cgroup & (b) per device resource usage. 252 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross 253 * the configured limits. Per device provides granular configuration 254 * in multi device usage. It allocates resource pool in the hierarchy 255 * for each parent it come across for first resource. Later on resource 256 * pool will be available. Therefore it will be much faster thereon 257 * to charge/uncharge. 258 */ 259 int rdmacg_try_charge(struct rdma_cgroup **rdmacg, 260 struct rdmacg_device *device, 261 enum rdmacg_resource_type index) 262 { 263 struct rdma_cgroup *cg, *p; 264 struct rdmacg_resource_pool *rpool; 265 s64 new; 266 int ret = 0; 267 268 if (index >= RDMACG_RESOURCE_MAX) 269 return -EINVAL; 270 271 /* 272 * hold on to css, as cgroup can be removed but resource 273 * accounting happens on css. 274 */ 275 cg = get_current_rdmacg(); 276 277 mutex_lock(&rdmacg_mutex); 278 for (p = cg; p; p = parent_rdmacg(p)) { 279 rpool = get_cg_rpool_locked(p, device); 280 if (IS_ERR(rpool)) { 281 ret = PTR_ERR(rpool); 282 goto err; 283 } else { 284 new = rpool->resources[index].usage + 1; 285 if (new > rpool->resources[index].max) { 286 ret = -EAGAIN; 287 goto err; 288 } else { 289 rpool->resources[index].usage = new; 290 rpool->usage_sum++; 291 } 292 } 293 } 294 mutex_unlock(&rdmacg_mutex); 295 296 *rdmacg = cg; 297 return 0; 298 299 err: 300 mutex_unlock(&rdmacg_mutex); 301 rdmacg_uncharge_hierarchy(cg, device, p, index); 302 return ret; 303 } 304 EXPORT_SYMBOL(rdmacg_try_charge); 305 306 /** 307 * rdmacg_register_device - register rdmacg device to rdma controller. 308 * @device: pointer to rdmacg device whose resources need to be accounted. 309 * 310 * If IB stack wish a device to participate in rdma cgroup resource 311 * tracking, it must invoke this API to register with rdma cgroup before 312 * any user space application can start using the RDMA resources. 313 */ 314 void rdmacg_register_device(struct rdmacg_device *device) 315 { 316 INIT_LIST_HEAD(&device->dev_node); 317 INIT_LIST_HEAD(&device->rpools); 318 319 mutex_lock(&rdmacg_mutex); 320 list_add_tail(&device->dev_node, &rdmacg_devices); 321 mutex_unlock(&rdmacg_mutex); 322 } 323 EXPORT_SYMBOL(rdmacg_register_device); 324 325 /** 326 * rdmacg_unregister_device - unregister rdmacg device from rdma controller. 327 * @device: pointer to rdmacg device which was previously registered with rdma 328 * controller using rdmacg_register_device(). 329 * 330 * IB stack must invoke this after all the resources of the IB device 331 * are destroyed and after ensuring that no more resources will be created 332 * when this API is invoked. 333 */ 334 void rdmacg_unregister_device(struct rdmacg_device *device) 335 { 336 struct rdmacg_resource_pool *rpool, *tmp; 337 338 /* 339 * Synchronize with any active resource settings, 340 * usage query happening via configfs. 341 */ 342 mutex_lock(&rdmacg_mutex); 343 list_del_init(&device->dev_node); 344 345 /* 346 * Now that this device is off the cgroup list, its safe to free 347 * all the rpool resources. 348 */ 349 list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node) 350 free_cg_rpool_locked(rpool); 351 352 mutex_unlock(&rdmacg_mutex); 353 } 354 EXPORT_SYMBOL(rdmacg_unregister_device); 355 356 static int parse_resource(char *c, int *intval) 357 { 358 substring_t argstr; 359 char *name, *value = c; 360 size_t len; 361 int ret, i; 362 363 name = strsep(&value, "="); 364 if (!name || !value) 365 return -EINVAL; 366 367 i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name); 368 if (i < 0) 369 return i; 370 371 len = strlen(value); 372 373 argstr.from = value; 374 argstr.to = value + len; 375 376 ret = match_int(&argstr, intval); 377 if (ret >= 0) { 378 if (*intval < 0) 379 return -EINVAL; 380 return i; 381 } 382 if (strncmp(value, RDMACG_MAX_STR, len) == 0) { 383 *intval = S32_MAX; 384 return i; 385 } 386 return -EINVAL; 387 } 388 389 static int rdmacg_parse_limits(char *options, 390 int *new_limits, unsigned long *enables) 391 { 392 char *c; 393 int err = -EINVAL; 394 395 /* parse resource options */ 396 while ((c = strsep(&options, " ")) != NULL) { 397 int index, intval; 398 399 index = parse_resource(c, &intval); 400 if (index < 0) 401 goto err; 402 403 new_limits[index] = intval; 404 *enables |= BIT(index); 405 } 406 return 0; 407 408 err: 409 return err; 410 } 411 412 static struct rdmacg_device *rdmacg_get_device_locked(const char *name) 413 { 414 struct rdmacg_device *device; 415 416 lockdep_assert_held(&rdmacg_mutex); 417 418 list_for_each_entry(device, &rdmacg_devices, dev_node) 419 if (!strcmp(name, device->name)) 420 return device; 421 422 return NULL; 423 } 424 425 static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, 426 char *buf, size_t nbytes, loff_t off) 427 { 428 struct rdma_cgroup *cg = css_rdmacg(of_css(of)); 429 const char *dev_name; 430 struct rdmacg_resource_pool *rpool; 431 struct rdmacg_device *device; 432 char *options = strstrip(buf); 433 int *new_limits; 434 unsigned long enables = 0; 435 int i = 0, ret = 0; 436 437 /* extract the device name first */ 438 dev_name = strsep(&options, " "); 439 if (!dev_name) { 440 ret = -EINVAL; 441 goto err; 442 } 443 444 new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL); 445 if (!new_limits) { 446 ret = -ENOMEM; 447 goto err; 448 } 449 450 ret = rdmacg_parse_limits(options, new_limits, &enables); 451 if (ret) 452 goto parse_err; 453 454 /* acquire lock to synchronize with hot plug devices */ 455 mutex_lock(&rdmacg_mutex); 456 457 device = rdmacg_get_device_locked(dev_name); 458 if (!device) { 459 ret = -ENODEV; 460 goto dev_err; 461 } 462 463 rpool = get_cg_rpool_locked(cg, device); 464 if (IS_ERR(rpool)) { 465 ret = PTR_ERR(rpool); 466 goto dev_err; 467 } 468 469 /* now set the new limits of the rpool */ 470 for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX) 471 set_resource_limit(rpool, i, new_limits[i]); 472 473 if (rpool->usage_sum == 0 && 474 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { 475 /* 476 * No user of the rpool and all entries are set to max, so 477 * safe to delete this rpool. 478 */ 479 free_cg_rpool_locked(rpool); 480 } 481 482 dev_err: 483 mutex_unlock(&rdmacg_mutex); 484 485 parse_err: 486 kfree(new_limits); 487 488 err: 489 return ret ?: nbytes; 490 } 491 492 static void print_rpool_values(struct seq_file *sf, 493 struct rdmacg_resource_pool *rpool) 494 { 495 enum rdmacg_file_type sf_type; 496 int i; 497 u32 value; 498 499 sf_type = seq_cft(sf)->private; 500 501 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { 502 seq_puts(sf, rdmacg_resource_names[i]); 503 seq_putc(sf, '='); 504 if (sf_type == RDMACG_RESOURCE_TYPE_MAX) { 505 if (rpool) 506 value = rpool->resources[i].max; 507 else 508 value = S32_MAX; 509 } else { 510 if (rpool) 511 value = rpool->resources[i].usage; 512 else 513 value = 0; 514 } 515 516 if (value == S32_MAX) 517 seq_puts(sf, RDMACG_MAX_STR); 518 else 519 seq_printf(sf, "%d", value); 520 seq_putc(sf, ' '); 521 } 522 } 523 524 static int rdmacg_resource_read(struct seq_file *sf, void *v) 525 { 526 struct rdmacg_device *device; 527 struct rdmacg_resource_pool *rpool; 528 struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); 529 530 mutex_lock(&rdmacg_mutex); 531 532 list_for_each_entry(device, &rdmacg_devices, dev_node) { 533 seq_printf(sf, "%s ", device->name); 534 535 rpool = find_cg_rpool_locked(cg, device); 536 print_rpool_values(sf, rpool); 537 538 seq_putc(sf, '\n'); 539 } 540 541 mutex_unlock(&rdmacg_mutex); 542 return 0; 543 } 544 545 static struct cftype rdmacg_files[] = { 546 { 547 .name = "max", 548 .write = rdmacg_resource_set_max, 549 .seq_show = rdmacg_resource_read, 550 .private = RDMACG_RESOURCE_TYPE_MAX, 551 .flags = CFTYPE_NOT_ON_ROOT, 552 }, 553 { 554 .name = "current", 555 .seq_show = rdmacg_resource_read, 556 .private = RDMACG_RESOURCE_TYPE_STAT, 557 .flags = CFTYPE_NOT_ON_ROOT, 558 }, 559 { } /* terminate */ 560 }; 561 562 static struct cgroup_subsys_state * 563 rdmacg_css_alloc(struct cgroup_subsys_state *parent) 564 { 565 struct rdma_cgroup *cg; 566 567 cg = kzalloc(sizeof(*cg), GFP_KERNEL); 568 if (!cg) 569 return ERR_PTR(-ENOMEM); 570 571 INIT_LIST_HEAD(&cg->rpools); 572 return &cg->css; 573 } 574 575 static void rdmacg_css_free(struct cgroup_subsys_state *css) 576 { 577 struct rdma_cgroup *cg = css_rdmacg(css); 578 579 kfree(cg); 580 } 581 582 /** 583 * rdmacg_css_offline - cgroup css_offline callback 584 * @css: css of interest 585 * 586 * This function is called when @css is about to go away and responsible 587 * for shooting down all rdmacg associated with @css. As part of that it 588 * marks all the resource pool entries to max value, so that when resources are 589 * uncharged, associated resource pool can be freed as well. 590 */ 591 static void rdmacg_css_offline(struct cgroup_subsys_state *css) 592 { 593 struct rdma_cgroup *cg = css_rdmacg(css); 594 struct rdmacg_resource_pool *rpool; 595 596 mutex_lock(&rdmacg_mutex); 597 598 list_for_each_entry(rpool, &cg->rpools, cg_node) 599 set_all_resource_max_limit(rpool); 600 601 mutex_unlock(&rdmacg_mutex); 602 } 603 604 struct cgroup_subsys rdma_cgrp_subsys = { 605 .css_alloc = rdmacg_css_alloc, 606 .css_free = rdmacg_css_free, 607 .css_offline = rdmacg_css_offline, 608 .legacy_cftypes = rdmacg_files, 609 .dfl_cftypes = rdmacg_files, 610 }; 611