1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB 2 /* 3 * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 */ 5 6 #include <linux/slab.h> 7 #include <linux/rbtree.h> 8 #include <linux/sort.h> 9 #include <linux/spinlock.h> 10 #include <rdma/ib_verbs.h> 11 #include <linux/timer.h> 12 13 #include "frmr_pools.h" 14 15 #define FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS 60 16 17 static int push_handle_to_queue_locked(struct frmr_queue *queue, u32 handle) 18 { 19 u32 tmp = queue->ci % NUM_HANDLES_PER_PAGE; 20 struct frmr_handles_page *page; 21 22 if (queue->ci >= queue->num_pages * NUM_HANDLES_PER_PAGE) { 23 page = kzalloc_obj(*page, GFP_ATOMIC); 24 if (!page) 25 return -ENOMEM; 26 queue->num_pages++; 27 list_add_tail(&page->list, &queue->pages_list); 28 } else { 29 page = list_last_entry(&queue->pages_list, 30 struct frmr_handles_page, list); 31 } 32 33 page->handles[tmp] = handle; 34 queue->ci++; 35 return 0; 36 } 37 38 static u32 pop_handle_from_queue_locked(struct frmr_queue *queue) 39 { 40 u32 tmp = (queue->ci - 1) % NUM_HANDLES_PER_PAGE; 41 struct frmr_handles_page *page; 42 u32 handle; 43 44 page = list_last_entry(&queue->pages_list, struct frmr_handles_page, 45 list); 46 handle = page->handles[tmp]; 47 queue->ci--; 48 49 if (!tmp) { 50 list_del(&page->list); 51 queue->num_pages--; 52 kfree(page); 53 } 54 55 return handle; 56 } 57 58 static bool pop_frmr_handles_page(struct ib_frmr_pool *pool, 59 struct frmr_queue *queue, 60 struct frmr_handles_page **page, u32 *count) 61 { 62 spin_lock(&pool->lock); 63 if (list_empty(&queue->pages_list)) { 64 spin_unlock(&pool->lock); 65 return false; 66 } 67 68 *page = list_first_entry(&queue->pages_list, struct frmr_handles_page, 69 list); 70 list_del(&(*page)->list); 71 queue->num_pages--; 72 73 /* If this is the last page, count may be less than 74 * NUM_HANDLES_PER_PAGE. 75 */ 76 if (queue->ci >= NUM_HANDLES_PER_PAGE) 77 *count = NUM_HANDLES_PER_PAGE; 78 else 79 *count = queue->ci; 80 81 queue->ci -= *count; 82 spin_unlock(&pool->lock); 83 return true; 84 } 85 86 static void destroy_all_handles_in_queue(struct ib_device *device, 87 struct ib_frmr_pool *pool, 88 struct frmr_queue *queue) 89 { 90 struct ib_frmr_pools *pools = device->frmr_pools; 91 struct frmr_handles_page *page; 92 u32 count; 93 94 while (pop_frmr_handles_page(pool, queue, &page, &count)) { 95 pools->pool_ops->destroy_frmrs(device, page->handles, count); 96 kfree(page); 97 } 98 } 99 100 /* 101 * Bulk-move all handles from @src into @dst without allocating new pages. 102 * If @dst has a partial tail page, fill it handle-by-handle from @src first 103 * to preserve the invariant that only the tail page is partial, then splice 104 * the remaining @src pages onto @dst. On return @src is empty. 105 * 106 * Caller must hold the lock protecting both queues. 107 */ 108 static void splice_frmr_queue_locked(struct frmr_queue *dst, 109 struct frmr_queue *src) 110 { 111 u32 free_in_tail = dst->ci % NUM_HANDLES_PER_PAGE; 112 u32 handle; 113 114 if (free_in_tail) { 115 free_in_tail = NUM_HANDLES_PER_PAGE - free_in_tail; 116 while (free_in_tail && src->ci) { 117 handle = pop_handle_from_queue_locked(src); 118 push_handle_to_queue_locked(dst, handle); 119 free_in_tail--; 120 } 121 } 122 123 if (src->ci > 0) { 124 list_splice_tail_init(&src->pages_list, &dst->pages_list); 125 dst->num_pages += src->num_pages; 126 dst->ci += src->ci; 127 src->num_pages = 0; 128 src->ci = 0; 129 } 130 } 131 132 static bool age_pinned_pool(struct ib_device *device, struct ib_frmr_pool *pool) 133 { 134 struct ib_frmr_pools *pools = device->frmr_pools; 135 u32 total, to_destroy, destroyed = 0; 136 bool has_work = false; 137 u32 *handles; 138 139 spin_lock(&pool->lock); 140 total = pool->queue.ci + pool->inactive_queue.ci + pool->in_use; 141 if (total <= pool->pinned_handles) { 142 spin_unlock(&pool->lock); 143 return false; 144 } 145 146 to_destroy = min(total - pool->pinned_handles, pool->inactive_queue.ci); 147 148 handles = kcalloc(to_destroy, sizeof(*handles), GFP_ATOMIC); 149 if (!handles) { 150 spin_unlock(&pool->lock); 151 return true; 152 } 153 154 /* Destroy all excess handles in the inactive queue */ 155 for (; destroyed < to_destroy; destroyed++) 156 handles[destroyed] = pop_handle_from_queue_locked( 157 &pool->inactive_queue); 158 159 /* Move all handles from regular queue to inactive queue */ 160 if (pool->queue.ci > 0) { 161 splice_frmr_queue_locked(&pool->inactive_queue, &pool->queue); 162 has_work = true; 163 } 164 165 spin_unlock(&pool->lock); 166 167 if (destroyed) 168 pools->pool_ops->destroy_frmrs(device, handles, destroyed); 169 kfree(handles); 170 return has_work; 171 } 172 173 static void pool_aging_work(struct work_struct *work) 174 { 175 struct ib_frmr_pool *pool = container_of( 176 to_delayed_work(work), struct ib_frmr_pool, aging_work); 177 struct ib_frmr_pools *pools = pool->device->frmr_pools; 178 bool has_work = false; 179 180 if (pool->pinned_handles) { 181 has_work = age_pinned_pool(pool->device, pool); 182 goto out; 183 } 184 185 destroy_all_handles_in_queue(pool->device, pool, &pool->inactive_queue); 186 187 /* Move all pages from regular queue to inactive queue */ 188 spin_lock(&pool->lock); 189 if (pool->queue.ci > 0) { 190 splice_frmr_queue_locked(&pool->inactive_queue, &pool->queue); 191 has_work = true; 192 } 193 spin_unlock(&pool->lock); 194 195 out: 196 /* Reschedule if there are handles to age in next aging period */ 197 if (has_work) 198 queue_delayed_work( 199 pools->aging_wq, &pool->aging_work, 200 secs_to_jiffies(READ_ONCE(pools->aging_period_sec))); 201 } 202 203 static void destroy_frmr_pool(struct ib_device *device, 204 struct ib_frmr_pool *pool) 205 { 206 cancel_delayed_work_sync(&pool->aging_work); 207 destroy_all_handles_in_queue(device, pool, &pool->queue); 208 destroy_all_handles_in_queue(device, pool, &pool->inactive_queue); 209 210 kfree(pool); 211 } 212 213 /* 214 * Initialize the FRMR pools for a device. 215 * 216 * @device: The device to initialize the FRMR pools for. 217 * @pool_ops: The pool operations to use. 218 * 219 * Returns 0 on success, negative error code on failure. 220 */ 221 int ib_frmr_pools_init(struct ib_device *device, 222 const struct ib_frmr_pool_ops *pool_ops) 223 { 224 struct ib_frmr_pools *pools; 225 226 pools = kzalloc_obj(*pools); 227 if (!pools) 228 return -ENOMEM; 229 230 pools->rb_root = RB_ROOT; 231 rwlock_init(&pools->rb_lock); 232 pools->pool_ops = pool_ops; 233 pools->aging_wq = create_singlethread_workqueue("frmr_aging_wq"); 234 if (!pools->aging_wq) { 235 kfree(pools); 236 return -ENOMEM; 237 } 238 239 pools->aging_period_sec = FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS; 240 241 device->frmr_pools = pools; 242 return 0; 243 } 244 EXPORT_SYMBOL(ib_frmr_pools_init); 245 246 /* 247 * Clean up the FRMR pools for a device. 248 * 249 * @device: The device to clean up the FRMR pools for. 250 * 251 * Call cleanup only after all FRMR handles have been pushed back to the pool 252 * and no other FRMR operations are allowed to run in parallel. 253 * Ensuring this allows us to save synchronization overhead in pop and push 254 * operations. 255 */ 256 void ib_frmr_pools_cleanup(struct ib_device *device) 257 { 258 struct ib_frmr_pools *pools = device->frmr_pools; 259 struct ib_frmr_pool *pool, *next; 260 261 if (!pools) 262 return; 263 264 rbtree_postorder_for_each_entry_safe(pool, next, &pools->rb_root, node) 265 destroy_frmr_pool(device, pool); 266 267 destroy_workqueue(pools->aging_wq); 268 kfree(pools); 269 device->frmr_pools = NULL; 270 } 271 EXPORT_SYMBOL(ib_frmr_pools_cleanup); 272 273 int ib_frmr_pools_set_aging_period(struct ib_device *device, u32 period_sec) 274 { 275 struct ib_frmr_pools *pools = device->frmr_pools; 276 struct ib_frmr_pool *pool; 277 struct rb_node *node; 278 279 if (!pools) 280 return -EINVAL; 281 282 if (period_sec == 0) 283 return -EINVAL; 284 285 WRITE_ONCE(pools->aging_period_sec, period_sec); 286 287 read_lock(&pools->rb_lock); 288 for (node = rb_first(&pools->rb_root); node; node = rb_next(node)) { 289 pool = rb_entry(node, struct ib_frmr_pool, node); 290 mod_delayed_work(pools->aging_wq, &pool->aging_work, 291 secs_to_jiffies(period_sec)); 292 } 293 read_unlock(&pools->rb_lock); 294 295 return 0; 296 } 297 298 static inline int compare_keys(struct ib_frmr_key *key1, 299 struct ib_frmr_key *key2) 300 { 301 int res; 302 303 res = cmp_int(key1->ats, key2->ats); 304 if (res) 305 return res; 306 307 res = cmp_int(key1->access_flags, key2->access_flags); 308 if (res) 309 return res; 310 311 res = cmp_int(key1->vendor_key, key2->vendor_key); 312 if (res) 313 return res; 314 315 res = cmp_int(key1->kernel_vendor_key, key2->kernel_vendor_key); 316 if (res) 317 return res; 318 319 /* 320 * allow using handles that support more DMA blocks, up to twice the 321 * requested number 322 */ 323 res = cmp_int(key1->num_dma_blocks, key2->num_dma_blocks); 324 if (res > 0) { 325 if (key1->num_dma_blocks - key2->num_dma_blocks < 326 key2->num_dma_blocks) 327 return 0; 328 } 329 330 return res; 331 } 332 333 static int frmr_pool_cmp_find(const void *key, const struct rb_node *node) 334 { 335 struct ib_frmr_pool *pool = rb_entry(node, struct ib_frmr_pool, node); 336 337 return compare_keys(&pool->key, (struct ib_frmr_key *)key); 338 } 339 340 static int frmr_pool_cmp_add(struct rb_node *new, const struct rb_node *node) 341 { 342 struct ib_frmr_pool *new_pool = 343 rb_entry(new, struct ib_frmr_pool, node); 344 struct ib_frmr_pool *pool = rb_entry(node, struct ib_frmr_pool, node); 345 346 return compare_keys(&pool->key, &new_pool->key); 347 } 348 349 static struct ib_frmr_pool *ib_frmr_pool_find(struct ib_frmr_pools *pools, 350 struct ib_frmr_key *key) 351 { 352 struct ib_frmr_pool *pool; 353 struct rb_node *node; 354 355 /* find operation is done under read lock for performance reasons. 356 * The case of threads failing to find the same pool and creating it 357 * is handled by the create_frmr_pool function. 358 */ 359 read_lock(&pools->rb_lock); 360 node = rb_find(key, &pools->rb_root, frmr_pool_cmp_find); 361 pool = rb_entry_safe(node, struct ib_frmr_pool, node); 362 read_unlock(&pools->rb_lock); 363 364 return pool; 365 } 366 367 static struct ib_frmr_pool *create_frmr_pool(struct ib_device *device, 368 struct ib_frmr_key *key) 369 { 370 struct ib_frmr_pools *pools = device->frmr_pools; 371 struct ib_frmr_pool *pool; 372 struct rb_node *existing; 373 374 pool = kzalloc_obj(*pool); 375 if (!pool) 376 return ERR_PTR(-ENOMEM); 377 378 memcpy(&pool->key, key, sizeof(*key)); 379 INIT_LIST_HEAD(&pool->queue.pages_list); 380 INIT_LIST_HEAD(&pool->inactive_queue.pages_list); 381 spin_lock_init(&pool->lock); 382 INIT_DELAYED_WORK(&pool->aging_work, pool_aging_work); 383 pool->device = device; 384 385 write_lock(&pools->rb_lock); 386 existing = rb_find_add(&pool->node, &pools->rb_root, frmr_pool_cmp_add); 387 write_unlock(&pools->rb_lock); 388 389 /* If a different thread has already created the pool, return it. 390 * The insert operation is done under the write lock so we are sure 391 * that the pool is not inserted twice. 392 */ 393 if (existing) { 394 kfree(pool); 395 return rb_entry(existing, struct ib_frmr_pool, node); 396 } 397 398 return pool; 399 } 400 401 int ib_frmr_pools_set_pinned(struct ib_device *device, struct ib_frmr_key *key, 402 u32 pinned_handles) 403 { 404 struct ib_frmr_pools *pools = device->frmr_pools; 405 struct ib_frmr_key driver_key = {}; 406 struct ib_frmr_pool *pool; 407 u32 needed_handles; 408 u32 current_total; 409 int i, ret = 0; 410 u32 *handles; 411 412 if (!pools) 413 return -EINVAL; 414 415 ret = ib_check_mr_access(device, key->access_flags); 416 if (ret) 417 return ret; 418 419 if (pools->pool_ops->build_key) { 420 ret = pools->pool_ops->build_key(device, key, &driver_key); 421 if (ret) 422 return ret; 423 } else { 424 memcpy(&driver_key, key, sizeof(*key)); 425 } 426 427 pool = ib_frmr_pool_find(pools, &driver_key); 428 if (!pool) { 429 pool = create_frmr_pool(device, &driver_key); 430 if (IS_ERR(pool)) 431 return PTR_ERR(pool); 432 } 433 434 spin_lock(&pool->lock); 435 current_total = pool->in_use + pool->queue.ci + pool->inactive_queue.ci; 436 437 if (current_total < pinned_handles) 438 needed_handles = pinned_handles - current_total; 439 else 440 needed_handles = 0; 441 442 pool->pinned_handles = pinned_handles; 443 spin_unlock(&pool->lock); 444 445 if (!needed_handles) 446 goto schedule_aging; 447 448 handles = kcalloc(needed_handles, sizeof(*handles), GFP_KERNEL); 449 if (!handles) 450 return -ENOMEM; 451 452 ret = pools->pool_ops->create_frmrs(device, &driver_key, handles, 453 needed_handles); 454 if (ret) { 455 kfree(handles); 456 return ret; 457 } 458 459 spin_lock(&pool->lock); 460 for (i = 0; i < needed_handles; i++) { 461 ret = push_handle_to_queue_locked(&pool->queue, 462 handles[i]); 463 if (ret) 464 break; 465 } 466 spin_unlock(&pool->lock); 467 468 if (ret) { 469 /* Destroy handles created but never pushed to the pool. */ 470 pools->pool_ops->destroy_frmrs(device, &handles[i], 471 needed_handles - i); 472 } 473 474 kfree(handles); 475 476 schedule_aging: 477 /* Ensure aging is scheduled to adjust to new pinned handles count */ 478 mod_delayed_work(pools->aging_wq, &pool->aging_work, 0); 479 480 return ret; 481 } 482 483 static int get_frmr_from_pool(struct ib_device *device, 484 struct ib_frmr_pool *pool, struct ib_mr *mr) 485 { 486 struct ib_frmr_pools *pools = device->frmr_pools; 487 u32 handle; 488 int err; 489 490 spin_lock(&pool->lock); 491 if (pool->queue.ci == 0) { 492 if (pool->inactive_queue.ci > 0) { 493 handle = pop_handle_from_queue_locked( 494 &pool->inactive_queue); 495 } else { 496 spin_unlock(&pool->lock); 497 err = pools->pool_ops->create_frmrs(device, &pool->key, 498 &handle, 1); 499 if (err) 500 return err; 501 spin_lock(&pool->lock); 502 } 503 } else { 504 handle = pop_handle_from_queue_locked(&pool->queue); 505 } 506 507 pool->in_use++; 508 if (pool->in_use > pool->max_in_use) 509 pool->max_in_use = pool->in_use; 510 511 spin_unlock(&pool->lock); 512 513 mr->frmr.pool = pool; 514 mr->frmr.handle = handle; 515 516 return 0; 517 } 518 519 /* 520 * Pop an FRMR handle from the pool. 521 * 522 * @device: The device to pop the FRMR handle from. 523 * @mr: The MR to pop the FRMR handle from. 524 * 525 * Returns 0 on success, negative error code on failure. 526 */ 527 int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr) 528 { 529 struct ib_frmr_pools *pools = device->frmr_pools; 530 struct ib_frmr_pool *pool; 531 532 if (WARN_ON_ONCE(!pools)) 533 return -EINVAL; 534 535 pool = ib_frmr_pool_find(pools, &mr->frmr.key); 536 if (!pool) { 537 pool = create_frmr_pool(device, &mr->frmr.key); 538 if (IS_ERR(pool)) 539 return PTR_ERR(pool); 540 } 541 542 return get_frmr_from_pool(device, pool, mr); 543 } 544 EXPORT_SYMBOL(ib_frmr_pool_pop); 545 546 /* 547 * Push an FRMR handle back to the pool. 548 * 549 * @device: The device to push the FRMR handle to. 550 * @mr: The MR containing the FRMR handle to push back to the pool. 551 * 552 */ 553 void ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr) 554 { 555 struct ib_frmr_pool *pool = mr->frmr.pool; 556 struct ib_frmr_pools *pools = device->frmr_pools; 557 bool schedule_aging = false; 558 int ret; 559 560 spin_lock(&pool->lock); 561 pool->in_use--; 562 ret = push_handle_to_queue_locked(&pool->queue, mr->frmr.handle); 563 564 /* Schedule aging every time an empty pool becomes non-empty */ 565 if (!ret && pool->queue.ci == 1) 566 schedule_aging = true; 567 568 spin_unlock(&pool->lock); 569 570 if (ret) { 571 pools->pool_ops->destroy_frmrs(device, &mr->frmr.handle, 1); 572 return; 573 } 574 575 if (schedule_aging) 576 queue_delayed_work(pools->aging_wq, &pool->aging_work, 577 secs_to_jiffies(READ_ONCE(pools->aging_period_sec))); 578 579 } 580 EXPORT_SYMBOL(ib_frmr_pool_push); 581 582 /* 583 * Drop a handle previously popped from the pool without returning it for 584 * reuse. The caller is responsible for destroying the underlying hardware 585 * resource. 586 */ 587 void ib_frmr_pool_drop(struct ib_mr *mr) 588 { 589 struct ib_frmr_pool *pool = mr->frmr.pool; 590 591 spin_lock(&pool->lock); 592 pool->in_use--; 593 spin_unlock(&pool->lock); 594 } 595 EXPORT_SYMBOL(ib_frmr_pool_drop); 596