1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB 2 /* 3 * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 */ 5 6 #include <linux/slab.h> 7 #include <linux/rbtree.h> 8 #include <linux/sort.h> 9 #include <linux/spinlock.h> 10 #include <rdma/ib_verbs.h> 11 #include <linux/timer.h> 12 13 #include "frmr_pools.h" 14 15 #define FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS 60 16 17 static int push_handle_to_queue_locked(struct frmr_queue *queue, u32 handle) 18 { 19 u32 tmp = queue->ci % NUM_HANDLES_PER_PAGE; 20 struct frmr_handles_page *page; 21 22 if (queue->ci >= queue->num_pages * NUM_HANDLES_PER_PAGE) { 23 page = kzalloc_obj(*page, GFP_ATOMIC); 24 if (!page) 25 return -ENOMEM; 26 queue->num_pages++; 27 list_add_tail(&page->list, &queue->pages_list); 28 } else { 29 page = list_last_entry(&queue->pages_list, 30 struct frmr_handles_page, list); 31 } 32 33 page->handles[tmp] = handle; 34 queue->ci++; 35 return 0; 36 } 37 38 static u32 pop_handle_from_queue_locked(struct frmr_queue *queue) 39 { 40 u32 tmp = (queue->ci - 1) % NUM_HANDLES_PER_PAGE; 41 struct frmr_handles_page *page; 42 u32 handle; 43 44 page = list_last_entry(&queue->pages_list, struct frmr_handles_page, 45 list); 46 handle = page->handles[tmp]; 47 queue->ci--; 48 49 if (!tmp) { 50 list_del(&page->list); 51 queue->num_pages--; 52 kfree(page); 53 } 54 55 return handle; 56 } 57 58 static bool pop_frmr_handles_page(struct ib_frmr_pool *pool, 59 struct frmr_queue *queue, 60 struct frmr_handles_page **page, u32 *count) 61 { 62 spin_lock(&pool->lock); 63 if (list_empty(&queue->pages_list)) { 64 spin_unlock(&pool->lock); 65 return false; 66 } 67 68 *page = list_first_entry(&queue->pages_list, struct frmr_handles_page, 69 list); 70 list_del(&(*page)->list); 71 queue->num_pages--; 72 73 /* If this is the last page, count may be less than 74 * NUM_HANDLES_PER_PAGE. 75 */ 76 if (queue->ci >= NUM_HANDLES_PER_PAGE) 77 *count = NUM_HANDLES_PER_PAGE; 78 else 79 *count = queue->ci; 80 81 queue->ci -= *count; 82 spin_unlock(&pool->lock); 83 return true; 84 } 85 86 static void destroy_all_handles_in_queue(struct ib_device *device, 87 struct ib_frmr_pool *pool, 88 struct frmr_queue *queue) 89 { 90 struct ib_frmr_pools *pools = device->frmr_pools; 91 struct frmr_handles_page *page; 92 u32 count; 93 94 while (pop_frmr_handles_page(pool, queue, &page, &count)) { 95 pools->pool_ops->destroy_frmrs(device, page->handles, count); 96 kfree(page); 97 } 98 } 99 100 static bool age_pinned_pool(struct ib_device *device, struct ib_frmr_pool *pool) 101 { 102 struct ib_frmr_pools *pools = device->frmr_pools; 103 u32 total, to_destroy, destroyed = 0; 104 bool has_work = false; 105 u32 *handles; 106 u32 handle; 107 108 spin_lock(&pool->lock); 109 total = pool->queue.ci + pool->inactive_queue.ci + pool->in_use; 110 if (total <= pool->pinned_handles) { 111 spin_unlock(&pool->lock); 112 return false; 113 } 114 115 to_destroy = total - pool->pinned_handles; 116 117 handles = kcalloc(to_destroy, sizeof(*handles), GFP_ATOMIC); 118 if (!handles) { 119 spin_unlock(&pool->lock); 120 return true; 121 } 122 123 /* Destroy all excess handles in the inactive queue */ 124 while (pool->inactive_queue.ci && destroyed < to_destroy) { 125 handles[destroyed++] = pop_handle_from_queue_locked( 126 &pool->inactive_queue); 127 } 128 129 /* Move all handles from regular queue to inactive queue */ 130 while (pool->queue.ci) { 131 handle = pop_handle_from_queue_locked(&pool->queue); 132 push_handle_to_queue_locked(&pool->inactive_queue, handle); 133 has_work = true; 134 } 135 136 spin_unlock(&pool->lock); 137 138 if (destroyed) 139 pools->pool_ops->destroy_frmrs(device, handles, destroyed); 140 kfree(handles); 141 return has_work; 142 } 143 144 static void pool_aging_work(struct work_struct *work) 145 { 146 struct ib_frmr_pool *pool = container_of( 147 to_delayed_work(work), struct ib_frmr_pool, aging_work); 148 struct ib_frmr_pools *pools = pool->device->frmr_pools; 149 bool has_work = false; 150 151 if (pool->pinned_handles) { 152 has_work = age_pinned_pool(pool->device, pool); 153 goto out; 154 } 155 156 destroy_all_handles_in_queue(pool->device, pool, &pool->inactive_queue); 157 158 /* Move all pages from regular queue to inactive queue */ 159 spin_lock(&pool->lock); 160 if (pool->queue.ci > 0) { 161 list_splice_tail_init(&pool->queue.pages_list, 162 &pool->inactive_queue.pages_list); 163 pool->inactive_queue.num_pages = pool->queue.num_pages; 164 pool->inactive_queue.ci = pool->queue.ci; 165 166 pool->queue.num_pages = 0; 167 pool->queue.ci = 0; 168 has_work = true; 169 } 170 spin_unlock(&pool->lock); 171 172 out: 173 /* Reschedule if there are handles to age in next aging period */ 174 if (has_work) 175 queue_delayed_work( 176 pools->aging_wq, &pool->aging_work, 177 secs_to_jiffies(READ_ONCE(pools->aging_period_sec))); 178 } 179 180 static void destroy_frmr_pool(struct ib_device *device, 181 struct ib_frmr_pool *pool) 182 { 183 cancel_delayed_work_sync(&pool->aging_work); 184 destroy_all_handles_in_queue(device, pool, &pool->queue); 185 destroy_all_handles_in_queue(device, pool, &pool->inactive_queue); 186 187 kfree(pool); 188 } 189 190 /* 191 * Initialize the FRMR pools for a device. 192 * 193 * @device: The device to initialize the FRMR pools for. 194 * @pool_ops: The pool operations to use. 195 * 196 * Returns 0 on success, negative error code on failure. 197 */ 198 int ib_frmr_pools_init(struct ib_device *device, 199 const struct ib_frmr_pool_ops *pool_ops) 200 { 201 struct ib_frmr_pools *pools; 202 203 pools = kzalloc_obj(*pools); 204 if (!pools) 205 return -ENOMEM; 206 207 pools->rb_root = RB_ROOT; 208 rwlock_init(&pools->rb_lock); 209 pools->pool_ops = pool_ops; 210 pools->aging_wq = create_singlethread_workqueue("frmr_aging_wq"); 211 if (!pools->aging_wq) { 212 kfree(pools); 213 return -ENOMEM; 214 } 215 216 pools->aging_period_sec = FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS; 217 218 device->frmr_pools = pools; 219 return 0; 220 } 221 EXPORT_SYMBOL(ib_frmr_pools_init); 222 223 /* 224 * Clean up the FRMR pools for a device. 225 * 226 * @device: The device to clean up the FRMR pools for. 227 * 228 * Call cleanup only after all FRMR handles have been pushed back to the pool 229 * and no other FRMR operations are allowed to run in parallel. 230 * Ensuring this allows us to save synchronization overhead in pop and push 231 * operations. 232 */ 233 void ib_frmr_pools_cleanup(struct ib_device *device) 234 { 235 struct ib_frmr_pools *pools = device->frmr_pools; 236 struct ib_frmr_pool *pool, *next; 237 238 if (!pools) 239 return; 240 241 rbtree_postorder_for_each_entry_safe(pool, next, &pools->rb_root, node) 242 destroy_frmr_pool(device, pool); 243 244 destroy_workqueue(pools->aging_wq); 245 kfree(pools); 246 device->frmr_pools = NULL; 247 } 248 EXPORT_SYMBOL(ib_frmr_pools_cleanup); 249 250 int ib_frmr_pools_set_aging_period(struct ib_device *device, u32 period_sec) 251 { 252 struct ib_frmr_pools *pools = device->frmr_pools; 253 struct ib_frmr_pool *pool; 254 struct rb_node *node; 255 256 if (!pools) 257 return -EINVAL; 258 259 if (period_sec == 0) 260 return -EINVAL; 261 262 WRITE_ONCE(pools->aging_period_sec, period_sec); 263 264 read_lock(&pools->rb_lock); 265 for (node = rb_first(&pools->rb_root); node; node = rb_next(node)) { 266 pool = rb_entry(node, struct ib_frmr_pool, node); 267 mod_delayed_work(pools->aging_wq, &pool->aging_work, 268 secs_to_jiffies(period_sec)); 269 } 270 read_unlock(&pools->rb_lock); 271 272 return 0; 273 } 274 275 static inline int compare_keys(struct ib_frmr_key *key1, 276 struct ib_frmr_key *key2) 277 { 278 int res; 279 280 res = cmp_int(key1->ats, key2->ats); 281 if (res) 282 return res; 283 284 res = cmp_int(key1->access_flags, key2->access_flags); 285 if (res) 286 return res; 287 288 res = cmp_int(key1->vendor_key, key2->vendor_key); 289 if (res) 290 return res; 291 292 res = cmp_int(key1->kernel_vendor_key, key2->kernel_vendor_key); 293 if (res) 294 return res; 295 296 /* 297 * allow using handles that support more DMA blocks, up to twice the 298 * requested number 299 */ 300 res = cmp_int(key1->num_dma_blocks, key2->num_dma_blocks); 301 if (res > 0) { 302 if (key1->num_dma_blocks - key2->num_dma_blocks < 303 key2->num_dma_blocks) 304 return 0; 305 } 306 307 return res; 308 } 309 310 static int frmr_pool_cmp_find(const void *key, const struct rb_node *node) 311 { 312 struct ib_frmr_pool *pool = rb_entry(node, struct ib_frmr_pool, node); 313 314 return compare_keys(&pool->key, (struct ib_frmr_key *)key); 315 } 316 317 static int frmr_pool_cmp_add(struct rb_node *new, const struct rb_node *node) 318 { 319 struct ib_frmr_pool *new_pool = 320 rb_entry(new, struct ib_frmr_pool, node); 321 struct ib_frmr_pool *pool = rb_entry(node, struct ib_frmr_pool, node); 322 323 return compare_keys(&pool->key, &new_pool->key); 324 } 325 326 static struct ib_frmr_pool *ib_frmr_pool_find(struct ib_frmr_pools *pools, 327 struct ib_frmr_key *key) 328 { 329 struct ib_frmr_pool *pool; 330 struct rb_node *node; 331 332 /* find operation is done under read lock for performance reasons. 333 * The case of threads failing to find the same pool and creating it 334 * is handled by the create_frmr_pool function. 335 */ 336 read_lock(&pools->rb_lock); 337 node = rb_find(key, &pools->rb_root, frmr_pool_cmp_find); 338 pool = rb_entry_safe(node, struct ib_frmr_pool, node); 339 read_unlock(&pools->rb_lock); 340 341 return pool; 342 } 343 344 static struct ib_frmr_pool *create_frmr_pool(struct ib_device *device, 345 struct ib_frmr_key *key) 346 { 347 struct ib_frmr_pools *pools = device->frmr_pools; 348 struct ib_frmr_pool *pool; 349 struct rb_node *existing; 350 351 pool = kzalloc_obj(*pool); 352 if (!pool) 353 return ERR_PTR(-ENOMEM); 354 355 memcpy(&pool->key, key, sizeof(*key)); 356 INIT_LIST_HEAD(&pool->queue.pages_list); 357 INIT_LIST_HEAD(&pool->inactive_queue.pages_list); 358 spin_lock_init(&pool->lock); 359 INIT_DELAYED_WORK(&pool->aging_work, pool_aging_work); 360 pool->device = device; 361 362 write_lock(&pools->rb_lock); 363 existing = rb_find_add(&pool->node, &pools->rb_root, frmr_pool_cmp_add); 364 write_unlock(&pools->rb_lock); 365 366 /* If a different thread has already created the pool, return it. 367 * The insert operation is done under the write lock so we are sure 368 * that the pool is not inserted twice. 369 */ 370 if (existing) { 371 kfree(pool); 372 return rb_entry(existing, struct ib_frmr_pool, node); 373 } 374 375 return pool; 376 } 377 378 int ib_frmr_pools_set_pinned(struct ib_device *device, struct ib_frmr_key *key, 379 u32 pinned_handles) 380 { 381 struct ib_frmr_pools *pools = device->frmr_pools; 382 struct ib_frmr_key driver_key = {}; 383 struct ib_frmr_pool *pool; 384 u32 needed_handles; 385 u32 current_total; 386 int i, ret = 0; 387 u32 *handles; 388 389 if (!pools) 390 return -EINVAL; 391 392 ret = ib_check_mr_access(device, key->access_flags); 393 if (ret) 394 return ret; 395 396 if (pools->pool_ops->build_key) { 397 ret = pools->pool_ops->build_key(device, key, &driver_key); 398 if (ret) 399 return ret; 400 } else { 401 memcpy(&driver_key, key, sizeof(*key)); 402 } 403 404 pool = ib_frmr_pool_find(pools, &driver_key); 405 if (!pool) { 406 pool = create_frmr_pool(device, &driver_key); 407 if (IS_ERR(pool)) 408 return PTR_ERR(pool); 409 } 410 411 spin_lock(&pool->lock); 412 current_total = pool->in_use + pool->queue.ci + pool->inactive_queue.ci; 413 414 if (current_total < pinned_handles) 415 needed_handles = pinned_handles - current_total; 416 else 417 needed_handles = 0; 418 419 pool->pinned_handles = pinned_handles; 420 spin_unlock(&pool->lock); 421 422 if (!needed_handles) 423 goto schedule_aging; 424 425 handles = kcalloc(needed_handles, sizeof(*handles), GFP_KERNEL); 426 if (!handles) 427 return -ENOMEM; 428 429 ret = pools->pool_ops->create_frmrs(device, key, handles, 430 needed_handles); 431 if (ret) { 432 kfree(handles); 433 return ret; 434 } 435 436 spin_lock(&pool->lock); 437 for (i = 0; i < needed_handles; i++) { 438 ret = push_handle_to_queue_locked(&pool->queue, 439 handles[i]); 440 if (ret) 441 goto end; 442 } 443 444 end: 445 spin_unlock(&pool->lock); 446 kfree(handles); 447 448 schedule_aging: 449 /* Ensure aging is scheduled to adjust to new pinned handles count */ 450 mod_delayed_work(pools->aging_wq, &pool->aging_work, 0); 451 452 return ret; 453 } 454 455 static int get_frmr_from_pool(struct ib_device *device, 456 struct ib_frmr_pool *pool, struct ib_mr *mr) 457 { 458 struct ib_frmr_pools *pools = device->frmr_pools; 459 u32 handle; 460 int err; 461 462 spin_lock(&pool->lock); 463 if (pool->queue.ci == 0) { 464 if (pool->inactive_queue.ci > 0) { 465 handle = pop_handle_from_queue_locked( 466 &pool->inactive_queue); 467 } else { 468 spin_unlock(&pool->lock); 469 err = pools->pool_ops->create_frmrs(device, &pool->key, 470 &handle, 1); 471 if (err) 472 return err; 473 spin_lock(&pool->lock); 474 } 475 } else { 476 handle = pop_handle_from_queue_locked(&pool->queue); 477 } 478 479 pool->in_use++; 480 if (pool->in_use > pool->max_in_use) 481 pool->max_in_use = pool->in_use; 482 483 spin_unlock(&pool->lock); 484 485 mr->frmr.pool = pool; 486 mr->frmr.handle = handle; 487 488 return 0; 489 } 490 491 /* 492 * Pop an FRMR handle from the pool. 493 * 494 * @device: The device to pop the FRMR handle from. 495 * @mr: The MR to pop the FRMR handle from. 496 * 497 * Returns 0 on success, negative error code on failure. 498 */ 499 int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr) 500 { 501 struct ib_frmr_pools *pools = device->frmr_pools; 502 struct ib_frmr_pool *pool; 503 504 WARN_ON_ONCE(!device->frmr_pools); 505 pool = ib_frmr_pool_find(pools, &mr->frmr.key); 506 if (!pool) { 507 pool = create_frmr_pool(device, &mr->frmr.key); 508 if (IS_ERR(pool)) 509 return PTR_ERR(pool); 510 } 511 512 return get_frmr_from_pool(device, pool, mr); 513 } 514 EXPORT_SYMBOL(ib_frmr_pool_pop); 515 516 /* 517 * Push an FRMR handle back to the pool. 518 * 519 * @device: The device to push the FRMR handle to. 520 * @mr: The MR containing the FRMR handle to push back to the pool. 521 * 522 * Returns 0 on success, negative error code on failure. 523 */ 524 int ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr) 525 { 526 struct ib_frmr_pool *pool = mr->frmr.pool; 527 struct ib_frmr_pools *pools = device->frmr_pools; 528 bool schedule_aging = false; 529 int ret; 530 531 spin_lock(&pool->lock); 532 /* Schedule aging every time an empty pool becomes non-empty */ 533 if (pool->queue.ci == 0) 534 schedule_aging = true; 535 ret = push_handle_to_queue_locked(&pool->queue, mr->frmr.handle); 536 if (ret == 0) 537 pool->in_use--; 538 539 spin_unlock(&pool->lock); 540 541 if (ret == 0 && schedule_aging) 542 queue_delayed_work(pools->aging_wq, &pool->aging_work, 543 secs_to_jiffies(READ_ONCE(pools->aging_period_sec))); 544 545 return ret; 546 } 547 EXPORT_SYMBOL(ib_frmr_pool_push); 548