xref: /linux/drivers/infiniband/core/frmr_pools.c (revision bba2c3615bd6cfee7456d1130f2e6b01b3f4e9ba)
1 // SPDX-License-Identifier: GPL-2.0  OR Linux-OpenIB
2 /*
3  * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4  */
5 
6 #include <linux/slab.h>
7 #include <linux/rbtree.h>
8 #include <linux/sort.h>
9 #include <linux/spinlock.h>
10 #include <rdma/ib_verbs.h>
11 #include <linux/timer.h>
12 
13 #include "frmr_pools.h"
14 
/*
 * Default aging period, in seconds. ib_frmr_pools_init() stores this value
 * in pools->aging_period_sec.
 */
#define FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS 60
16 
17 static int push_handle_to_queue_locked(struct frmr_queue *queue, u32 handle)
18 {
19 	u32 tmp = queue->ci % NUM_HANDLES_PER_PAGE;
20 	struct frmr_handles_page *page;
21 
22 	if (queue->ci >= queue->num_pages * NUM_HANDLES_PER_PAGE) {
23 		page = kzalloc_obj(*page, GFP_ATOMIC);
24 		if (!page)
25 			return -ENOMEM;
26 		queue->num_pages++;
27 		list_add_tail(&page->list, &queue->pages_list);
28 	} else {
29 		page = list_last_entry(&queue->pages_list,
30 				       struct frmr_handles_page, list);
31 	}
32 
33 	page->handles[tmp] = handle;
34 	queue->ci++;
35 	return 0;
36 }
37 
38 static u32 pop_handle_from_queue_locked(struct frmr_queue *queue)
39 {
40 	u32 tmp = (queue->ci - 1) % NUM_HANDLES_PER_PAGE;
41 	struct frmr_handles_page *page;
42 	u32 handle;
43 
44 	page = list_last_entry(&queue->pages_list, struct frmr_handles_page,
45 			       list);
46 	handle = page->handles[tmp];
47 	queue->ci--;
48 
49 	if (!tmp) {
50 		list_del(&page->list);
51 		queue->num_pages--;
52 		kfree(page);
53 	}
54 
55 	return handle;
56 }
57 
58 static bool pop_frmr_handles_page(struct ib_frmr_pool *pool,
59 				  struct frmr_queue *queue,
60 				  struct frmr_handles_page **page, u32 *count)
61 {
62 	spin_lock(&pool->lock);
63 	if (list_empty(&queue->pages_list)) {
64 		spin_unlock(&pool->lock);
65 		return false;
66 	}
67 
68 	*page = list_first_entry(&queue->pages_list, struct frmr_handles_page,
69 				 list);
70 	list_del(&(*page)->list);
71 	queue->num_pages--;
72 
73 	/* If this is the last page, count may be less than
74 	 * NUM_HANDLES_PER_PAGE.
75 	 */
76 	if (queue->ci >= NUM_HANDLES_PER_PAGE)
77 		*count = NUM_HANDLES_PER_PAGE;
78 	else
79 		*count = queue->ci;
80 
81 	queue->ci -= *count;
82 	spin_unlock(&pool->lock);
83 	return true;
84 }
85 
86 static void destroy_all_handles_in_queue(struct ib_device *device,
87 					 struct ib_frmr_pool *pool,
88 					 struct frmr_queue *queue)
89 {
90 	struct ib_frmr_pools *pools = device->frmr_pools;
91 	struct frmr_handles_page *page;
92 	u32 count;
93 
94 	while (pop_frmr_handles_page(pool, queue, &page, &count)) {
95 		pools->pool_ops->destroy_frmrs(device, page->handles, count);
96 		kfree(page);
97 	}
98 }
99 
100 /*
101  * Bulk-move all handles from @src into @dst without allocating new pages.
102  * If @dst has a partial tail page, fill it handle-by-handle from @src first
103  * to preserve the invariant that only the tail page is partial, then splice
104  * the remaining @src pages onto @dst. On return @src is empty.
105  *
106  * Caller must hold the lock protecting both queues.
107  */
108 static void splice_frmr_queue_locked(struct frmr_queue *dst,
109 				     struct frmr_queue *src)
110 {
111 	u32 free_in_tail = dst->ci % NUM_HANDLES_PER_PAGE;
112 	u32 handle;
113 
114 	if (free_in_tail) {
115 		free_in_tail = NUM_HANDLES_PER_PAGE - free_in_tail;
116 		while (free_in_tail && src->ci) {
117 			handle = pop_handle_from_queue_locked(src);
118 			push_handle_to_queue_locked(dst, handle);
119 			free_in_tail--;
120 		}
121 	}
122 
123 	if (src->ci > 0) {
124 		list_splice_tail_init(&src->pages_list, &dst->pages_list);
125 		dst->num_pages += src->num_pages;
126 		dst->ci += src->ci;
127 		src->num_pages = 0;
128 		src->ci = 0;
129 	}
130 }
131 
132 static bool age_pinned_pool(struct ib_device *device, struct ib_frmr_pool *pool)
133 {
134 	struct ib_frmr_pools *pools = device->frmr_pools;
135 	u32 total, to_destroy, destroyed = 0;
136 	bool has_work = false;
137 	u32 *handles;
138 
139 	spin_lock(&pool->lock);
140 	total = pool->queue.ci + pool->inactive_queue.ci + pool->in_use;
141 	if (total <= pool->pinned_handles) {
142 		spin_unlock(&pool->lock);
143 		return false;
144 	}
145 
146 	to_destroy = min(total - pool->pinned_handles, pool->inactive_queue.ci);
147 
148 	handles = kcalloc(to_destroy, sizeof(*handles), GFP_ATOMIC);
149 	if (!handles) {
150 		spin_unlock(&pool->lock);
151 		return true;
152 	}
153 
154 	/* Destroy all excess handles in the inactive queue */
155 	for (; destroyed < to_destroy; destroyed++)
156 		handles[destroyed] = pop_handle_from_queue_locked(
157 			&pool->inactive_queue);
158 
159 	/* Move all handles from regular queue to inactive queue */
160 	if (pool->queue.ci > 0) {
161 		splice_frmr_queue_locked(&pool->inactive_queue, &pool->queue);
162 		has_work = true;
163 	}
164 
165 	spin_unlock(&pool->lock);
166 
167 	if (destroyed)
168 		pools->pool_ops->destroy_frmrs(device, handles, destroyed);
169 	kfree(handles);
170 	return has_work;
171 }
172 
173 static void pool_aging_work(struct work_struct *work)
174 {
175 	struct ib_frmr_pool *pool = container_of(
176 		to_delayed_work(work), struct ib_frmr_pool, aging_work);
177 	struct ib_frmr_pools *pools = pool->device->frmr_pools;
178 	bool has_work = false;
179 
180 	if (pool->pinned_handles) {
181 		has_work = age_pinned_pool(pool->device, pool);
182 		goto out;
183 	}
184 
185 	destroy_all_handles_in_queue(pool->device, pool, &pool->inactive_queue);
186 
187 	/* Move all pages from regular queue to inactive queue */
188 	spin_lock(&pool->lock);
189 	if (pool->queue.ci > 0) {
190 		splice_frmr_queue_locked(&pool->inactive_queue, &pool->queue);
191 		has_work = true;
192 	}
193 	spin_unlock(&pool->lock);
194 
195 out:
196 	/* Reschedule if there are handles to age in next aging period */
197 	if (has_work)
198 		queue_delayed_work(
199 			pools->aging_wq, &pool->aging_work,
200 			secs_to_jiffies(READ_ONCE(pools->aging_period_sec)));
201 }
202 
203 static void destroy_frmr_pool(struct ib_device *device,
204 			      struct ib_frmr_pool *pool)
205 {
206 	cancel_delayed_work_sync(&pool->aging_work);
207 	destroy_all_handles_in_queue(device, pool, &pool->queue);
208 	destroy_all_handles_in_queue(device, pool, &pool->inactive_queue);
209 
210 	kfree(pool);
211 }
212 
213 /*
214  * Initialize the FRMR pools for a device.
215  *
216  * @device: The device to initialize the FRMR pools for.
217  * @pool_ops: The pool operations to use.
218  *
219  * Returns 0 on success, negative error code on failure.
220  */
221 int ib_frmr_pools_init(struct ib_device *device,
222 		       const struct ib_frmr_pool_ops *pool_ops)
223 {
224 	struct ib_frmr_pools *pools;
225 
226 	pools = kzalloc_obj(*pools);
227 	if (!pools)
228 		return -ENOMEM;
229 
230 	pools->rb_root = RB_ROOT;
231 	rwlock_init(&pools->rb_lock);
232 	pools->pool_ops = pool_ops;
233 	pools->aging_wq = create_singlethread_workqueue("frmr_aging_wq");
234 	if (!pools->aging_wq) {
235 		kfree(pools);
236 		return -ENOMEM;
237 	}
238 
239 	pools->aging_period_sec = FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS;
240 
241 	device->frmr_pools = pools;
242 	return 0;
243 }
244 EXPORT_SYMBOL(ib_frmr_pools_init);
245 
246 /*
247  * Clean up the FRMR pools for a device.
248  *
249  * @device: The device to clean up the FRMR pools for.
250  *
251  * Call cleanup only after all FRMR handles have been pushed back to the pool
252  * and no other FRMR operations are allowed to run in parallel.
253  * Ensuring this allows us to save synchronization overhead in pop and push
254  * operations.
255  */
256 void ib_frmr_pools_cleanup(struct ib_device *device)
257 {
258 	struct ib_frmr_pools *pools = device->frmr_pools;
259 	struct ib_frmr_pool *pool, *next;
260 
261 	if (!pools)
262 		return;
263 
264 	rbtree_postorder_for_each_entry_safe(pool, next, &pools->rb_root, node)
265 		destroy_frmr_pool(device, pool);
266 
267 	destroy_workqueue(pools->aging_wq);
268 	kfree(pools);
269 	device->frmr_pools = NULL;
270 }
271 EXPORT_SYMBOL(ib_frmr_pools_cleanup);
272 
273 int ib_frmr_pools_set_aging_period(struct ib_device *device, u32 period_sec)
274 {
275 	struct ib_frmr_pools *pools = device->frmr_pools;
276 	struct ib_frmr_pool *pool;
277 	struct rb_node *node;
278 
279 	if (!pools)
280 		return -EINVAL;
281 
282 	if (period_sec == 0)
283 		return -EINVAL;
284 
285 	WRITE_ONCE(pools->aging_period_sec, period_sec);
286 
287 	read_lock(&pools->rb_lock);
288 	for (node = rb_first(&pools->rb_root); node; node = rb_next(node)) {
289 		pool = rb_entry(node, struct ib_frmr_pool, node);
290 		mod_delayed_work(pools->aging_wq, &pool->aging_work,
291 				 secs_to_jiffies(period_sec));
292 	}
293 	read_unlock(&pools->rb_lock);
294 
295 	return 0;
296 }
297 
298 static inline int compare_keys(struct ib_frmr_key *key1,
299 			       struct ib_frmr_key *key2)
300 {
301 	int res;
302 
303 	res = cmp_int(key1->ats, key2->ats);
304 	if (res)
305 		return res;
306 
307 	res = cmp_int(key1->access_flags, key2->access_flags);
308 	if (res)
309 		return res;
310 
311 	res = cmp_int(key1->vendor_key, key2->vendor_key);
312 	if (res)
313 		return res;
314 
315 	res = cmp_int(key1->kernel_vendor_key, key2->kernel_vendor_key);
316 	if (res)
317 		return res;
318 
319 	/*
320 	 * allow using handles that support more DMA blocks, up to twice the
321 	 * requested number
322 	 */
323 	res = cmp_int(key1->num_dma_blocks, key2->num_dma_blocks);
324 	if (res > 0) {
325 		if (key1->num_dma_blocks - key2->num_dma_blocks <
326 		    key2->num_dma_blocks)
327 			return 0;
328 	}
329 
330 	return res;
331 }
332 
333 static int frmr_pool_cmp_find(const void *key, const struct rb_node *node)
334 {
335 	struct ib_frmr_pool *pool = rb_entry(node, struct ib_frmr_pool, node);
336 
337 	return compare_keys(&pool->key, (struct ib_frmr_key *)key);
338 }
339 
340 static int frmr_pool_cmp_add(struct rb_node *new, const struct rb_node *node)
341 {
342 	struct ib_frmr_pool *new_pool =
343 		rb_entry(new, struct ib_frmr_pool, node);
344 	struct ib_frmr_pool *pool = rb_entry(node, struct ib_frmr_pool, node);
345 
346 	return compare_keys(&pool->key, &new_pool->key);
347 }
348 
349 static struct ib_frmr_pool *ib_frmr_pool_find(struct ib_frmr_pools *pools,
350 					      struct ib_frmr_key *key)
351 {
352 	struct ib_frmr_pool *pool;
353 	struct rb_node *node;
354 
355 	/* find operation is done under read lock for performance reasons.
356 	 * The case of threads failing to find the same pool and creating it
357 	 * is handled by the create_frmr_pool function.
358 	 */
359 	read_lock(&pools->rb_lock);
360 	node = rb_find(key, &pools->rb_root, frmr_pool_cmp_find);
361 	pool = rb_entry_safe(node, struct ib_frmr_pool, node);
362 	read_unlock(&pools->rb_lock);
363 
364 	return pool;
365 }
366 
367 static struct ib_frmr_pool *create_frmr_pool(struct ib_device *device,
368 					     struct ib_frmr_key *key)
369 {
370 	struct ib_frmr_pools *pools = device->frmr_pools;
371 	struct ib_frmr_pool *pool;
372 	struct rb_node *existing;
373 
374 	pool = kzalloc_obj(*pool);
375 	if (!pool)
376 		return ERR_PTR(-ENOMEM);
377 
378 	memcpy(&pool->key, key, sizeof(*key));
379 	INIT_LIST_HEAD(&pool->queue.pages_list);
380 	INIT_LIST_HEAD(&pool->inactive_queue.pages_list);
381 	spin_lock_init(&pool->lock);
382 	INIT_DELAYED_WORK(&pool->aging_work, pool_aging_work);
383 	pool->device = device;
384 
385 	write_lock(&pools->rb_lock);
386 	existing = rb_find_add(&pool->node, &pools->rb_root, frmr_pool_cmp_add);
387 	write_unlock(&pools->rb_lock);
388 
389 	/* If a different thread has already created the pool, return it.
390 	 * The insert operation is done under the write lock so we are sure
391 	 * that the pool is not inserted twice.
392 	 */
393 	if (existing) {
394 		kfree(pool);
395 		return rb_entry(existing, struct ib_frmr_pool, node);
396 	}
397 
398 	return pool;
399 }
400 
401 int ib_frmr_pools_set_pinned(struct ib_device *device, struct ib_frmr_key *key,
402 			     u32 pinned_handles)
403 {
404 	struct ib_frmr_pools *pools = device->frmr_pools;
405 	struct ib_frmr_key driver_key = {};
406 	struct ib_frmr_pool *pool;
407 	u32 needed_handles;
408 	u32 current_total;
409 	int i, ret = 0;
410 	u32 *handles;
411 
412 	if (!pools)
413 		return -EINVAL;
414 
415 	ret = ib_check_mr_access(device, key->access_flags);
416 	if (ret)
417 		return ret;
418 
419 	if (pools->pool_ops->build_key) {
420 		ret = pools->pool_ops->build_key(device, key, &driver_key);
421 		if (ret)
422 			return ret;
423 	} else {
424 		memcpy(&driver_key, key, sizeof(*key));
425 	}
426 
427 	pool = ib_frmr_pool_find(pools, &driver_key);
428 	if (!pool) {
429 		pool = create_frmr_pool(device, &driver_key);
430 		if (IS_ERR(pool))
431 			return PTR_ERR(pool);
432 	}
433 
434 	spin_lock(&pool->lock);
435 	current_total = pool->in_use + pool->queue.ci + pool->inactive_queue.ci;
436 
437 	if (current_total < pinned_handles)
438 		needed_handles = pinned_handles - current_total;
439 	else
440 		needed_handles = 0;
441 
442 	pool->pinned_handles = pinned_handles;
443 	spin_unlock(&pool->lock);
444 
445 	if (!needed_handles)
446 		goto schedule_aging;
447 
448 	handles = kcalloc(needed_handles, sizeof(*handles), GFP_KERNEL);
449 	if (!handles)
450 		return -ENOMEM;
451 
452 	ret = pools->pool_ops->create_frmrs(device, &driver_key, handles,
453 					    needed_handles);
454 	if (ret) {
455 		kfree(handles);
456 		return ret;
457 	}
458 
459 	spin_lock(&pool->lock);
460 	for (i = 0; i < needed_handles; i++) {
461 		ret = push_handle_to_queue_locked(&pool->queue,
462 						  handles[i]);
463 		if (ret)
464 			break;
465 	}
466 	spin_unlock(&pool->lock);
467 
468 	if (ret) {
469 		/* Destroy handles created but never pushed to the pool. */
470 		pools->pool_ops->destroy_frmrs(device, &handles[i],
471 				needed_handles - i);
472 	}
473 
474 	kfree(handles);
475 
476 schedule_aging:
477 	/* Ensure aging is scheduled to adjust to new pinned handles count */
478 	mod_delayed_work(pools->aging_wq, &pool->aging_work, 0);
479 
480 	return ret;
481 }
482 
483 static int get_frmr_from_pool(struct ib_device *device,
484 			      struct ib_frmr_pool *pool, struct ib_mr *mr)
485 {
486 	struct ib_frmr_pools *pools = device->frmr_pools;
487 	u32 handle;
488 	int err;
489 
490 	spin_lock(&pool->lock);
491 	if (pool->queue.ci == 0) {
492 		if (pool->inactive_queue.ci > 0) {
493 			handle = pop_handle_from_queue_locked(
494 				&pool->inactive_queue);
495 		} else {
496 			spin_unlock(&pool->lock);
497 			err = pools->pool_ops->create_frmrs(device, &pool->key,
498 							    &handle, 1);
499 			if (err)
500 				return err;
501 			spin_lock(&pool->lock);
502 		}
503 	} else {
504 		handle = pop_handle_from_queue_locked(&pool->queue);
505 	}
506 
507 	pool->in_use++;
508 	if (pool->in_use > pool->max_in_use)
509 		pool->max_in_use = pool->in_use;
510 
511 	spin_unlock(&pool->lock);
512 
513 	mr->frmr.pool = pool;
514 	mr->frmr.handle = handle;
515 
516 	return 0;
517 }
518 
519 /*
520  * Pop an FRMR handle from the pool.
521  *
522  * @device: The device to pop the FRMR handle from.
523  * @mr: The MR to pop the FRMR handle from.
524  *
525  * Returns 0 on success, negative error code on failure.
526  */
527 int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr)
528 {
529 	struct ib_frmr_pools *pools = device->frmr_pools;
530 	struct ib_frmr_pool *pool;
531 
532 	if (WARN_ON_ONCE(!pools))
533 		return -EINVAL;
534 
535 	pool = ib_frmr_pool_find(pools, &mr->frmr.key);
536 	if (!pool) {
537 		pool = create_frmr_pool(device, &mr->frmr.key);
538 		if (IS_ERR(pool))
539 			return PTR_ERR(pool);
540 	}
541 
542 	return get_frmr_from_pool(device, pool, mr);
543 }
544 EXPORT_SYMBOL(ib_frmr_pool_pop);
545 
546 /*
547  * Push an FRMR handle back to the pool.
548  *
549  * @device: The device to push the FRMR handle to.
550  * @mr: The MR containing the FRMR handle to push back to the pool.
551  *
552  */
553 void ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr)
554 {
555 	struct ib_frmr_pool *pool = mr->frmr.pool;
556 	struct ib_frmr_pools *pools = device->frmr_pools;
557 	bool schedule_aging = false;
558 	int ret;
559 
560 	spin_lock(&pool->lock);
561 	pool->in_use--;
562 	ret = push_handle_to_queue_locked(&pool->queue, mr->frmr.handle);
563 
564 	/* Schedule aging every time an empty pool becomes non-empty */
565 	if (!ret && pool->queue.ci == 1)
566 		schedule_aging = true;
567 
568 	spin_unlock(&pool->lock);
569 
570 	if (ret) {
571 		pools->pool_ops->destroy_frmrs(device, &mr->frmr.handle, 1);
572 		return;
573 	}
574 
575 	if (schedule_aging)
576 		queue_delayed_work(pools->aging_wq, &pool->aging_work,
577 			secs_to_jiffies(READ_ONCE(pools->aging_period_sec)));
578 
579 }
580 EXPORT_SYMBOL(ib_frmr_pool_push);
581 
582 /*
583  * Drop a handle previously popped from the pool without returning it for
584  * reuse. The caller is responsible for destroying the underlying hardware
585  * resource.
586  */
587 void ib_frmr_pool_drop(struct ib_mr *mr)
588 {
589 	struct ib_frmr_pool *pool = mr->frmr.pool;
590 
591 	spin_lock(&pool->lock);
592 	pool->in_use--;
593 	spin_unlock(&pool->lock);
594 }
595 EXPORT_SYMBOL(ib_frmr_pool_drop);
596