xref: /linux/drivers/infiniband/core/frmr_pools.c (revision 4b0b946019e7376752456380b67e54eea2f10a7c)
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3  * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4  */
5 
6 #include <linux/slab.h>
7 #include <linux/rbtree.h>
8 #include <linux/sort.h>
9 #include <linux/spinlock.h>
10 #include <rdma/ib_verbs.h>
11 #include <linux/timer.h>
12 
13 #include "frmr_pools.h"
14 
15 #define FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS 60
16 
17 static int push_handle_to_queue_locked(struct frmr_queue *queue, u32 handle)
18 {
19 	u32 tmp = queue->ci % NUM_HANDLES_PER_PAGE;
20 	struct frmr_handles_page *page;
21 
22 	if (queue->ci >= queue->num_pages * NUM_HANDLES_PER_PAGE) {
23 		page = kzalloc_obj(*page, GFP_ATOMIC);
24 		if (!page)
25 			return -ENOMEM;
26 		queue->num_pages++;
27 		list_add_tail(&page->list, &queue->pages_list);
28 	} else {
29 		page = list_last_entry(&queue->pages_list,
30 				       struct frmr_handles_page, list);
31 	}
32 
33 	page->handles[tmp] = handle;
34 	queue->ci++;
35 	return 0;
36 }
37 
38 static u32 pop_handle_from_queue_locked(struct frmr_queue *queue)
39 {
40 	u32 tmp = (queue->ci - 1) % NUM_HANDLES_PER_PAGE;
41 	struct frmr_handles_page *page;
42 	u32 handle;
43 
44 	page = list_last_entry(&queue->pages_list, struct frmr_handles_page,
45 			       list);
46 	handle = page->handles[tmp];
47 	queue->ci--;
48 
49 	if (!tmp) {
50 		list_del(&page->list);
51 		queue->num_pages--;
52 		kfree(page);
53 	}
54 
55 	return handle;
56 }
57 
58 static bool pop_frmr_handles_page(struct ib_frmr_pool *pool,
59 				  struct frmr_queue *queue,
60 				  struct frmr_handles_page **page, u32 *count)
61 {
62 	spin_lock(&pool->lock);
63 	if (list_empty(&queue->pages_list)) {
64 		spin_unlock(&pool->lock);
65 		return false;
66 	}
67 
68 	*page = list_first_entry(&queue->pages_list, struct frmr_handles_page,
69 				 list);
70 	list_del(&(*page)->list);
71 	queue->num_pages--;
72 
73 	/* If this is the last page, count may be less than
74 	 * NUM_HANDLES_PER_PAGE.
75 	 */
76 	if (queue->ci >= NUM_HANDLES_PER_PAGE)
77 		*count = NUM_HANDLES_PER_PAGE;
78 	else
79 		*count = queue->ci;
80 
81 	queue->ci -= *count;
82 	spin_unlock(&pool->lock);
83 	return true;
84 }
85 
86 static void destroy_all_handles_in_queue(struct ib_device *device,
87 					 struct ib_frmr_pool *pool,
88 					 struct frmr_queue *queue)
89 {
90 	struct ib_frmr_pools *pools = device->frmr_pools;
91 	struct frmr_handles_page *page;
92 	u32 count;
93 
94 	while (pop_frmr_handles_page(pool, queue, &page, &count)) {
95 		pools->pool_ops->destroy_frmrs(device, page->handles, count);
96 		kfree(page);
97 	}
98 }
99 
100 static bool age_pinned_pool(struct ib_device *device, struct ib_frmr_pool *pool)
101 {
102 	struct ib_frmr_pools *pools = device->frmr_pools;
103 	u32 total, to_destroy, destroyed = 0;
104 	bool has_work = false;
105 	u32 *handles;
106 	u32 handle;
107 
108 	spin_lock(&pool->lock);
109 	total = pool->queue.ci + pool->inactive_queue.ci + pool->in_use;
110 	if (total <= pool->pinned_handles) {
111 		spin_unlock(&pool->lock);
112 		return false;
113 	}
114 
115 	to_destroy = total - pool->pinned_handles;
116 
117 	handles = kcalloc(to_destroy, sizeof(*handles), GFP_ATOMIC);
118 	if (!handles) {
119 		spin_unlock(&pool->lock);
120 		return true;
121 	}
122 
123 	/* Destroy all excess handles in the inactive queue */
124 	while (pool->inactive_queue.ci && destroyed < to_destroy) {
125 		handles[destroyed++] = pop_handle_from_queue_locked(
126 			&pool->inactive_queue);
127 	}
128 
129 	/* Move all handles from regular queue to inactive queue */
130 	while (pool->queue.ci) {
131 		handle = pop_handle_from_queue_locked(&pool->queue);
132 		push_handle_to_queue_locked(&pool->inactive_queue, handle);
133 		has_work = true;
134 	}
135 
136 	spin_unlock(&pool->lock);
137 
138 	if (destroyed)
139 		pools->pool_ops->destroy_frmrs(device, handles, destroyed);
140 	kfree(handles);
141 	return has_work;
142 }
143 
144 static void pool_aging_work(struct work_struct *work)
145 {
146 	struct ib_frmr_pool *pool = container_of(
147 		to_delayed_work(work), struct ib_frmr_pool, aging_work);
148 	struct ib_frmr_pools *pools = pool->device->frmr_pools;
149 	bool has_work = false;
150 
151 	if (pool->pinned_handles) {
152 		has_work = age_pinned_pool(pool->device, pool);
153 		goto out;
154 	}
155 
156 	destroy_all_handles_in_queue(pool->device, pool, &pool->inactive_queue);
157 
158 	/* Move all pages from regular queue to inactive queue */
159 	spin_lock(&pool->lock);
160 	if (pool->queue.ci > 0) {
161 		list_splice_tail_init(&pool->queue.pages_list,
162 				      &pool->inactive_queue.pages_list);
163 		pool->inactive_queue.num_pages = pool->queue.num_pages;
164 		pool->inactive_queue.ci = pool->queue.ci;
165 
166 		pool->queue.num_pages = 0;
167 		pool->queue.ci = 0;
168 		has_work = true;
169 	}
170 	spin_unlock(&pool->lock);
171 
172 out:
173 	/* Reschedule if there are handles to age in next aging period */
174 	if (has_work)
175 		queue_delayed_work(
176 			pools->aging_wq, &pool->aging_work,
177 			secs_to_jiffies(READ_ONCE(pools->aging_period_sec)));
178 }
179 
180 static void destroy_frmr_pool(struct ib_device *device,
181 			      struct ib_frmr_pool *pool)
182 {
183 	cancel_delayed_work_sync(&pool->aging_work);
184 	destroy_all_handles_in_queue(device, pool, &pool->queue);
185 	destroy_all_handles_in_queue(device, pool, &pool->inactive_queue);
186 
187 	kfree(pool);
188 }
189 
190 /*
191  * Initialize the FRMR pools for a device.
192  *
193  * @device: The device to initialize the FRMR pools for.
194  * @pool_ops: The pool operations to use.
195  *
196  * Returns 0 on success, negative error code on failure.
197  */
198 int ib_frmr_pools_init(struct ib_device *device,
199 		       const struct ib_frmr_pool_ops *pool_ops)
200 {
201 	struct ib_frmr_pools *pools;
202 
203 	pools = kzalloc_obj(*pools);
204 	if (!pools)
205 		return -ENOMEM;
206 
207 	pools->rb_root = RB_ROOT;
208 	rwlock_init(&pools->rb_lock);
209 	pools->pool_ops = pool_ops;
210 	pools->aging_wq = create_singlethread_workqueue("frmr_aging_wq");
211 	if (!pools->aging_wq) {
212 		kfree(pools);
213 		return -ENOMEM;
214 	}
215 
216 	pools->aging_period_sec = FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS;
217 
218 	device->frmr_pools = pools;
219 	return 0;
220 }
221 EXPORT_SYMBOL(ib_frmr_pools_init);
222 
223 /*
224  * Clean up the FRMR pools for a device.
225  *
226  * @device: The device to clean up the FRMR pools for.
227  *
228  * Call cleanup only after all FRMR handles have been pushed back to the pool
229  * and no other FRMR operations are allowed to run in parallel.
230  * Ensuring this allows us to save synchronization overhead in pop and push
231  * operations.
232  */
233 void ib_frmr_pools_cleanup(struct ib_device *device)
234 {
235 	struct ib_frmr_pools *pools = device->frmr_pools;
236 	struct ib_frmr_pool *pool, *next;
237 
238 	if (!pools)
239 		return;
240 
241 	rbtree_postorder_for_each_entry_safe(pool, next, &pools->rb_root, node)
242 		destroy_frmr_pool(device, pool);
243 
244 	destroy_workqueue(pools->aging_wq);
245 	kfree(pools);
246 	device->frmr_pools = NULL;
247 }
248 EXPORT_SYMBOL(ib_frmr_pools_cleanup);
249 
250 int ib_frmr_pools_set_aging_period(struct ib_device *device, u32 period_sec)
251 {
252 	struct ib_frmr_pools *pools = device->frmr_pools;
253 	struct ib_frmr_pool *pool;
254 	struct rb_node *node;
255 
256 	if (!pools)
257 		return -EINVAL;
258 
259 	if (period_sec == 0)
260 		return -EINVAL;
261 
262 	WRITE_ONCE(pools->aging_period_sec, period_sec);
263 
264 	read_lock(&pools->rb_lock);
265 	for (node = rb_first(&pools->rb_root); node; node = rb_next(node)) {
266 		pool = rb_entry(node, struct ib_frmr_pool, node);
267 		mod_delayed_work(pools->aging_wq, &pool->aging_work,
268 				 secs_to_jiffies(period_sec));
269 	}
270 	read_unlock(&pools->rb_lock);
271 
272 	return 0;
273 }
274 
/* Three-way comparison of FRMR pool keys, used both for rbtree insertion
 * and lookup.  key1 is the existing pool's key, key2 the requested key.
 * Returns <0, 0 or >0 like memcmp().
 *
 * NOTE(review): the num_dma_blocks clause makes this comparator fuzzy
 * (a pool "equals" a request it can over-satisfy), so the relation is
 * not strictly transitive; insertion uses exact keys, which presumably
 * keeps the tree consistent -- confirm against rb_find_add() usage.
 */
static inline int compare_keys(struct ib_frmr_key *key1,
			       struct ib_frmr_key *key2)
{
	int res;

	/* Fields are compared in fixed order; first difference decides */
	res = cmp_int(key1->ats, key2->ats);
	if (res)
		return res;

	res = cmp_int(key1->access_flags, key2->access_flags);
	if (res)
		return res;

	res = cmp_int(key1->vendor_key, key2->vendor_key);
	if (res)
		return res;

	res = cmp_int(key1->kernel_vendor_key, key2->kernel_vendor_key);
	if (res)
		return res;

	/*
	 * allow using handles that support more DMA blocks, up to twice the
	 * requested number
	 */
	res = cmp_int(key1->num_dma_blocks, key2->num_dma_blocks);
	if (res > 0) {
		/* pool is larger than request but less than 2x: match */
		if (key1->num_dma_blocks - key2->num_dma_blocks <
		    key2->num_dma_blocks)
			return 0;
	}

	return res;
}
309 
310 static int frmr_pool_cmp_find(const void *key, const struct rb_node *node)
311 {
312 	struct ib_frmr_pool *pool = rb_entry(node, struct ib_frmr_pool, node);
313 
314 	return compare_keys(&pool->key, (struct ib_frmr_key *)key);
315 }
316 
317 static int frmr_pool_cmp_add(struct rb_node *new, const struct rb_node *node)
318 {
319 	struct ib_frmr_pool *new_pool =
320 		rb_entry(new, struct ib_frmr_pool, node);
321 	struct ib_frmr_pool *pool = rb_entry(node, struct ib_frmr_pool, node);
322 
323 	return compare_keys(&pool->key, &new_pool->key);
324 }
325 
326 static struct ib_frmr_pool *ib_frmr_pool_find(struct ib_frmr_pools *pools,
327 					      struct ib_frmr_key *key)
328 {
329 	struct ib_frmr_pool *pool;
330 	struct rb_node *node;
331 
332 	/* find operation is done under read lock for performance reasons.
333 	 * The case of threads failing to find the same pool and creating it
334 	 * is handled by the create_frmr_pool function.
335 	 */
336 	read_lock(&pools->rb_lock);
337 	node = rb_find(key, &pools->rb_root, frmr_pool_cmp_find);
338 	pool = rb_entry_safe(node, struct ib_frmr_pool, node);
339 	read_unlock(&pools->rb_lock);
340 
341 	return pool;
342 }
343 
344 static struct ib_frmr_pool *create_frmr_pool(struct ib_device *device,
345 					     struct ib_frmr_key *key)
346 {
347 	struct ib_frmr_pools *pools = device->frmr_pools;
348 	struct ib_frmr_pool *pool;
349 	struct rb_node *existing;
350 
351 	pool = kzalloc_obj(*pool);
352 	if (!pool)
353 		return ERR_PTR(-ENOMEM);
354 
355 	memcpy(&pool->key, key, sizeof(*key));
356 	INIT_LIST_HEAD(&pool->queue.pages_list);
357 	INIT_LIST_HEAD(&pool->inactive_queue.pages_list);
358 	spin_lock_init(&pool->lock);
359 	INIT_DELAYED_WORK(&pool->aging_work, pool_aging_work);
360 	pool->device = device;
361 
362 	write_lock(&pools->rb_lock);
363 	existing = rb_find_add(&pool->node, &pools->rb_root, frmr_pool_cmp_add);
364 	write_unlock(&pools->rb_lock);
365 
366 	/* If a different thread has already created the pool, return it.
367 	 * The insert operation is done under the write lock so we are sure
368 	 * that the pool is not inserted twice.
369 	 */
370 	if (existing) {
371 		kfree(pool);
372 		return rb_entry(existing, struct ib_frmr_pool, node);
373 	}
374 
375 	return pool;
376 }
377 
378 int ib_frmr_pools_set_pinned(struct ib_device *device, struct ib_frmr_key *key,
379 			     u32 pinned_handles)
380 {
381 	struct ib_frmr_pools *pools = device->frmr_pools;
382 	struct ib_frmr_key driver_key = {};
383 	struct ib_frmr_pool *pool;
384 	u32 needed_handles;
385 	u32 current_total;
386 	int i, ret = 0;
387 	u32 *handles;
388 
389 	if (!pools)
390 		return -EINVAL;
391 
392 	ret = ib_check_mr_access(device, key->access_flags);
393 	if (ret)
394 		return ret;
395 
396 	if (pools->pool_ops->build_key) {
397 		ret = pools->pool_ops->build_key(device, key, &driver_key);
398 		if (ret)
399 			return ret;
400 	} else {
401 		memcpy(&driver_key, key, sizeof(*key));
402 	}
403 
404 	pool = ib_frmr_pool_find(pools, &driver_key);
405 	if (!pool) {
406 		pool = create_frmr_pool(device, &driver_key);
407 		if (IS_ERR(pool))
408 			return PTR_ERR(pool);
409 	}
410 
411 	spin_lock(&pool->lock);
412 	current_total = pool->in_use + pool->queue.ci + pool->inactive_queue.ci;
413 
414 	if (current_total < pinned_handles)
415 		needed_handles = pinned_handles - current_total;
416 	else
417 		needed_handles = 0;
418 
419 	pool->pinned_handles = pinned_handles;
420 	spin_unlock(&pool->lock);
421 
422 	if (!needed_handles)
423 		goto schedule_aging;
424 
425 	handles = kcalloc(needed_handles, sizeof(*handles), GFP_KERNEL);
426 	if (!handles)
427 		return -ENOMEM;
428 
429 	ret = pools->pool_ops->create_frmrs(device, key, handles,
430 					    needed_handles);
431 	if (ret) {
432 		kfree(handles);
433 		return ret;
434 	}
435 
436 	spin_lock(&pool->lock);
437 	for (i = 0; i < needed_handles; i++) {
438 		ret = push_handle_to_queue_locked(&pool->queue,
439 						  handles[i]);
440 		if (ret)
441 			goto end;
442 	}
443 
444 end:
445 	spin_unlock(&pool->lock);
446 	kfree(handles);
447 
448 schedule_aging:
449 	/* Ensure aging is scheduled to adjust to new pinned handles count */
450 	mod_delayed_work(pools->aging_wq, &pool->aging_work, 0);
451 
452 	return ret;
453 }
454 
/* Hand one FRMR handle from @pool to @mr, preferring the active queue,
 * then the inactive queue, and finally creating a fresh handle via the
 * driver callback when both queues are empty.
 * Returns 0 on success or the driver's create error.
 */
static int get_frmr_from_pool(struct ib_device *device,
			      struct ib_frmr_pool *pool, struct ib_mr *mr)
{
	struct ib_frmr_pools *pools = device->frmr_pools;
	u32 handle;
	int err;

	spin_lock(&pool->lock);
	if (pool->queue.ci == 0) {
		if (pool->inactive_queue.ci > 0) {
			handle = pop_handle_from_queue_locked(
				&pool->inactive_queue);
		} else {
			/* Drop the lock across the driver callback, which
			 * may sleep; re-take it to update the counters.
			 */
			spin_unlock(&pool->lock);
			err = pools->pool_ops->create_frmrs(device, &pool->key,
							    &handle, 1);
			if (err)
				return err;
			spin_lock(&pool->lock);
		}
	} else {
		handle = pop_handle_from_queue_locked(&pool->queue);
	}

	/* Track the high-water mark of concurrently used handles */
	pool->in_use++;
	if (pool->in_use > pool->max_in_use)
		pool->max_in_use = pool->in_use;

	spin_unlock(&pool->lock);

	mr->frmr.pool = pool;
	mr->frmr.handle = handle;

	return 0;
}
490 
491 /*
492  * Pop an FRMR handle from the pool.
493  *
494  * @device: The device to pop the FRMR handle from.
495  * @mr: The MR to pop the FRMR handle from.
496  *
497  * Returns 0 on success, negative error code on failure.
498  */
499 int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr)
500 {
501 	struct ib_frmr_pools *pools = device->frmr_pools;
502 	struct ib_frmr_pool *pool;
503 
504 	WARN_ON_ONCE(!device->frmr_pools);
505 	pool = ib_frmr_pool_find(pools, &mr->frmr.key);
506 	if (!pool) {
507 		pool = create_frmr_pool(device, &mr->frmr.key);
508 		if (IS_ERR(pool))
509 			return PTR_ERR(pool);
510 	}
511 
512 	return get_frmr_from_pool(device, pool, mr);
513 }
514 EXPORT_SYMBOL(ib_frmr_pool_pop);
515 
516 /*
517  * Push an FRMR handle back to the pool.
518  *
519  * @device: The device to push the FRMR handle to.
520  * @mr: The MR containing the FRMR handle to push back to the pool.
521  *
522  * Returns 0 on success, negative error code on failure.
523  */
524 int ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr)
525 {
526 	struct ib_frmr_pool *pool = mr->frmr.pool;
527 	struct ib_frmr_pools *pools = device->frmr_pools;
528 	bool schedule_aging = false;
529 	int ret;
530 
531 	spin_lock(&pool->lock);
532 	/* Schedule aging every time an empty pool becomes non-empty */
533 	if (pool->queue.ci == 0)
534 		schedule_aging = true;
535 	ret = push_handle_to_queue_locked(&pool->queue, mr->frmr.handle);
536 	if (ret == 0)
537 		pool->in_use--;
538 
539 	spin_unlock(&pool->lock);
540 
541 	if (ret == 0 && schedule_aging)
542 		queue_delayed_work(pools->aging_wq, &pool->aging_work,
543 			secs_to_jiffies(READ_ONCE(pools->aging_period_sec)));
544 
545 	return ret;
546 }
547 EXPORT_SYMBOL(ib_frmr_pool_push);
548