xref: /linux/drivers/infiniband/hw/mlx5/mr.c (revision 6015fb905d89063231ed33bc15be19ef0fc339b8)
1 /*
2  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3  * Copyright (c) 2020, Intel Corporation. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 
35 #include <linux/kref.h>
36 #include <linux/random.h>
37 #include <linux/debugfs.h>
38 #include <linux/export.h>
39 #include <linux/delay.h>
40 #include <linux/dma-buf.h>
41 #include <linux/dma-resv.h>
42 #include <rdma/ib_umem.h>
43 #include <rdma/ib_umem_odp.h>
44 #include <rdma/ib_verbs.h>
45 #include "dm.h"
46 #include "mlx5_ib.h"
47 
48 /*
49  * We can't use an array for xlt_emergency_page because dma_map_single doesn't
50  * work on kernel module memory
51  */
52 void *xlt_emergency_page;
53 static DEFINE_MUTEX(xlt_emergency_page_mutex);
54 
55 enum {
56 	MAX_PENDING_REG_MR = 8,
57 };
58 
59 #define MLX5_UMR_ALIGN 2048
60 
61 static void
62 create_mkey_callback(int status, struct mlx5_async_work *context);
63 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
64 				     u64 iova, int access_flags,
65 				     unsigned int page_size, bool populate);
66 
67 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
68 					  struct ib_pd *pd)
69 {
70 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
71 	bool ro_pci_enabled = pcie_relaxed_ordering_enabled(dev->mdev->pdev);
72 
73 	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
74 	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
75 	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
76 	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
77 	MLX5_SET(mkc, mkc, lr, 1);
78 
79 	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
80 		MLX5_SET(mkc, mkc, relaxed_ordering_write,
81 			 (acc & IB_ACCESS_RELAXED_ORDERING) && ro_pci_enabled);
82 	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
83 		MLX5_SET(mkc, mkc, relaxed_ordering_read,
84 			 (acc & IB_ACCESS_RELAXED_ORDERING) && ro_pci_enabled);
85 
86 	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
87 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
88 	MLX5_SET64(mkc, mkc, start_addr, start_addr);
89 }
90 
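/*
 * The low 8 bits of an mkey (mkey_7_0) are owned by software. Cycle them
 * with a per-device counter so that a recycled mkey index does not
 * reproduce a previously used key value.
 */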
91 static void assign_mkey_variant(struct mlx5_ib_dev *dev,
92 				struct mlx5_ib_mkey *mkey, u32 *in)
93 {
94 	u8 key = atomic_inc_return(&dev->mkey_var);
95 	void *mkc;
96 
97 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
98 	MLX5_SET(mkc, mkc, mkey_7_0, key);
99 	mkey->key = key;
100 }
101 
102 static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
103 			       struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
104 {
105 	int ret;
106 
107 	assign_mkey_variant(dev, mkey, in);
108 	ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
109 	if (!ret)
110 		init_waitqueue_head(&mkey->wait);
111 
112 	return ret;
113 }
114 
115 static int
116 mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
117 		       struct mlx5_ib_mkey *mkey,
118 		       struct mlx5_async_ctx *async_ctx,
119 		       u32 *in, int inlen, u32 *out, int outlen,
120 		       struct mlx5_async_work *context)
121 {
122 	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
123 	assign_mkey_variant(dev, mkey, in);
124 	return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
125 				create_mkey_callback, context);
126 }
127 
128 static int mr_cache_max_order(struct mlx5_ib_dev *dev);
129 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
130 
131 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
132 {
133 	return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
134 }
135 
136 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
137 {
138 	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
139 
140 	return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
141 }
142 
143 static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
144 {
145 	if (status == -ENXIO) /* core driver is not available */
146 		return;
147 
148 	mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
149 	if (status != -EREMOTEIO) /* driver specific failure */
150 		return;
151 
152 	/* Failed in FW, print cmd out failure details */
153 	mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
154 }
155 
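/*
 * Completion handler for the asynchronous mkey creation requested by
 * add_keys(). On success the new MR is added to its cache entry; on
 * failure a fill delay is armed to throttle further cache population.
 */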
156 static void create_mkey_callback(int status, struct mlx5_async_work *context)
157 {
158 	struct mlx5_ib_mr *mr =
159 		container_of(context, struct mlx5_ib_mr, cb_work);
160 	struct mlx5_cache_ent *ent = mr->cache_ent;
161 	struct mlx5_ib_dev *dev = ent->dev;
162 	unsigned long flags;
163 
164 	if (status) {
165 		create_mkey_warn(dev, status, mr->out);
166 		kfree(mr);
167 		spin_lock_irqsave(&ent->lock, flags);
168 		ent->pending--;
169 		WRITE_ONCE(dev->fill_delay, 1);
170 		spin_unlock_irqrestore(&ent->lock, flags);
171 		mod_timer(&dev->delay_timer, jiffies + HZ);
172 		return;
173 	}
174 
175 	mr->mmkey.type = MLX5_MKEY_MR;
176 	mr->mmkey.key |= mlx5_idx_to_mkey(
177 		MLX5_GET(create_mkey_out, mr->out, mkey_index));
178 	init_waitqueue_head(&mr->mmkey.wait);
179 
180 	WRITE_ONCE(dev->cache.last_add, jiffies);
181 
182 	spin_lock_irqsave(&ent->lock, flags);
183 	list_add_tail(&mr->list, &ent->head);
184 	ent->available_mrs++;
185 	ent->total_mrs++;
186 	/* If we are doing fill_to_high_water then keep going. */
187 	queue_adjust_cache_locked(ent);
188 	ent->pending--;
189 	spin_unlock_irqrestore(&ent->lock, flags);
190 }
191 
192 static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
193 {
194 	struct mlx5_ib_mr *mr;
195 
196 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
197 	if (!mr)
198 		return NULL;
199 	mr->cache_ent = ent;
200 
201 	set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
202 	MLX5_SET(mkc, mkc, free, 1);
203 	MLX5_SET(mkc, mkc, umr_en, 1);
204 	MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
205 	MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);
206 
207 	MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
208 	MLX5_SET(mkc, mkc, log_page_size, ent->page);
209 	return mr;
210 }
211 
212 /* Asynchronously schedule new MRs to be populated in the cache. */
213 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
214 {
215 	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
216 	struct mlx5_ib_mr *mr;
217 	void *mkc;
218 	u32 *in;
219 	int err = 0;
220 	int i;
221 
222 	in = kzalloc(inlen, GFP_KERNEL);
223 	if (!in)
224 		return -ENOMEM;
225 
226 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
227 	for (i = 0; i < num; i++) {
228 		mr = alloc_cache_mr(ent, mkc);
229 		if (!mr) {
230 			err = -ENOMEM;
231 			break;
232 		}
233 		spin_lock_irq(&ent->lock);
234 		if (ent->pending >= MAX_PENDING_REG_MR) {
235 			err = -EAGAIN;
236 			spin_unlock_irq(&ent->lock);
237 			kfree(mr);
238 			break;
239 		}
240 		ent->pending++;
241 		spin_unlock_irq(&ent->lock);
242 		err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
243 					     &ent->dev->async_ctx, in, inlen,
244 					     mr->out, sizeof(mr->out),
245 					     &mr->cb_work);
246 		if (err) {
247 			spin_lock_irq(&ent->lock);
248 			ent->pending--;
249 			spin_unlock_irq(&ent->lock);
250 			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
251 			kfree(mr);
252 			break;
253 		}
254 	}
255 
256 	kfree(in);
257 	return err;
258 }
259 
260 /* Synchronously create an MR in the cache */
261 static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
262 {
263 	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
264 	struct mlx5_ib_mr *mr;
265 	void *mkc;
266 	u32 *in;
267 	int err;
268 
269 	in = kzalloc(inlen, GFP_KERNEL);
270 	if (!in)
271 		return ERR_PTR(-ENOMEM);
272 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
273 
274 	mr = alloc_cache_mr(ent, mkc);
275 	if (!mr) {
276 		err = -ENOMEM;
277 		goto free_in;
278 	}
279 
280 	err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen);
281 	if (err)
282 		goto free_mr;
283 
284 	init_waitqueue_head(&mr->mmkey.wait);
285 	mr->mmkey.type = MLX5_MKEY_MR;
286 	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
287 	spin_lock_irq(&ent->lock);
288 	ent->total_mrs++;
289 	spin_unlock_irq(&ent->lock);
290 	kfree(in);
291 	return mr;
292 free_mr:
293 	kfree(mr);
294 free_in:
295 	kfree(in);
296 	return ERR_PTR(err);
297 }
298 
299 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
300 {
301 	struct mlx5_ib_mr *mr;
302 
303 	lockdep_assert_held(&ent->lock);
304 	if (list_empty(&ent->head))
305 		return;
306 	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
307 	list_del(&mr->list);
308 	ent->available_mrs--;
309 	ent->total_mrs--;
310 	spin_unlock_irq(&ent->lock);
311 	mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key);
312 	kfree(mr);
313 	spin_lock_irq(&ent->lock);
314 }
315 
316 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
317 				bool limit_fill)
318 {
319 	int err;
320 
321 	lockdep_assert_held(&ent->lock);
322 
323 	while (true) {
324 		if (limit_fill)
325 			target = ent->limit * 2;
326 		if (target == ent->available_mrs + ent->pending)
327 			return 0;
328 		if (target > ent->available_mrs + ent->pending) {
329 			u32 todo = target - (ent->available_mrs + ent->pending);
330 
331 			spin_unlock_irq(&ent->lock);
332 			err = add_keys(ent, todo);
333 			if (err == -EAGAIN)
334 				usleep_range(3000, 5000);
335 			spin_lock_irq(&ent->lock);
336 			if (err) {
337 				if (err != -EAGAIN)
338 					return err;
339 			} else
340 				return 0;
341 		} else {
342 			remove_cache_mr_locked(ent);
343 		}
344 	}
345 }
346 
347 static ssize_t size_write(struct file *filp, const char __user *buf,
348 			  size_t count, loff_t *pos)
349 {
350 	struct mlx5_cache_ent *ent = filp->private_data;
351 	u32 target;
352 	int err;
353 
354 	err = kstrtou32_from_user(buf, count, 0, &target);
355 	if (err)
356 		return err;
357 
358 	/*
359 	 * Target is the new value of total_mrs the user requests; however, we
360 	 * cannot free MRs that are in use. Compute the target value for
361 	 * available_mrs.
362 	 */
363 	spin_lock_irq(&ent->lock);
364 	if (target < ent->total_mrs - ent->available_mrs) {
365 		err = -EINVAL;
366 		goto err_unlock;
367 	}
368 	target = target - (ent->total_mrs - ent->available_mrs);
369 	if (target < ent->limit || target > ent->limit*2) {
370 		err = -EINVAL;
371 		goto err_unlock;
372 	}
373 	err = resize_available_mrs(ent, target, false);
374 	if (err)
375 		goto err_unlock;
376 	spin_unlock_irq(&ent->lock);
377 
378 	return count;
379 
380 err_unlock:
381 	spin_unlock_irq(&ent->lock);
382 	return err;
383 }
384 
385 static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
386 			 loff_t *pos)
387 {
388 	struct mlx5_cache_ent *ent = filp->private_data;
389 	char lbuf[20];
390 	int err;
391 
392 	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
393 	if (err < 0)
394 		return err;
395 
396 	return simple_read_from_buffer(buf, count, pos, lbuf, err);
397 }
398 
399 static const struct file_operations size_fops = {
400 	.owner	= THIS_MODULE,
401 	.open	= simple_open,
402 	.write	= size_write,
403 	.read	= size_read,
404 };
405 
406 static ssize_t limit_write(struct file *filp, const char __user *buf,
407 			   size_t count, loff_t *pos)
408 {
409 	struct mlx5_cache_ent *ent = filp->private_data;
410 	u32 var;
411 	int err;
412 
413 	err = kstrtou32_from_user(buf, count, 0, &var);
414 	if (err)
415 		return err;
416 
417 	/*
418 	 * Upon set we immediately fill the cache to the high water mark
419 	 * implied by the limit.
420 	 */
421 	spin_lock_irq(&ent->lock);
422 	ent->limit = var;
423 	err = resize_available_mrs(ent, 0, true);
424 	spin_unlock_irq(&ent->lock);
425 	if (err)
426 		return err;
427 	return count;
428 }
429 
430 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
431 			  loff_t *pos)
432 {
433 	struct mlx5_cache_ent *ent = filp->private_data;
434 	char lbuf[20];
435 	int err;
436 
437 	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
438 	if (err < 0)
439 		return err;
440 
441 	return simple_read_from_buffer(buf, count, pos, lbuf, err);
442 }
443 
444 static const struct file_operations limit_fops = {
445 	.owner	= THIS_MODULE,
446 	.open	= simple_open,
447 	.write	= limit_write,
448 	.read	= limit_read,
449 };
450 
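/* True if any cache entry is below its low water mark, i.e. may be refilling. */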
451 static bool someone_adding(struct mlx5_mr_cache *cache)
452 {
453 	unsigned int i;
454 
455 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
456 		struct mlx5_cache_ent *ent = &cache->ent[i];
457 		bool ret;
458 
459 		spin_lock_irq(&ent->lock);
460 		ret = ent->available_mrs < ent->limit;
461 		spin_unlock_irq(&ent->lock);
462 		if (ret)
463 			return true;
464 	}
465 	return false;
466 }
467 
468 /*
469  * Check if the bucket is outside the high/low water mark and schedule an async
470  * update. The cache refill has hysteresis: once the low water mark is hit,
471  * the entry is refilled up to the high mark.
472  */
473 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
474 {
475 	lockdep_assert_held(&ent->lock);
476 
477 	if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
478 		return;
479 	if (ent->available_mrs < ent->limit) {
480 		ent->fill_to_high_water = true;
481 		queue_work(ent->dev->cache.wq, &ent->work);
482 	} else if (ent->fill_to_high_water &&
483 		   ent->available_mrs + ent->pending < 2 * ent->limit) {
484 		/*
485 		 * Once we start populating due to hitting a low water mark
486 		 * continue until we pass the high water mark.
487 		 */
488 		queue_work(ent->dev->cache.wq, &ent->work);
489 	} else if (ent->available_mrs == 2 * ent->limit) {
490 		ent->fill_to_high_water = false;
491 	} else if (ent->available_mrs > 2 * ent->limit) {
492 		/* Queue deletion of excess entries */
493 		ent->fill_to_high_water = false;
494 		if (ent->pending)
495 			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
496 					   msecs_to_jiffies(1000));
497 		else
498 			queue_work(ent->dev->cache.wq, &ent->work);
499 	}
500 }
501 
502 static void __cache_work_func(struct mlx5_cache_ent *ent)
503 {
504 	struct mlx5_ib_dev *dev = ent->dev;
505 	struct mlx5_mr_cache *cache = &dev->cache;
506 	int err;
507 
508 	spin_lock_irq(&ent->lock);
509 	if (ent->disabled)
510 		goto out;
511 
512 	if (ent->fill_to_high_water &&
513 	    ent->available_mrs + ent->pending < 2 * ent->limit &&
514 	    !READ_ONCE(dev->fill_delay)) {
515 		spin_unlock_irq(&ent->lock);
516 		err = add_keys(ent, 1);
517 		spin_lock_irq(&ent->lock);
518 		if (ent->disabled)
519 			goto out;
520 		if (err) {
521 			/*
522 			 * EAGAIN only happens if pending is positive, so we
523 			 * will be rescheduled from create_mkey_callback(). The
524 			 * only failure path here is ENOMEM.
525 			 */
526 			if (err != -EAGAIN) {
527 				mlx5_ib_warn(
528 					dev,
529 					"command failed order %d, err %d\n",
530 					ent->order, err);
531 				queue_delayed_work(cache->wq, &ent->dwork,
532 						   msecs_to_jiffies(1000));
533 			}
534 		}
535 	} else if (ent->available_mrs > 2 * ent->limit) {
536 		bool need_delay;
537 
538 		/*
539 		 * The remove_cache_mr_locked() logic is performed as a
540 		 * garbage collection task. Such a task is intended to run
541 		 * when no other active processes are running.
542 		 *
543 		 * need_resched() returns true if there are user tasks to be
544 		 * activated in the near future.
545 		 *
546 		 * In that case, we skip remove_cache_mr_locked() and postpone
547 		 * the garbage collection work to the next cycle, in order to
548 		 * free CPU resources to other tasks.
549 		 */
550 		spin_unlock_irq(&ent->lock);
551 		need_delay = need_resched() || someone_adding(cache) ||
552 			     !time_after(jiffies,
553 					 READ_ONCE(cache->last_add) + 300 * HZ);
554 		spin_lock_irq(&ent->lock);
555 		if (ent->disabled)
556 			goto out;
557 		if (need_delay)
558 			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
559 		remove_cache_mr_locked(ent);
560 		queue_adjust_cache_locked(ent);
561 	}
562 out:
563 	spin_unlock_irq(&ent->lock);
564 }
565 
566 static void delayed_cache_work_func(struct work_struct *work)
567 {
568 	struct mlx5_cache_ent *ent;
569 
570 	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
571 	__cache_work_func(ent);
572 }
573 
574 static void cache_work_func(struct work_struct *work)
575 {
576 	struct mlx5_cache_ent *ent;
577 
578 	ent = container_of(work, struct mlx5_cache_ent, work);
579 	__cache_work_func(ent);
580 }
581 
582 /* Allocate a special entry from the cache */
583 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
584 				       unsigned int entry, int access_flags)
585 {
586 	struct mlx5_mr_cache *cache = &dev->cache;
587 	struct mlx5_cache_ent *ent;
588 	struct mlx5_ib_mr *mr;
589 
590 	if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY ||
591 		    entry >= ARRAY_SIZE(cache->ent)))
592 		return ERR_PTR(-EINVAL);
593 
594 	/* Matches access in alloc_cache_mr() */
595 	if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
596 		return ERR_PTR(-EOPNOTSUPP);
597 
598 	ent = &cache->ent[entry];
599 	spin_lock_irq(&ent->lock);
600 	if (list_empty(&ent->head)) {
601 		spin_unlock_irq(&ent->lock);
602 		mr = create_cache_mr(ent);
603 		if (IS_ERR(mr))
604 			return mr;
605 	} else {
606 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
607 		list_del(&mr->list);
608 		ent->available_mrs--;
609 		queue_adjust_cache_locked(ent);
610 		spin_unlock_irq(&ent->lock);
611 
612 		mlx5_clear_mr(mr);
613 	}
614 	mr->access_flags = access_flags;
615 	return mr;
616 }
617 
618 /* Return an MR already available in the cache */
619 static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent)
620 {
621 	struct mlx5_ib_mr *mr = NULL;
622 	struct mlx5_cache_ent *ent = req_ent;
623 
624 	spin_lock_irq(&ent->lock);
625 	if (!list_empty(&ent->head)) {
626 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
627 		list_del(&mr->list);
628 		ent->available_mrs--;
629 		queue_adjust_cache_locked(ent);
630 		spin_unlock_irq(&ent->lock);
631 		mlx5_clear_mr(mr);
632 		return mr;
633 	}
634 	queue_adjust_cache_locked(ent);
635 	spin_unlock_irq(&ent->lock);
636 	req_ent->miss++;
637 	return NULL;
638 }
639 
640 static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
641 {
642 	struct mlx5_cache_ent *ent = mr->cache_ent;
643 
644 	spin_lock_irq(&ent->lock);
645 	list_add_tail(&mr->list, &ent->head);
646 	ent->available_mrs++;
647 	queue_adjust_cache_locked(ent);
648 	spin_unlock_irq(&ent->lock);
649 }
650 
651 static void clean_keys(struct mlx5_ib_dev *dev, int c)
652 {
653 	struct mlx5_mr_cache *cache = &dev->cache;
654 	struct mlx5_cache_ent *ent = &cache->ent[c];
655 	struct mlx5_ib_mr *tmp_mr;
656 	struct mlx5_ib_mr *mr;
657 	LIST_HEAD(del_list);
658 
659 	cancel_delayed_work(&ent->dwork);
660 	while (1) {
661 		spin_lock_irq(&ent->lock);
662 		if (list_empty(&ent->head)) {
663 			spin_unlock_irq(&ent->lock);
664 			break;
665 		}
666 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
667 		list_move(&mr->list, &del_list);
668 		ent->available_mrs--;
669 		ent->total_mrs--;
670 		spin_unlock_irq(&ent->lock);
671 		mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
672 	}
673 
674 	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
675 		list_del(&mr->list);
676 		kfree(mr);
677 	}
678 }
679 
680 static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
681 {
682 	if (!mlx5_debugfs_root || dev->is_rep)
683 		return;
684 
685 	debugfs_remove_recursive(dev->cache.root);
686 	dev->cache.root = NULL;
687 }
688 
689 static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
690 {
691 	struct mlx5_mr_cache *cache = &dev->cache;
692 	struct mlx5_cache_ent *ent;
693 	struct dentry *dir;
694 	int i;
695 
696 	if (!mlx5_debugfs_root || dev->is_rep)
697 		return;
698 
699 	cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
700 
701 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
702 		ent = &cache->ent[i];
703 		sprintf(ent->name, "%d", ent->order);
704 		dir = debugfs_create_dir(ent->name, cache->root);
705 		debugfs_create_file("size", 0600, dir, ent, &size_fops);
706 		debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
707 		debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
708 		debugfs_create_u32("miss", 0600, dir, &ent->miss);
709 	}
710 }
711 
712 static void delay_time_func(struct timer_list *t)
713 {
714 	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
715 
716 	WRITE_ONCE(dev->fill_delay, 0);
717 }
718 
719 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
720 {
721 	struct mlx5_mr_cache *cache = &dev->cache;
722 	struct mlx5_cache_ent *ent;
723 	int i;
724 
725 	mutex_init(&dev->slow_path_mutex);
726 	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
727 	if (!cache->wq) {
728 		mlx5_ib_warn(dev, "failed to create work queue\n");
729 		return -ENOMEM;
730 	}
731 
732 	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
733 	timer_setup(&dev->delay_timer, delay_time_func, 0);
734 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
735 		ent = &cache->ent[i];
736 		INIT_LIST_HEAD(&ent->head);
737 		spin_lock_init(&ent->lock);
738 		ent->order = i + 2;
739 		ent->dev = dev;
740 		ent->limit = 0;
741 
742 		INIT_WORK(&ent->work, cache_work_func);
743 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
744 
745 		if (i > MR_CACHE_LAST_STD_ENTRY) {
746 			mlx5_odp_init_mr_cache_entry(ent);
747 			continue;
748 		}
749 
750 		if (ent->order > mr_cache_max_order(dev))
751 			continue;
752 
753 		ent->page = PAGE_SHIFT;
754 		ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) /
755 			   MLX5_IB_UMR_OCTOWORD;
756 		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
757 		if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
758 		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
759 		    mlx5_ib_can_load_pas_with_umr(dev, 0))
760 			ent->limit = dev->mdev->profile.mr_cache[i].limit;
761 		else
762 			ent->limit = 0;
763 		spin_lock_irq(&ent->lock);
764 		queue_adjust_cache_locked(ent);
765 		spin_unlock_irq(&ent->lock);
766 	}
767 
768 	mlx5_mr_cache_debugfs_init(dev);
769 
770 	return 0;
771 }
772 
773 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
774 {
775 	unsigned int i;
776 
777 	if (!dev->cache.wq)
778 		return 0;
779 
780 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
781 		struct mlx5_cache_ent *ent = &dev->cache.ent[i];
782 
783 		spin_lock_irq(&ent->lock);
784 		ent->disabled = true;
785 		spin_unlock_irq(&ent->lock);
786 		cancel_work_sync(&ent->work);
787 		cancel_delayed_work_sync(&ent->dwork);
788 	}
789 
790 	mlx5_mr_cache_debugfs_cleanup(dev);
791 	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
792 
793 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
794 		clean_keys(dev, i);
795 
796 	destroy_workqueue(dev->cache.wq);
797 	del_timer_sync(&dev->delay_timer);
798 
799 	return 0;
800 }
801 
802 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
803 {
804 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
805 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
806 	struct mlx5_ib_mr *mr;
807 	void *mkc;
808 	u32 *in;
809 	int err;
810 
811 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
812 	if (!mr)
813 		return ERR_PTR(-ENOMEM);
814 
815 	in = kzalloc(inlen, GFP_KERNEL);
816 	if (!in) {
817 		err = -ENOMEM;
818 		goto err_free;
819 	}
820 
821 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
822 
823 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
824 	MLX5_SET(mkc, mkc, length64, 1);
825 	set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
826 				      pd);
827 
828 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
829 	if (err)
830 		goto err_in;
831 
832 	kfree(in);
833 	mr->mmkey.type = MLX5_MKEY_MR;
834 	mr->ibmr.lkey = mr->mmkey.key;
835 	mr->ibmr.rkey = mr->mmkey.key;
836 	mr->umem = NULL;
837 
838 	return &mr->ibmr;
839 
840 err_in:
841 	kfree(in);
842 
843 err_free:
844 	kfree(mr);
845 
846 	return ERR_PTR(err);
847 }
848 
849 static int get_octo_len(u64 addr, u64 len, int page_shift)
850 {
851 	u64 page_size = 1ULL << page_shift;
852 	u64 offset;
853 	int npages;
854 
855 	offset = addr & (page_size - 1);
856 	npages = ALIGN(len + offset, page_size) >> page_shift;
857 	return (npages + 1) / 2;
858 }
859 
860 static int mr_cache_max_order(struct mlx5_ib_dev *dev)
861 {
862 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
863 		return MR_CACHE_LAST_STD_ENTRY + 2;
864 	return MLX5_MAX_UMR_SHIFT;
865 }
866 
867 static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
868 {
869 	struct mlx5_ib_umr_context *context =
870 		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
871 
872 	context->status = wc->status;
873 	complete(&context->done);
874 }
875 
876 static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
877 {
878 	context->cqe.done = mlx5_ib_umr_done;
879 	context->status = -1;
880 	init_completion(&context->done);
881 }
882 
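/*
 * Post a UMR work request on the dedicated UMR QP and wait for its
 * completion. The semaphore bounds the number of outstanding UMR
 * operations.
 */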
883 static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
884 				  struct mlx5_umr_wr *umrwr)
885 {
886 	struct umr_common *umrc = &dev->umrc;
887 	const struct ib_send_wr *bad;
888 	int err;
889 	struct mlx5_ib_umr_context umr_context;
890 
891 	mlx5_ib_init_umr_context(&umr_context);
892 	umrwr->wr.wr_cqe = &umr_context.cqe;
893 
894 	down(&umrc->sem);
895 	err = ib_post_send(umrc->qp, &umrwr->wr, &bad);
896 	if (err) {
897 		mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
898 	} else {
899 		wait_for_completion(&umr_context.done);
900 		if (umr_context.status != IB_WC_SUCCESS) {
901 			mlx5_ib_warn(dev, "reg umr failed (%u)\n",
902 				     umr_context.status);
903 			err = -EFAULT;
904 		}
905 	}
906 	up(&umrc->sem);
907 	return err;
908 }
909 
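/*
 * Map the log2 of the required number of translation entries to the cache
 * entry that holds MRs of at least that size, or NULL if it exceeds the
 * largest standard entry.
 */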
910 static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
911 						      unsigned int order)
912 {
913 	struct mlx5_mr_cache *cache = &dev->cache;
914 
915 	if (order < cache->ent[0].order)
916 		return &cache->ent[0];
917 	order = order - cache->ent[0].order;
918 	if (order > MR_CACHE_LAST_STD_ENTRY)
919 		return NULL;
920 	return &cache->ent[order];
921 }
922 
923 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
924 			  u64 length, int access_flags, u64 iova)
925 {
926 	mr->ibmr.lkey = mr->mmkey.key;
927 	mr->ibmr.rkey = mr->mmkey.key;
928 	mr->ibmr.length = length;
929 	mr->ibmr.device = &dev->ib_dev;
930 	mr->ibmr.iova = iova;
931 	mr->access_flags = access_flags;
932 }
933 
934 static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
935 						  u64 iova)
936 {
937 	/*
938 	 * The alignment of iova has already been checked upon entering
939 	 * UVERBS_METHOD_REG_DMABUF_MR
940 	 */
941 	umem->iova = iova;
942 	return PAGE_SIZE;
943 }
944 
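/*
 * Register a user MR, preferring a pre-created mkey from the MR cache.
 * Falls back to creating the mkey synchronously in FW when no suitable
 * cache entry exists or the requested access flags cannot be set via UMR.
 */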
945 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
946 					     struct ib_umem *umem, u64 iova,
947 					     int access_flags)
948 {
949 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
950 	struct mlx5_cache_ent *ent;
951 	struct mlx5_ib_mr *mr;
952 	unsigned int page_size;
953 
954 	if (umem->is_dmabuf)
955 		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
956 	else
957 		page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
958 						     0, iova);
959 	if (WARN_ON(!page_size))
960 		return ERR_PTR(-EINVAL);
961 	ent = mr_cache_ent_from_order(
962 		dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
963 	/*
964 	 * Matches access in alloc_cache_mr(). If the MR can't come from the
965 	 * cache then synchronously create an uncached one.
966 	 */
967 	if (!ent || ent->limit == 0 ||
968 	    !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) {
969 		mutex_lock(&dev->slow_path_mutex);
970 		mr = reg_create(pd, umem, iova, access_flags, page_size, false);
971 		mutex_unlock(&dev->slow_path_mutex);
972 		return mr;
973 	}
974 
975 	mr = get_cache_mr(ent);
976 	if (!mr) {
977 		mr = create_cache_mr(ent);
978 		/*
979 		 * The above already tried to do the same thing as reg_create();
980 		 * there is no reason to try it again.
981 		 */
982 		if (IS_ERR(mr))
983 			return mr;
984 	}
985 
986 	mr->ibmr.pd = pd;
987 	mr->umem = umem;
988 	mr->page_shift = order_base_2(page_size);
989 	set_mr_fields(dev, mr, umem->length, access_flags, iova);
990 
991 	return mr;
992 }
993 
994 #define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
995 			    MLX5_UMR_MTT_ALIGNMENT)
996 #define MLX5_SPARE_UMR_CHUNK 0x10000
997 
998 /*
999  * Allocate a temporary buffer to hold the per-page information to transfer to
1000  * HW. For efficiency this should be as large as it can be, but buffer
1001  * allocation failure is not allowed, so try smaller sizes.
1002  */
1003 static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
1004 {
1005 	const size_t xlt_chunk_align =
1006 		MLX5_UMR_MTT_ALIGNMENT / ent_size;
1007 	size_t size;
1008 	void *res = NULL;
1009 
1010 	static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0);
1011 
1012 	/*
1013 	 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context, just that
1014 	 * the allocation can't trigger any kind of reclaim.
1015 	 */
1016 	might_sleep();
1017 
1018 	gfp_mask |= __GFP_ZERO | __GFP_NORETRY;
1019 
1020 	/*
1021 	 * If the system already has a suitable high order page then just use
1022 	 * that, but don't try hard to create one. This max is about 1M, so a
1023 	 * free x86 huge page will satisfy it.
1024 	 */
1025 	size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
1026 		     MLX5_MAX_UMR_CHUNK);
1027 	*nents = size / ent_size;
1028 	res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
1029 				       get_order(size));
1030 	if (res)
1031 		return res;
1032 
1033 	if (size > MLX5_SPARE_UMR_CHUNK) {
1034 		size = MLX5_SPARE_UMR_CHUNK;
1035 		*nents = size / ent_size;
1036 		res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
1037 					       get_order(size));
1038 		if (res)
1039 			return res;
1040 	}
1041 
1042 	*nents = PAGE_SIZE / ent_size;
1043 	res = (void *)__get_free_page(gfp_mask);
1044 	if (res)
1045 		return res;
1046 
1047 	mutex_lock(&xlt_emergency_page_mutex);
1048 	memset(xlt_emergency_page, 0, PAGE_SIZE);
1049 	return xlt_emergency_page;
1050 }
1051 
1052 static void mlx5_ib_free_xlt(void *xlt, size_t length)
1053 {
1054 	if (xlt == xlt_emergency_page) {
1055 		mutex_unlock(&xlt_emergency_page_mutex);
1056 		return;
1057 	}
1058 
1059 	free_pages((unsigned long)xlt, get_order(length));
1060 }
1061 
1062 /*
1063  * Create an MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for
1064  * submission.
1065  */
1066 static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr,
1067 				   struct mlx5_umr_wr *wr, struct ib_sge *sg,
1068 				   size_t nents, size_t ent_size,
1069 				   unsigned int flags)
1070 {
1071 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1072 	struct device *ddev = &dev->mdev->pdev->dev;
1073 	dma_addr_t dma;
1074 	void *xlt;
1075 
1076 	xlt = mlx5_ib_alloc_xlt(&nents, ent_size,
1077 				flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
1078 								 GFP_KERNEL);
1079 	sg->length = nents * ent_size;
1080 	dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
1081 	if (dma_mapping_error(ddev, dma)) {
1082 		mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
1083 		mlx5_ib_free_xlt(xlt, sg->length);
1084 		return NULL;
1085 	}
1086 	sg->addr = dma;
1087 	sg->lkey = dev->umrc.pd->local_dma_lkey;
1088 
1089 	memset(wr, 0, sizeof(*wr));
1090 	wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
1091 	if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
1092 		wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
1093 	wr->wr.sg_list = sg;
1094 	wr->wr.num_sge = 1;
1095 	wr->wr.opcode = MLX5_IB_WR_UMR;
1096 	wr->pd = mr->ibmr.pd;
1097 	wr->mkey = mr->mmkey.key;
1098 	wr->length = mr->ibmr.length;
1099 	wr->virt_addr = mr->ibmr.iova;
1100 	wr->access_flags = mr->access_flags;
1101 	wr->page_shift = mr->page_shift;
1102 	wr->xlt_size = sg->length;
1103 	return xlt;
1104 }
1105 
1106 static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
1107 				   struct ib_sge *sg)
1108 {
1109 	struct device *ddev = &dev->mdev->pdev->dev;
1110 
1111 	dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
1112 	mlx5_ib_free_xlt(xlt, sg->length);
1113 }
1114 
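/*
 * UMR send flags that must be added to the last work request of an XLT
 * update, derived from the MLX5_IB_UPD_XLT_* flags.
 */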
1115 static unsigned int xlt_wr_final_send_flags(unsigned int flags)
1116 {
1117 	unsigned int res = 0;
1118 
1119 	if (flags & MLX5_IB_UPD_XLT_ENABLE)
1120 		res |= MLX5_IB_SEND_UMR_ENABLE_MR |
1121 		       MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
1122 		       MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1123 	if (flags & MLX5_IB_UPD_XLT_PD || flags & MLX5_IB_UPD_XLT_ACCESS)
1124 		res |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1125 	if (flags & MLX5_IB_UPD_XLT_ADDR)
1126 		res |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1127 	return res;
1128 }
1129 
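/*
 * Update npages XLT entries of an ODP MR, starting at idx, by streaming
 * them through a DMA-mapped bounce buffer in chunks via UMR.
 */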
1130 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
1131 		       int page_shift, int flags)
1132 {
1133 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1134 	struct device *ddev = &dev->mdev->pdev->dev;
1135 	void *xlt;
1136 	struct mlx5_umr_wr wr;
1137 	struct ib_sge sg;
1138 	int err = 0;
1139 	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
1140 			       ? sizeof(struct mlx5_klm)
1141 			       : sizeof(struct mlx5_mtt);
1142 	const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
1143 	const int page_mask = page_align - 1;
1144 	size_t pages_mapped = 0;
1145 	size_t pages_to_map = 0;
1146 	size_t pages_iter;
1147 	size_t size_to_map = 0;
1148 	size_t orig_sg_length;
1149 
1150 	if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
1151 	    !umr_can_use_indirect_mkey(dev))
1152 		return -EPERM;
1153 
1154 	if (WARN_ON(!mr->umem->is_odp))
1155 		return -EINVAL;
1156 
1157 	/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
1158 	 * so we need to align the offset and length accordingly
1159 	 */
1160 	if (idx & page_mask) {
1161 		npages += idx & page_mask;
1162 		idx &= ~page_mask;
1163 	}
1164 	pages_to_map = ALIGN(npages, page_align);
1165 
1166 	xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags);
1167 	if (!xlt)
1168 		return -ENOMEM;
1169 	pages_iter = sg.length / desc_size;
1170 	orig_sg_length = sg.length;
1171 
1172 	if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
1173 		struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
1174 		size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
1175 
1176 		pages_to_map = min_t(size_t, pages_to_map, max_pages);
1177 	}
1178 
1179 	wr.page_shift = page_shift;
1180 
1181 	for (pages_mapped = 0;
1182 	     pages_mapped < pages_to_map && !err;
1183 	     pages_mapped += pages_iter, idx += pages_iter) {
1184 		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
1185 		size_to_map = npages * desc_size;
1186 		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
1187 					DMA_TO_DEVICE);
1188 		mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
1189 		dma_sync_single_for_device(ddev, sg.addr, sg.length,
1190 					   DMA_TO_DEVICE);
1191 
1192 		sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
1193 
1194 		if (pages_mapped + pages_iter >= pages_to_map)
1195 			wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
1196 
1197 		wr.offset = idx * desc_size;
1198 		wr.xlt_size = sg.length;
1199 
1200 		err = mlx5_ib_post_send_wait(dev, &wr);
1201 	}
1202 	sg.length = orig_sg_length;
1203 	mlx5_ib_unmap_free_xlt(dev, xlt, &sg);
1204 	return err;
1205 }
1206 
1207 /*
1208  * Send the DMA list to the HW for a normal MR using UMR.
1209  * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
1210  * flag may be used.
1211  */
1212 int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
1213 {
1214 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1215 	struct device *ddev = &dev->mdev->pdev->dev;
1216 	struct ib_block_iter biter;
1217 	struct mlx5_mtt *cur_mtt;
1218 	struct mlx5_umr_wr wr;
1219 	size_t orig_sg_length;
1220 	struct mlx5_mtt *mtt;
1221 	size_t final_size;
1222 	struct ib_sge sg;
1223 	int err = 0;
1224 
1225 	if (WARN_ON(mr->umem->is_odp))
1226 		return -EINVAL;
1227 
1228 	mtt = mlx5_ib_create_xlt_wr(mr, &wr, &sg,
1229 				    ib_umem_num_dma_blocks(mr->umem,
1230 							   1 << mr->page_shift),
1231 				    sizeof(*mtt), flags);
1232 	if (!mtt)
1233 		return -ENOMEM;
1234 	orig_sg_length = sg.length;
1235 
1236 	cur_mtt = mtt;
1237 	rdma_for_each_block (mr->umem->sgt_append.sgt.sgl, &biter,
1238 			     mr->umem->sgt_append.sgt.nents,
1239 			     BIT(mr->page_shift)) {
1240 		if (cur_mtt == (void *)mtt + sg.length) {
1241 			dma_sync_single_for_device(ddev, sg.addr, sg.length,
1242 						   DMA_TO_DEVICE);
1243 			err = mlx5_ib_post_send_wait(dev, &wr);
1244 			if (err)
1245 				goto err;
1246 			dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
1247 						DMA_TO_DEVICE);
1248 			wr.offset += sg.length;
1249 			cur_mtt = mtt;
1250 		}
1251 
1252 		cur_mtt->ptag =
1253 			cpu_to_be64(rdma_block_iter_dma_address(&biter) |
1254 				    MLX5_IB_MTT_PRESENT);
1255 
1256 		if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
1257 			cur_mtt->ptag = 0;
1258 
1259 		cur_mtt++;
1260 	}
1261 
1262 	final_size = (void *)cur_mtt - (void *)mtt;
1263 	sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT);
1264 	memset(cur_mtt, 0, sg.length - final_size);
1265 	wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
1266 	wr.xlt_size = sg.length;
1267 
1268 	dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
1269 	err = mlx5_ib_post_send_wait(dev, &wr);
1270 
1271 err:
1272 	sg.length = orig_sg_length;
1273 	mlx5_ib_unmap_free_xlt(dev, mtt, &sg);
1274 	return err;
1275 }
1276 
1277 /*
1278  * reg_create() creates the mkey directly in FW, bypassing the MR cache; if
1279  * populate is not set the mkey is left free for a later UMR to enable it.
1280  */
1281 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
1282 				     u64 iova, int access_flags,
1283 				     unsigned int page_size, bool populate)
1284 {
1285 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1286 	struct mlx5_ib_mr *mr;
1287 	__be64 *pas;
1288 	void *mkc;
1289 	int inlen;
1290 	u32 *in;
1291 	int err;
1292 	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
1293 
1294 	if (!page_size)
1295 		return ERR_PTR(-EINVAL);
1296 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1297 	if (!mr)
1298 		return ERR_PTR(-ENOMEM);
1299 
1300 	mr->ibmr.pd = pd;
1301 	mr->access_flags = access_flags;
1302 	mr->page_shift = order_base_2(page_size);
1303 
1304 	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1305 	if (populate)
1306 		inlen += sizeof(*pas) *
1307 			 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
1308 	in = kvzalloc(inlen, GFP_KERNEL);
1309 	if (!in) {
1310 		err = -ENOMEM;
1311 		goto err_1;
1312 	}
1313 	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1314 	if (populate) {
1315 		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
1316 			err = -EINVAL;
1317 			goto err_2;
1318 		}
1319 		mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
1320 				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1321 	}
1322 
1323 	/* The pg_access bit allows setting the access flags
1324 	 * in the page list submitted with the command. */
1325 	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1326 
1327 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1328 	set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
1329 				      populate ? pd : dev->umrc.pd);
1330 	MLX5_SET(mkc, mkc, free, !populate);
1331 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
1332 	MLX5_SET(mkc, mkc, umr_en, 1);
1333 
1334 	MLX5_SET64(mkc, mkc, len, umem->length);
1335 	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1336 	MLX5_SET(mkc, mkc, translations_octword_size,
1337 		 get_octo_len(iova, umem->length, mr->page_shift));
1338 	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
1339 	if (populate) {
1340 		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1341 			 get_octo_len(iova, umem->length, mr->page_shift));
1342 	}
1343 
1344 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1345 	if (err) {
1346 		mlx5_ib_warn(dev, "create mkey failed\n");
1347 		goto err_2;
1348 	}
1349 	mr->mmkey.type = MLX5_MKEY_MR;
1350 	mr->umem = umem;
1351 	set_mr_fields(dev, mr, umem->length, access_flags, iova);
1352 	kvfree(in);
1353 
1354 	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1355 
1356 	return mr;
1357 
1358 err_2:
1359 	kvfree(in);
1360 err_1:
1361 	kfree(mr);
1362 	return ERR_PTR(err);
1363 }
1364 
1365 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1366 				       u64 length, int acc, int mode)
1367 {
1368 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1369 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1370 	struct mlx5_ib_mr *mr;
1371 	void *mkc;
1372 	u32 *in;
1373 	int err;
1374 
1375 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1376 	if (!mr)
1377 		return ERR_PTR(-ENOMEM);
1378 
1379 	in = kzalloc(inlen, GFP_KERNEL);
1380 	if (!in) {
1381 		err = -ENOMEM;
1382 		goto err_free;
1383 	}
1384 
1385 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1386 
1387 	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1388 	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1389 	MLX5_SET64(mkc, mkc, len, length);
1390 	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1391 
1392 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1393 	if (err)
1394 		goto err_in;
1395 
1396 	kfree(in);
1397 
1398 	set_mr_fields(dev, mr, length, acc, start_addr);
1399 
1400 	return &mr->ibmr;
1401 
1402 err_in:
1403 	kfree(in);
1404 
1405 err_free:
1406 	kfree(mr);
1407 
1408 	return ERR_PTR(err);
1409 }
1410 
1411 int mlx5_ib_advise_mr(struct ib_pd *pd,
1412 		      enum ib_uverbs_advise_mr_advice advice,
1413 		      u32 flags,
1414 		      struct ib_sge *sg_list,
1415 		      u32 num_sge,
1416 		      struct uverbs_attr_bundle *attrs)
1417 {
1418 	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1419 	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1420 	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1421 		return -EOPNOTSUPP;
1422 
1423 	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1424 					 sg_list, num_sge);
1425 }
1426 
1427 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1428 				struct ib_dm_mr_attr *attr,
1429 				struct uverbs_attr_bundle *attrs)
1430 {
1431 	struct mlx5_ib_dm *mdm = to_mdm(dm);
1432 	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1433 	u64 start_addr = mdm->dev_addr + attr->offset;
1434 	int mode;
1435 
1436 	switch (mdm->type) {
1437 	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1438 		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1439 			return ERR_PTR(-EINVAL);
1440 
1441 		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1442 		start_addr -= pci_resource_start(dev->pdev, 0);
1443 		break;
1444 	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1445 	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1446 		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1447 			return ERR_PTR(-EINVAL);
1448 
1449 		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1450 		break;
1451 	default:
1452 		return ERR_PTR(-EINVAL);
1453 	}
1454 
1455 	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1456 				 attr->access_flags, mode);
1457 }
1458 
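/*
 * Create an MR over a regular umem. If UMR can load the page list the
 * mkey comes from the MR cache and is enabled by a follow-up UMR;
 * otherwise reg_create() builds a fully populated mkey in FW.
 */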
1459 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
1460 				    u64 iova, int access_flags)
1461 {
1462 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1463 	struct mlx5_ib_mr *mr = NULL;
1464 	bool xlt_with_umr;
1465 	int err;
1466 
1467 	xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, umem->length);
1468 	if (xlt_with_umr) {
1469 		mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
1470 	} else {
1471 		unsigned int page_size = mlx5_umem_find_best_pgsz(
1472 			umem, mkc, log_page_size, 0, iova);
1473 
1474 		mutex_lock(&dev->slow_path_mutex);
1475 		mr = reg_create(pd, umem, iova, access_flags, page_size, true);
1476 		mutex_unlock(&dev->slow_path_mutex);
1477 	}
1478 	if (IS_ERR(mr)) {
1479 		ib_umem_release(umem);
1480 		return ERR_CAST(mr);
1481 	}
1482 
1483 	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1484 
1485 	atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1486 
1487 	if (xlt_with_umr) {
1488 		/*
1489 		 * If the MR was created with reg_create then it will be
1490 		 * configured properly but left disabled. It is safe to go ahead
1491 		 * and configure it again via UMR while enabling it.
1492 		 */
1493 		err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
1494 		if (err) {
1495 			mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1496 			return ERR_PTR(err);
1497 		}
1498 	}
1499 	return &mr->ibmr;
1500 }
1501 
1502 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
1503 					u64 iova, int access_flags,
1504 					struct ib_udata *udata)
1505 {
1506 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1507 	struct ib_umem_odp *odp;
1508 	struct mlx5_ib_mr *mr;
1509 	int err;
1510 
1511 	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1512 		return ERR_PTR(-EOPNOTSUPP);
1513 
1514 	err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
1515 	if (err)
1516 		return ERR_PTR(err);
1517 	if (!start && length == U64_MAX) {
1518 		if (iova != 0)
1519 			return ERR_PTR(-EINVAL);
1520 		if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1521 			return ERR_PTR(-EINVAL);
1522 
1523 		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
1524 		if (IS_ERR(mr))
1525 			return ERR_CAST(mr);
1526 		return &mr->ibmr;
1527 	}
1528 
1529 	/* ODP requires xlt update via umr to work. */
1530 	if (!mlx5_ib_can_load_pas_with_umr(dev, length))
1531 		return ERR_PTR(-EINVAL);
1532 
1533 	odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
1534 			      &mlx5_mn_ops);
1535 	if (IS_ERR(odp))
1536 		return ERR_CAST(odp);
1537 
1538 	mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
1539 	if (IS_ERR(mr)) {
1540 		ib_umem_release(&odp->umem);
1541 		return ERR_CAST(mr);
1542 	}
1543 	xa_init(&mr->implicit_children);
1544 
1545 	odp->private = mr;
1546 	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1547 	if (err)
1548 		goto err_dereg_mr;
1549 
1550 	err = mlx5_ib_init_odp_mr(mr);
1551 	if (err)
1552 		goto err_dereg_mr;
1553 	return &mr->ibmr;
1554 
1555 err_dereg_mr:
1556 	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1557 	return ERR_PTR(err);
1558 }
1559 
1560 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1561 				  u64 iova, int access_flags,
1562 				  struct ib_udata *udata)
1563 {
1564 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1565 	struct ib_umem *umem;
1566 
1567 	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1568 		return ERR_PTR(-EOPNOTSUPP);
1569 
1570 	mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1571 		    start, iova, length, access_flags);
1572 
1573 	if (access_flags & IB_ACCESS_ON_DEMAND)
1574 		return create_user_odp_mr(pd, start, length, iova, access_flags,
1575 					  udata);
1576 	umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
1577 	if (IS_ERR(umem))
1578 		return ERR_CAST(umem);
1579 	return create_real_mr(pd, umem, iova, access_flags);
1580 }
1581 
1582 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
1583 {
1584 	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
1585 	struct mlx5_ib_mr *mr = umem_dmabuf->private;
1586 
1587 	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
1588 
1589 	if (!umem_dmabuf->sgt)
1590 		return;
1591 
1592 	mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
1593 	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
1594 }
1595 
1596 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
1597 	.allow_peer2peer = 1,
1598 	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
1599 };
1600 
1601 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
1602 					 u64 length, u64 virt_addr,
1603 					 int fd, int access_flags,
1604 					 struct ib_udata *udata)
1605 {
1606 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1607 	struct mlx5_ib_mr *mr = NULL;
1608 	struct ib_umem_dmabuf *umem_dmabuf;
1609 	int err;
1610 
1611 	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
1612 	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1613 		return ERR_PTR(-EOPNOTSUPP);
1614 
1615 	mlx5_ib_dbg(dev,
1616 		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
1617 		    offset, virt_addr, length, fd, access_flags);
1618 
1619 	/* dmabuf requires xlt update via umr to work. */
1620 	if (!mlx5_ib_can_load_pas_with_umr(dev, length))
1621 		return ERR_PTR(-EINVAL);
1622 
1623 	umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
1624 					 access_flags,
1625 					 &mlx5_ib_dmabuf_attach_ops);
1626 	if (IS_ERR(umem_dmabuf)) {
1627 		mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
1628 			    PTR_ERR(umem_dmabuf));
1629 		return ERR_CAST(umem_dmabuf);
1630 	}
1631 
1632 	mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
1633 				access_flags);
1634 	if (IS_ERR(mr)) {
1635 		ib_umem_release(&umem_dmabuf->umem);
1636 		return ERR_CAST(mr);
1637 	}
1638 
1639 	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1640 
1641 	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
1642 	umem_dmabuf->private = mr;
1643 	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1644 	if (err)
1645 		goto err_dereg_mr;
1646 
1647 	err = mlx5_ib_init_dmabuf_mr(mr);
1648 	if (err)
1649 		goto err_dereg_mr;
1650 	return &mr->ibmr;
1651 
1652 err_dereg_mr:
1653 	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1654 	return ERR_PTR(err);
1655 }
1656 
1657 /**
1658  * revoke_mr - Fence all DMA on the MR
1659  * @mr: The MR to fence
1660  *
1661  * Upon return the NIC will not be doing any DMA to the pages under the MR,
1662  * and any DMA in progress will be completed. Failure of this function
1663  * indicates the HW has failed catastrophically.
1664  */
1665 static int revoke_mr(struct mlx5_ib_mr *mr)
1666 {
1667 	struct mlx5_umr_wr umrwr = {};
1668 
1669 	if (mr_to_mdev(mr)->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
1670 		return 0;
1671 
1672 	umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
1673 			      MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1674 	umrwr.wr.opcode = MLX5_IB_WR_UMR;
1675 	umrwr.pd = mr_to_mdev(mr)->umrc.pd;
1676 	umrwr.mkey = mr->mmkey.key;
1677 	umrwr.ignore_free_state = 1;
1678 
1679 	return mlx5_ib_post_send_wait(mr_to_mdev(mr), &umrwr);
1680 }
1681 
1682 /*
1683  * True if the change in access flags can be done via UMR; only some access
1684  * flags can be updated.
1685  */
1686 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
1687 				     unsigned int current_access_flags,
1688 				     unsigned int target_access_flags)
1689 {
1690 	unsigned int diffs = current_access_flags ^ target_access_flags;
1691 
1692 	if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
1693 		      IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING))
1694 		return false;
1695 	return mlx5_ib_can_reconfig_with_umr(dev, current_access_flags,
1696 					     target_access_flags);
1697 }
1698 
1699 static int umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1700 			       int access_flags)
1701 {
1702 	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1703 	struct mlx5_umr_wr umrwr = {
1704 		.wr = {
1705 			.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
1706 				      MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS,
1707 			.opcode = MLX5_IB_WR_UMR,
1708 		},
1709 		.mkey = mr->mmkey.key,
1710 		.pd = pd,
1711 		.access_flags = access_flags,
1712 	};
1713 	int err;
1714 
1715 	err = mlx5_ib_post_send_wait(dev, &umrwr);
1716 	if (err)
1717 		return err;
1718 
1719 	mr->access_flags = access_flags;
1720 	return 0;
1721 }
1722 
1723 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
1724 				  struct ib_umem *new_umem,
1725 				  int new_access_flags, u64 iova,
1726 				  unsigned long *page_size)
1727 {
1728 	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1729 
1730 	/* We only track the allocated sizes of MRs from the cache */
1731 	if (!mr->cache_ent)
1732 		return false;
1733 	if (!mlx5_ib_can_load_pas_with_umr(dev, new_umem->length))
1734 		return false;
1735 
1736 	*page_size =
1737 		mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
1738 	if (WARN_ON(!*page_size))
1739 		return false;
1740 	return (1ULL << mr->cache_ent->order) >=
1741 	       ib_umem_num_dma_blocks(new_umem, *page_size);
1742 }
1743 
1744 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1745 			 int access_flags, int flags, struct ib_umem *new_umem,
1746 			 u64 iova, unsigned long page_size)
1747 {
1748 	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1749 	int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
1750 	struct ib_umem *old_umem = mr->umem;
1751 	int err;
1752 
1753 	/*
1754 	 * To keep everything simple the MR is revoked before we start to mess
1755 	 * with it. This ensures the change is atomic relative to any use of the
1756 	 * MR.
1757 	 */
1758 	err = revoke_mr(mr);
1759 	if (err)
1760 		return err;
1761 
1762 	if (flags & IB_MR_REREG_PD) {
1763 		mr->ibmr.pd = pd;
1764 		upd_flags |= MLX5_IB_UPD_XLT_PD;
1765 	}
1766 	if (flags & IB_MR_REREG_ACCESS) {
1767 		mr->access_flags = access_flags;
1768 		upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1769 	}
1770 
1771 	mr->ibmr.length = new_umem->length;
1772 	mr->ibmr.iova = iova;
1774 	mr->page_shift = order_base_2(page_size);
1775 	mr->umem = new_umem;
1776 	err = mlx5_ib_update_mr_pas(mr, upd_flags);
1777 	if (err) {
1778 		/*
1779 		 * The MR is revoked at this point, so it is safe for the caller
1780 		 * to free new_umem.
1781 		 */
1782 		mr->umem = old_umem;
1783 		return err;
1784 	}
1785 
1786 	atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
1787 	ib_umem_release(old_umem);
1788 	atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
1789 	return 0;
1790 }
1791 
1792 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1793 				    u64 length, u64 iova, int new_access_flags,
1794 				    struct ib_pd *new_pd,
1795 				    struct ib_udata *udata)
1796 {
1797 	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1798 	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1799 	int err;
1800 
1801 	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1802 		return ERR_PTR(-EOPNOTSUPP);
1803 
1804 	mlx5_ib_dbg(
1805 		dev,
1806 		"start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1807 		start, iova, length, new_access_flags);
1808 
1809 	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
1810 		return ERR_PTR(-EOPNOTSUPP);
1811 
1812 	if (!(flags & IB_MR_REREG_ACCESS))
1813 		new_access_flags = mr->access_flags;
1814 	if (!(flags & IB_MR_REREG_PD))
1815 		new_pd = ib_mr->pd;
1816 
1817 	if (!(flags & IB_MR_REREG_TRANS)) {
1818 		struct ib_umem *umem;
1819 
1820 		/* Fast path for PD/access change */
1821 		if (can_use_umr_rereg_access(dev, mr->access_flags,
1822 					     new_access_flags)) {
1823 			err = umr_rereg_pd_access(mr, new_pd, new_access_flags);
1824 			if (err)
1825 				return ERR_PTR(err);
1826 			return NULL;
1827 		}
1828 		/* DM or ODP MRs don't have a normal umem, so we can't reuse it */
1829 		if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1830 			goto recreate;
1831 
1832 		/*
1833 		 * Only one active MR can refer to a umem at one time; revoke
1834 		 * the old MR before assigning the umem to the new one.
1835 		 */
1836 		err = revoke_mr(mr);
1837 		if (err)
1838 			return ERR_PTR(err);
1839 		umem = mr->umem;
1840 		mr->umem = NULL;
1841 		atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1842 
1843 		return create_real_mr(new_pd, umem, mr->ibmr.iova,
1844 				      new_access_flags);
1845 	}
1846 
1847 	/*
1848 	 * A DM MR doesn't have a PAS list so we can't re-use it; ODP/dmabuf
1849 	 * MRs do, but the logic around releasing the umem is different.
1850 	 */
1851 	if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1852 		goto recreate;
1853 
1854 	if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
1855 	    can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
1856 		struct ib_umem *new_umem;
1857 		unsigned long page_size;
1858 
1859 		new_umem = ib_umem_get(&dev->ib_dev, start, length,
1860 				       new_access_flags);
1861 		if (IS_ERR(new_umem))
1862 			return ERR_CAST(new_umem);
1863 
1864 		/* Fast path for PAS change */
1865 		if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
1866 					  &page_size)) {
1867 			err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
1868 					    new_umem, iova, page_size);
1869 			if (err) {
1870 				ib_umem_release(new_umem);
1871 				return ERR_PTR(err);
1872 			}
1873 			return NULL;
1874 		}
1875 		return create_real_mr(new_pd, new_umem, iova, new_access_flags);
1876 	}
1877 
1878 	/*
1879 	 * Everything else has no state we can preserve, just create a new MR
1880 	 * from scratch
1881 	 */
1882 recreate:
1883 	return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
1884 				   new_access_flags, udata);
1885 }
1886 
1887 static int
1888 mlx5_alloc_priv_descs(struct ib_device *device,
1889 		      struct mlx5_ib_mr *mr,
1890 		      int ndescs,
1891 		      int desc_size)
1892 {
1893 	struct mlx5_ib_dev *dev = to_mdev(device);
1894 	struct device *ddev = &dev->mdev->pdev->dev;
1895 	int size = ndescs * desc_size;
1896 	int add_size;
1897 	int ret;
1898 
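	/*
	 * kmalloc() only guarantees ARCH_KMALLOC_MINALIGN alignment, but the
	 * descriptor list must be MLX5_UMR_ALIGN aligned for the HW, so
	 * over-allocate and align the pointer by hand below.
	 */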
1899 	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1900 
1901 	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1902 	if (!mr->descs_alloc)
1903 		return -ENOMEM;
1904 
1905 	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1906 
1907 	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
1908 	if (dma_mapping_error(ddev, mr->desc_map)) {
1909 		ret = -ENOMEM;
1910 		goto err;
1911 	}
1912 
1913 	return 0;
1914 err:
1915 	kfree(mr->descs_alloc);
1916 
1917 	return ret;
1918 }
1919 
1920 static void
1921 mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1922 {
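	/*
	 * Only MRs without a umem, i.e. those that went through
	 * mlx5_alloc_priv_descs(), own a private descriptor list.
	 */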
1923 	if (!mr->umem && mr->descs) {
1924 		struct ib_device *device = mr->ibmr.device;
1925 		int size = mr->max_descs * mr->desc_size;
1926 		struct mlx5_ib_dev *dev = to_mdev(device);
1927 
1928 		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
1929 				 DMA_TO_DEVICE);
1930 		kfree(mr->descs_alloc);
1931 		mr->descs = NULL;
1932 	}
1933 }
1934 
1935 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
1936 {
1937 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
1938 	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
1939 	int rc;
1940 
1941 	/*
1942 	 * Any async use of the MR must hold the refcount; once the refcount
1943 	 * goes to zero no other thread (ODP page faults, prefetch, UMR
1944 	 * activity, etc.) can touch the mkey, so it is safe to destroy it.
1945 	 */
1946 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
1947 	    refcount_read(&mr->mmkey.usecount) != 0 &&
1948 	    xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
1949 		mlx5r_deref_wait_odp_mkey(&mr->mmkey);
1950 
1951 	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
1952 		xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
1953 			   mr->sig, NULL, GFP_KERNEL);
1954 
1955 		if (mr->mtt_mr) {
1956 			rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
1957 			if (rc)
1958 				return rc;
1959 			mr->mtt_mr = NULL;
1960 		}
1961 		if (mr->klm_mr) {
1962 			rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
1963 			if (rc)
1964 				return rc;
1965 			mr->klm_mr = NULL;
1966 		}
1967 
1968 		if (mlx5_core_destroy_psv(dev->mdev,
1969 					  mr->sig->psv_memory.psv_idx))
1970 			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1971 				     mr->sig->psv_memory.psv_idx);
1972 		if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
1973 			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1974 				     mr->sig->psv_wire.psv_idx);
1975 		kfree(mr->sig);
1976 		mr->sig = NULL;
1977 	}
1978 
1979 	/* Stop DMA */
1980 	if (mr->cache_ent) {
1981 		if (revoke_mr(mr)) {
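			/*
			 * Revoking failed, so the mkey cannot be returned to
			 * the cache; drop it from the cache accounting and
			 * destroy it below via the !mr->cache_ent path.
			 */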
1982 			spin_lock_irq(&mr->cache_ent->lock);
1983 			mr->cache_ent->total_mrs--;
1984 			spin_unlock_irq(&mr->cache_ent->lock);
1985 			mr->cache_ent = NULL;
1986 		}
1987 	}
1988 	if (!mr->cache_ent) {
1989 		rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
1990 		if (rc)
1991 			return rc;
1992 	}
1993 
1994 	if (mr->umem) {
1995 		bool is_odp = is_odp_mr(mr);
1996 
1997 		if (!is_odp)
1998 			atomic_sub(ib_umem_num_pages(mr->umem),
1999 				   &dev->mdev->priv.reg_pages);
2000 		ib_umem_release(mr->umem);
2001 		if (is_odp)
2002 			mlx5_ib_free_odp_mr(mr);
2003 	}
2004 
2005 	if (mr->cache_ent) {
2006 		mlx5_mr_cache_free(dev, mr);
2007 	} else {
2008 		mlx5_free_priv_descs(mr);
2009 		kfree(mr);
2010 	}
2011 	return 0;
2012 }
2013 
2014 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
2015 				   int access_mode, int page_shift)
2016 {
2017 	void *mkc;
2018 
2019 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2020 
2021 	/* This is only used from the kernel, so setting the PD is OK. */
2022 	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
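	/*
	 * The mkey starts out in the free state; a later IB_WR_REG_MR work
	 * request (a UMR on the ULP's QP) binds it to an actual page list and
	 * enables it.
	 */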
2023 	MLX5_SET(mkc, mkc, free, 1);
2024 	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2025 	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
2026 	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
2027 	MLX5_SET(mkc, mkc, umr_en, 1);
2028 	MLX5_SET(mkc, mkc, log_page_size, page_shift);
2029 }
2030 
2031 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2032 				  int ndescs, int desc_size, int page_shift,
2033 				  int access_mode, u32 *in, int inlen)
2034 {
2035 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
2036 	int err;
2037 
2038 	mr->access_mode = access_mode;
2039 	mr->desc_size = desc_size;
2040 	mr->max_descs = ndescs;
2041 
2042 	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
2043 	if (err)
2044 		return err;
2045 
2046 	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
2047 
2048 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
2049 	if (err)
2050 		goto err_free_descs;
2051 
2052 	mr->mmkey.type = MLX5_MKEY_MR;
2053 	mr->ibmr.lkey = mr->mmkey.key;
2054 	mr->ibmr.rkey = mr->mmkey.key;
2055 
2056 	return 0;
2057 
2058 err_free_descs:
2059 	mlx5_free_priv_descs(mr);
2060 	return err;
2061 }
2062 
2063 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
2064 				u32 max_num_sg, u32 max_num_meta_sg,
2065 				int desc_size, int access_mode)
2066 {
2067 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2068 	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
2069 	int page_shift = 0;
2070 	struct mlx5_ib_mr *mr;
2071 	u32 *in;
2072 	int err;
2073 
2074 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2075 	if (!mr)
2076 		return ERR_PTR(-ENOMEM);
2077 
2078 	mr->ibmr.pd = pd;
2079 	mr->ibmr.device = pd->device;
2080 
2081 	in = kzalloc(inlen, GFP_KERNEL);
2082 	if (!in) {
2083 		err = -ENOMEM;
2084 		goto err_free;
2085 	}
2086 
2087 	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
2088 		page_shift = PAGE_SHIFT;
2089 
2090 	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
2091 				     access_mode, in, inlen);
2092 	if (err)
2093 		goto err_free_in;
2094 
2095 	mr->umem = NULL;
2096 	kfree(in);
2097 
2098 	return mr;
2099 
2100 err_free_in:
2101 	kfree(in);
2102 err_free:
2103 	kfree(mr);
2104 	return ERR_PTR(err);
2105 }
2106 
2107 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2108 				    int ndescs, u32 *in, int inlen)
2109 {
2110 	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
2111 				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
2112 				      inlen);
2113 }
2114 
2115 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2116 				    int ndescs, u32 *in, int inlen)
2117 {
2118 	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
2119 				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2120 }
2121 
2122 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2123 				      int max_num_sg, int max_num_meta_sg,
2124 				      u32 *in, int inlen)
2125 {
2126 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
2127 	u32 psv_index[2];
2128 	void *mkc;
2129 	int err;
2130 
2131 	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
2132 	if (!mr->sig)
2133 		return -ENOMEM;
2134 
2135 	/* create mem & wire PSVs */
2136 	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
2137 	if (err)
2138 		goto err_free_sig;
2139 
2140 	mr->sig->psv_memory.psv_idx = psv_index[0];
2141 	mr->sig->psv_wire.psv_idx = psv_index[1];
2142 
2143 	mr->sig->sig_status_checked = true;
2144 	mr->sig->sig_err_exists = false;
2145 	/* Next UMR, Arm SIGERR */
2146 	++mr->sig->sigerr_count;
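	/*
	 * Allocate both a KLM and an MTT PI MR up front; mlx5_ib_map_mr_sg_pi()
	 * later picks whichever layout can describe the caller's sg lists.
	 */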
2147 	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2148 					 sizeof(struct mlx5_klm),
2149 					 MLX5_MKC_ACCESS_MODE_KLMS);
2150 	if (IS_ERR(mr->klm_mr)) {
2151 		err = PTR_ERR(mr->klm_mr);
2152 		goto err_destroy_psv;
2153 	}
2154 	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2155 					 sizeof(struct mlx5_mtt),
2156 					 MLX5_MKC_ACCESS_MODE_MTT);
2157 	if (IS_ERR(mr->mtt_mr)) {
2158 		err = PTR_ERR(mr->mtt_mr);
2159 		goto err_free_klm_mr;
2160 	}
2161 
2162 	/* Set bsf descriptors for mkey */
2163 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2164 	MLX5_SET(mkc, mkc, bsf_en, 1);
2165 	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
2166 
2167 	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
2168 				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2169 	if (err)
2170 		goto err_free_mtt_mr;
2171 
2172 	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2173 			      mr->sig, GFP_KERNEL));
2174 	if (err)
2175 		goto err_free_descs;
2176 	return 0;
2177 
2178 err_free_descs:
2179 	destroy_mkey(dev, mr);
2180 	mlx5_free_priv_descs(mr);
2181 err_free_mtt_mr:
2182 	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2183 	mr->mtt_mr = NULL;
2184 err_free_klm_mr:
2185 	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2186 	mr->klm_mr = NULL;
2187 err_destroy_psv:
2188 	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
2189 		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2190 			     mr->sig->psv_memory.psv_idx);
2191 	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2192 		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2193 			     mr->sig->psv_wire.psv_idx);
2194 err_free_sig:
2195 	kfree(mr->sig);
2196 
2197 	return err;
2198 }
2199 
2200 static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
2201 					enum ib_mr_type mr_type, u32 max_num_sg,
2202 					u32 max_num_meta_sg)
2203 {
2204 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
2205 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2206 	int ndescs = ALIGN(max_num_sg, 4);
2207 	struct mlx5_ib_mr *mr;
2208 	u32 *in;
2209 	int err;
2210 
2211 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2212 	if (!mr)
2213 		return ERR_PTR(-ENOMEM);
2214 
2215 	in = kzalloc(inlen, GFP_KERNEL);
2216 	if (!in) {
2217 		err = -ENOMEM;
2218 		goto err_free;
2219 	}
2220 
2221 	mr->ibmr.device = pd->device;
2222 	mr->umem = NULL;
2223 
2224 	switch (mr_type) {
2225 	case IB_MR_TYPE_MEM_REG:
2226 		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
2227 		break;
2228 	case IB_MR_TYPE_SG_GAPS:
2229 		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
2230 		break;
2231 	case IB_MR_TYPE_INTEGRITY:
2232 		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
2233 						 max_num_meta_sg, in, inlen);
2234 		break;
2235 	default:
2236 		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
2237 		err = -EINVAL;
2238 	}
2239 
2240 	if (err)
2241 		goto err_free_in;
2242 
2243 	kfree(in);
2244 
2245 	return &mr->ibmr;
2246 
2247 err_free_in:
2248 	kfree(in);
2249 err_free:
2250 	kfree(mr);
2251 	return ERR_PTR(err);
2252 }
2253 
2254 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
2255 			       u32 max_num_sg)
2256 {
2257 	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
2258 }
2259 
2260 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
2261 					 u32 max_num_sg, u32 max_num_meta_sg)
2262 {
2263 	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
2264 				  max_num_meta_sg);
2265 }
2266 
2267 int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
2268 {
2269 	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
2270 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2271 	struct mlx5_ib_mw *mw = to_mmw(ibmw);
2272 	unsigned int ndescs;
2273 	u32 *in = NULL;
2274 	void *mkc;
2275 	int err;
2276 	struct mlx5_ib_alloc_mw req = {};
2277 	struct {
2278 		__u32	comp_mask;
2279 		__u32	response_length;
2280 	} resp = {};
2281 
2282 	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
2283 	if (err)
2284 		return err;
2285 
2286 	if (req.comp_mask || req.reserved1 || req.reserved2)
2287 		return -EOPNOTSUPP;
2288 
2289 	if (udata->inlen > sizeof(req) &&
2290 	    !ib_is_udata_cleared(udata, sizeof(req),
2291 				 udata->inlen - sizeof(req)))
2292 		return -EOPNOTSUPP;
2293 
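	/*
	 * Round the requested number of KLM entries up to a multiple of 4
	 * (minimum of 4) before programming translations_octword_size.
	 */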
2294 	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
2295 
2296 	in = kzalloc(inlen, GFP_KERNEL);
2297 	if (!in) {
2298 		err = -ENOMEM;
2299 		goto free;
2300 	}
2301 
2302 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2303 
2304 	MLX5_SET(mkc, mkc, free, 1);
2305 	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2306 	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
2307 	MLX5_SET(mkc, mkc, umr_en, 1);
2308 	MLX5_SET(mkc, mkc, lr, 1);
2309 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
2310 	MLX5_SET(mkc, mkc, en_rinval, ibmw->type == IB_MW_TYPE_2);
2311 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
2312 
2313 	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
2314 	if (err)
2315 		goto free;
2316 
2317 	mw->mmkey.type = MLX5_MKEY_MW;
2318 	ibmw->rkey = mw->mmkey.key;
2319 	mw->mmkey.ndescs = ndescs;
2320 
2321 	resp.response_length =
2322 		min(offsetofend(typeof(resp), response_length), udata->outlen);
2323 	if (resp.response_length) {
2324 		err = ib_copy_to_udata(udata, &resp, resp.response_length);
2325 		if (err)
2326 			goto free_mkey;
2327 	}
2328 
2329 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2330 		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
2331 		if (err)
2332 			goto free_mkey;
2333 	}
2334 
2335 	kfree(in);
2336 	return 0;
2337 
2338 free_mkey:
2339 	mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
2340 free:
2341 	kfree(in);
2342 	return err;
2343 }
2344 
2345 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
2346 {
2347 	struct mlx5_ib_dev *dev = to_mdev(mw->device);
2348 	struct mlx5_ib_mw *mmw = to_mmw(mw);
2349 
2350 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2351 	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
2352 		/*
2353 		 * pagefault_single_data_segment() may be accessing mmw
2354 		 * if the user bound an ODP MR to this MW.
2355 		 */
2356 		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
2357 
2358 	return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
2359 }
2360 
2361 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
2362 			    struct ib_mr_status *mr_status)
2363 {
2364 	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
2365 	int ret = 0;
2366 
2367 	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
2368 		pr_err("Invalid status check mask\n");
2369 		ret = -EINVAL;
2370 		goto done;
2371 	}
2372 
2373 	mr_status->fail_status = 0;
2374 	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
2375 		if (!mmr->sig) {
2376 			ret = -EINVAL;
2377 			pr_err("signature status check requested on a non-signature enabled MR\n");
2378 			goto done;
2379 		}
2380 
2381 		mmr->sig->sig_status_checked = true;
2382 		if (!mmr->sig->sig_err_exists)
2383 			goto done;
2384 
2385 		if (ibmr->lkey == mmr->sig->err_item.key) {
2386 			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
2387 			       sizeof(mr_status->sig_err));
2388 		} else {
2389 			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
2390 			mr_status->sig_err.sig_err_offset = 0;
2391 			mr_status->sig_err.key = mmr->sig->err_item.key;
2392 		}
2393 
2394 		mmr->sig->sig_err_exists = false;
2395 		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2396 	}
2397 
2398 done:
2399 	return ret;
2400 }
2401 
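/*
 * Describe the data and metadata each with a single physical address using
 * the PD's local DMA lkey; this only works when each sg list has exactly one
 * entry, but then no translation descriptors are needed at all.
 */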
2402 static int
2403 mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2404 			int data_sg_nents, unsigned int *data_sg_offset,
2405 			struct scatterlist *meta_sg, int meta_sg_nents,
2406 			unsigned int *meta_sg_offset)
2407 {
2408 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2409 	unsigned int sg_offset = 0;
2410 	int n = 0;
2411 
2412 	mr->meta_length = 0;
2413 	if (data_sg_nents == 1) {
2414 		n++;
2415 		mr->mmkey.ndescs = 1;
2416 		if (data_sg_offset)
2417 			sg_offset = *data_sg_offset;
2418 		mr->data_length = sg_dma_len(data_sg) - sg_offset;
2419 		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2420 		if (meta_sg_nents == 1) {
2421 			n++;
2422 			mr->meta_ndescs = 1;
2423 			if (meta_sg_offset)
2424 				sg_offset = *meta_sg_offset;
2425 			else
2426 				sg_offset = 0;
2427 			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2428 			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2429 		}
2430 		ibmr->length = mr->data_length + mr->meta_length;
2431 	}
2432 
2433 	return n;
2434 }
2435 
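/*
 * Fill the MR's descriptor list with one KLM (address, byte count, lkey) per
 * sg entry, data entries first and then metadata entries, all referencing the
 * PD's local_dma_lkey.
 */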
2436 static int
2437 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2438 		   struct scatterlist *sgl,
2439 		   unsigned short sg_nents,
2440 		   unsigned int *sg_offset_p,
2441 		   struct scatterlist *meta_sgl,
2442 		   unsigned short meta_sg_nents,
2443 		   unsigned int *meta_sg_offset_p)
2444 {
2445 	struct scatterlist *sg = sgl;
2446 	struct mlx5_klm *klms = mr->descs;
2447 	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2448 	u32 lkey = mr->ibmr.pd->local_dma_lkey;
2449 	int i, j = 0;
2450 
2451 	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2452 	mr->ibmr.length = 0;
2453 
2454 	for_each_sg(sgl, sg, sg_nents, i) {
2455 		if (unlikely(i >= mr->max_descs))
2456 			break;
2457 		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2458 		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2459 		klms[i].key = cpu_to_be32(lkey);
2460 		mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2461 
2462 		sg_offset = 0;
2463 	}
2464 
2465 	if (sg_offset_p)
2466 		*sg_offset_p = sg_offset;
2467 
2468 	mr->mmkey.ndescs = i;
2469 	mr->data_length = mr->ibmr.length;
2470 
2471 	if (meta_sg_nents) {
2472 		sg = meta_sgl;
2473 		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2474 		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2475 			if (unlikely(i + j >= mr->max_descs))
2476 				break;
2477 			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2478 						     sg_offset);
2479 			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2480 							 sg_offset);
2481 			klms[i + j].key = cpu_to_be32(lkey);
2482 			mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2483 
2484 			sg_offset = 0;
2485 		}
2486 		if (meta_sg_offset_p)
2487 			*meta_sg_offset_p = sg_offset;
2488 
2489 		mr->meta_ndescs = j;
2490 		mr->meta_length = mr->ibmr.length - mr->data_length;
2491 	}
2492 
2493 	return i + j;
2494 }
2495 
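/*
 * ib_sg_to_pages() callback: append one page address, tagged with the read
 * and write enable bits, to the MTT descriptor list.
 */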
2496 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2497 {
2498 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2499 	__be64 *descs;
2500 
2501 	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
2502 		return -ENOMEM;
2503 
2504 	descs = mr->descs;
2505 	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2506 
2507 	return 0;
2508 }
2509 
2510 static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2511 {
2512 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2513 	__be64 *descs;
2514 
2515 	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
2516 		return -ENOMEM;
2517 
2518 	descs = mr->descs;
2519 	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
2520 		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2521 
2522 	return 0;
2523 }
2524 
2525 static int
2526 mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2527 			 int data_sg_nents, unsigned int *data_sg_offset,
2528 			 struct scatterlist *meta_sg, int meta_sg_nents,
2529 			 unsigned int *meta_sg_offset)
2530 {
2531 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2532 	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2533 	int n;
2534 
2535 	pi_mr->mmkey.ndescs = 0;
2536 	pi_mr->meta_ndescs = 0;
2537 	pi_mr->meta_length = 0;
2538 
2539 	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2540 				   pi_mr->desc_size * pi_mr->max_descs,
2541 				   DMA_TO_DEVICE);
2542 
2543 	pi_mr->ibmr.page_size = ibmr->page_size;
2544 	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2545 			   mlx5_set_page);
2546 	if (n != data_sg_nents)
2547 		return n;
2548 
2549 	pi_mr->data_iova = pi_mr->ibmr.iova;
2550 	pi_mr->data_length = pi_mr->ibmr.length;
2551 	pi_mr->ibmr.length = pi_mr->data_length;
2552 	ibmr->length = pi_mr->data_length;
2553 
2554 	if (meta_sg_nents) {
2555 		u64 page_mask = ~((u64)ibmr->page_size - 1);
2556 		u64 iova = pi_mr->data_iova;
2557 
2558 		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2559 				    meta_sg_offset, mlx5_set_page_pi);
2560 
2561 		pi_mr->meta_length = pi_mr->ibmr.length;
2562 		/*
2563 		 * PI address for the HW is the offset of the metadata address
2564 		 * relative to the first data page address.
2565 		 * It equals the first data page address + the size of the data
2566 		 * pages + the metadata offset within the first metadata page.
2567 		 */
2568 		pi_mr->pi_iova = (iova & page_mask) +
2569 				 pi_mr->mmkey.ndescs * ibmr->page_size +
2570 				 (pi_mr->ibmr.iova & ~page_mask);
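		/*
		 * Worked example with hypothetical numbers: for a 4K page size,
		 * data at iova 0x10002000 covering 3 pages (mmkey.ndescs == 3)
		 * and metadata at 0x20000100, pi_iova becomes
		 * 0x10002000 + 3 * 0x1000 + 0x100 = 0x10005100.
		 */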
2571 		/*
2572 		 * In order to use one MTT MR for both data and metadata, we also
2573 		 * register the gaps between the end of the data and the start of
2574 		 * the metadata (the sig MR verifies that the HW accesses the
2575 		 * right addresses). This mapping is safe because we use an
2576 		 * internal mkey for the registration.
2577 		 */
2578 		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2579 		pi_mr->ibmr.iova = iova;
2580 		ibmr->length += pi_mr->meta_length;
2581 	}
2582 
2583 	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2584 				      pi_mr->desc_size * pi_mr->max_descs,
2585 				      DMA_TO_DEVICE);
2586 
2587 	return n;
2588 }
2589 
2590 static int
2591 mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2592 			 int data_sg_nents, unsigned int *data_sg_offset,
2593 			 struct scatterlist *meta_sg, int meta_sg_nents,
2594 			 unsigned int *meta_sg_offset)
2595 {
2596 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2597 	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2598 	int n;
2599 
2600 	pi_mr->mmkey.ndescs = 0;
2601 	pi_mr->meta_ndescs = 0;
2602 	pi_mr->meta_length = 0;
2603 
2604 	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2605 				   pi_mr->desc_size * pi_mr->max_descs,
2606 				   DMA_TO_DEVICE);
2607 
2608 	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2609 			       meta_sg, meta_sg_nents, meta_sg_offset);
2610 
2611 	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2612 				      pi_mr->desc_size * pi_mr->max_descs,
2613 				      DMA_TO_DEVICE);
2614 
2615 	/* This is a zero-based memory region */
2616 	pi_mr->data_iova = 0;
2617 	pi_mr->ibmr.iova = 0;
2618 	pi_mr->pi_iova = pi_mr->data_length;
2619 	ibmr->length = pi_mr->ibmr.length;
2620 
2621 	return n;
2622 }
2623 
2624 int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2625 			 int data_sg_nents, unsigned int *data_sg_offset,
2626 			 struct scatterlist *meta_sg, int meta_sg_nents,
2627 			 unsigned int *meta_sg_offset)
2628 {
2629 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2630 	struct mlx5_ib_mr *pi_mr = NULL;
2631 	int n;
2632 
2633 	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2634 
2635 	mr->mmkey.ndescs = 0;
2636 	mr->data_length = 0;
2637 	mr->data_iova = 0;
2638 	mr->meta_ndescs = 0;
2639 	mr->pi_iova = 0;
2640 	/*
2641 	 * As a performance optimization, if possible, there is no need to
2642 	 * perform a UMR operation to register the data/metadata buffers.
2643 	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
2644 	 * Fall back to UMR only in case of a failure.
2645 	 */
2646 	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2647 				    data_sg_offset, meta_sg, meta_sg_nents,
2648 				    meta_sg_offset);
2649 	if (n == data_sg_nents + meta_sg_nents)
2650 		goto out;
2651 	/*
2652 	 * As a performance optimization, if possible, there is no need to map
2653 	 * the sg lists to KLM descriptors. First try to map the sg lists to MTT
2654 	 * descriptors and fall back to KLM only in case of a failure.
2655 	 * It's more efficient for the HW to work with MTT descriptors
2656 	 * (especially under high load).
2657 	 * Use KLM (indirect access) only if it's mandatory.
2658 	 */
2659 	pi_mr = mr->mtt_mr;
2660 	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2661 				     data_sg_offset, meta_sg, meta_sg_nents,
2662 				     meta_sg_offset);
2663 	if (n == data_sg_nents + meta_sg_nents)
2664 		goto out;
2665 
2666 	pi_mr = mr->klm_mr;
2667 	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2668 				     data_sg_offset, meta_sg, meta_sg_nents,
2669 				     meta_sg_offset);
2670 	if (unlikely(n != data_sg_nents + meta_sg_nents))
2671 		return -ENOMEM;
2672 
2673 out:
2674 	/* This is a zero-based memory region */
2675 	ibmr->iova = 0;
2676 	mr->pi_mr = pi_mr;
2677 	if (pi_mr)
2678 		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2679 	else
2680 		ibmr->sig_attrs->meta_length = mr->meta_length;
2681 
2682 	return 0;
2683 }
2684 
2685 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2686 		      unsigned int *sg_offset)
2687 {
2688 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2689 	int n;
2690 
2691 	mr->mmkey.ndescs = 0;
2692 
2693 	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2694 				   mr->desc_size * mr->max_descs,
2695 				   DMA_TO_DEVICE);
2696 
2697 	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2698 		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2699 				       NULL);
2700 	else
2701 		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2702 				mlx5_set_page);
2703 
2704 	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2705 				      mr->desc_size * mr->max_descs,
2706 				      DMA_TO_DEVICE);
2707 
2708 	return n;
2709 }
2710