xref: /linux/drivers/infiniband/hw/mlx5/mr.c (revision 4fd18fc38757217c746aa063ba9e4729814dc737)
1 /*
2  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 
34 #include <linux/kref.h>
35 #include <linux/random.h>
36 #include <linux/debugfs.h>
37 #include <linux/export.h>
38 #include <linux/delay.h>
39 #include <rdma/ib_umem.h>
40 #include <rdma/ib_umem_odp.h>
41 #include <rdma/ib_verbs.h>
42 #include "mlx5_ib.h"
43 
/*
 * Last-resort buffer handed out by mlx5_ib_alloc_xlt() when every page
 * allocation attempt fails. We can't use an array for xlt_emergency_page
 * because dma_map_single() doesn't work on kernel module memory.
 */
void *xlt_emergency_page;
/* Held for as long as a caller is using xlt_emergency_page. */
static DEFINE_MUTEX(xlt_emergency_page_mutex);

enum {
	/* Cap on in-flight asynchronous mkey creations per cache entry */
	MAX_PENDING_REG_MR = 8,
};

/* NOTE(review): not referenced in the visible part of this file. */
#define MLX5_UMR_ALIGN 2048
56 
57 static void
58 create_mkey_callback(int status, struct mlx5_async_work *context);
59 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
60 				     u64 iova, int access_flags,
61 				     unsigned int page_size, bool populate);
62 
63 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
64 					  struct ib_pd *pd)
65 {
66 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
67 
68 	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
69 	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
70 	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
71 	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
72 	MLX5_SET(mkc, mkc, lr, 1);
73 
74 	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
75 		MLX5_SET(mkc, mkc, relaxed_ordering_write,
76 			 !!(acc & IB_ACCESS_RELAXED_ORDERING));
77 	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
78 		MLX5_SET(mkc, mkc, relaxed_ordering_read,
79 			 !!(acc & IB_ACCESS_RELAXED_ORDERING));
80 
81 	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
82 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
83 	MLX5_SET64(mkc, mkc, start_addr, start_addr);
84 }
85 
86 static void
87 assign_mkey_variant(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
88 		    u32 *in)
89 {
90 	u8 key = atomic_inc_return(&dev->mkey_var);
91 	void *mkc;
92 
93 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
94 	MLX5_SET(mkc, mkc, mkey_7_0, key);
95 	mkey->key = key;
96 }
97 
98 static int
99 mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
100 		    u32 *in, int inlen)
101 {
102 	assign_mkey_variant(dev, mkey, in);
103 	return mlx5_core_create_mkey(dev->mdev, mkey, in, inlen);
104 }
105 
106 static int
107 mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
108 		       struct mlx5_core_mkey *mkey,
109 		       struct mlx5_async_ctx *async_ctx,
110 		       u32 *in, int inlen, u32 *out, int outlen,
111 		       struct mlx5_async_work *context)
112 {
113 	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
114 	assign_mkey_variant(dev, mkey, in);
115 	return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
116 				create_mkey_callback, context);
117 }
118 
119 static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
120 static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
121 static int mr_cache_max_order(struct mlx5_ib_dev *dev);
122 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
123 
124 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
125 {
126 	return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
127 }
128 
129 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
130 {
131 	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
132 
133 	return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
134 }
135 
136 static void create_mkey_callback(int status, struct mlx5_async_work *context)
137 {
138 	struct mlx5_ib_mr *mr =
139 		container_of(context, struct mlx5_ib_mr, cb_work);
140 	struct mlx5_cache_ent *ent = mr->cache_ent;
141 	struct mlx5_ib_dev *dev = ent->dev;
142 	unsigned long flags;
143 
144 	if (status) {
145 		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
146 		kfree(mr);
147 		spin_lock_irqsave(&ent->lock, flags);
148 		ent->pending--;
149 		WRITE_ONCE(dev->fill_delay, 1);
150 		spin_unlock_irqrestore(&ent->lock, flags);
151 		mod_timer(&dev->delay_timer, jiffies + HZ);
152 		return;
153 	}
154 
155 	mr->mmkey.type = MLX5_MKEY_MR;
156 	mr->mmkey.key |= mlx5_idx_to_mkey(
157 		MLX5_GET(create_mkey_out, mr->out, mkey_index));
158 
159 	WRITE_ONCE(dev->cache.last_add, jiffies);
160 
161 	spin_lock_irqsave(&ent->lock, flags);
162 	list_add_tail(&mr->list, &ent->head);
163 	ent->available_mrs++;
164 	ent->total_mrs++;
165 	/* If we are doing fill_to_high_water then keep going. */
166 	queue_adjust_cache_locked(ent);
167 	ent->pending--;
168 	spin_unlock_irqrestore(&ent->lock, flags);
169 }
170 
171 static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
172 {
173 	struct mlx5_ib_mr *mr;
174 
175 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
176 	if (!mr)
177 		return NULL;
178 	mr->cache_ent = ent;
179 
180 	set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
181 	MLX5_SET(mkc, mkc, free, 1);
182 	MLX5_SET(mkc, mkc, umr_en, 1);
183 	MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
184 	MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);
185 
186 	MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
187 	MLX5_SET(mkc, mkc, log_page_size, ent->page);
188 	return mr;
189 }
190 
191 /* Asynchronously schedule new MRs to be populated in the cache. */
192 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
193 {
194 	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
195 	struct mlx5_ib_mr *mr;
196 	void *mkc;
197 	u32 *in;
198 	int err = 0;
199 	int i;
200 
201 	in = kzalloc(inlen, GFP_KERNEL);
202 	if (!in)
203 		return -ENOMEM;
204 
205 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
206 	for (i = 0; i < num; i++) {
207 		mr = alloc_cache_mr(ent, mkc);
208 		if (!mr) {
209 			err = -ENOMEM;
210 			break;
211 		}
212 		spin_lock_irq(&ent->lock);
213 		if (ent->pending >= MAX_PENDING_REG_MR) {
214 			err = -EAGAIN;
215 			spin_unlock_irq(&ent->lock);
216 			kfree(mr);
217 			break;
218 		}
219 		ent->pending++;
220 		spin_unlock_irq(&ent->lock);
221 		err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
222 					     &ent->dev->async_ctx, in, inlen,
223 					     mr->out, sizeof(mr->out),
224 					     &mr->cb_work);
225 		if (err) {
226 			spin_lock_irq(&ent->lock);
227 			ent->pending--;
228 			spin_unlock_irq(&ent->lock);
229 			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
230 			kfree(mr);
231 			break;
232 		}
233 	}
234 
235 	kfree(in);
236 	return err;
237 }
238 
239 /* Synchronously create a MR in the cache */
240 static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
241 {
242 	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
243 	struct mlx5_ib_mr *mr;
244 	void *mkc;
245 	u32 *in;
246 	int err;
247 
248 	in = kzalloc(inlen, GFP_KERNEL);
249 	if (!in)
250 		return ERR_PTR(-ENOMEM);
251 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
252 
253 	mr = alloc_cache_mr(ent, mkc);
254 	if (!mr) {
255 		err = -ENOMEM;
256 		goto free_in;
257 	}
258 
259 	err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey, in, inlen);
260 	if (err)
261 		goto free_mr;
262 
263 	mr->mmkey.type = MLX5_MKEY_MR;
264 	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
265 	spin_lock_irq(&ent->lock);
266 	ent->total_mrs++;
267 	spin_unlock_irq(&ent->lock);
268 	kfree(in);
269 	return mr;
270 free_mr:
271 	kfree(mr);
272 free_in:
273 	kfree(in);
274 	return ERR_PTR(err);
275 }
276 
277 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
278 {
279 	struct mlx5_ib_mr *mr;
280 
281 	lockdep_assert_held(&ent->lock);
282 	if (list_empty(&ent->head))
283 		return;
284 	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
285 	list_del(&mr->list);
286 	ent->available_mrs--;
287 	ent->total_mrs--;
288 	spin_unlock_irq(&ent->lock);
289 	mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey);
290 	kfree(mr);
291 	spin_lock_irq(&ent->lock);
292 }
293 
294 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
295 				bool limit_fill)
296 {
297 	int err;
298 
299 	lockdep_assert_held(&ent->lock);
300 
301 	while (true) {
302 		if (limit_fill)
303 			target = ent->limit * 2;
304 		if (target == ent->available_mrs + ent->pending)
305 			return 0;
306 		if (target > ent->available_mrs + ent->pending) {
307 			u32 todo = target - (ent->available_mrs + ent->pending);
308 
309 			spin_unlock_irq(&ent->lock);
310 			err = add_keys(ent, todo);
311 			if (err == -EAGAIN)
312 				usleep_range(3000, 5000);
313 			spin_lock_irq(&ent->lock);
314 			if (err) {
315 				if (err != -EAGAIN)
316 					return err;
317 			} else
318 				return 0;
319 		} else {
320 			remove_cache_mr_locked(ent);
321 		}
322 	}
323 }
324 
325 static ssize_t size_write(struct file *filp, const char __user *buf,
326 			  size_t count, loff_t *pos)
327 {
328 	struct mlx5_cache_ent *ent = filp->private_data;
329 	u32 target;
330 	int err;
331 
332 	err = kstrtou32_from_user(buf, count, 0, &target);
333 	if (err)
334 		return err;
335 
336 	/*
337 	 * Target is the new value of total_mrs the user requests, however we
338 	 * cannot free MRs that are in use. Compute the target value for
339 	 * available_mrs.
340 	 */
341 	spin_lock_irq(&ent->lock);
342 	if (target < ent->total_mrs - ent->available_mrs) {
343 		err = -EINVAL;
344 		goto err_unlock;
345 	}
346 	target = target - (ent->total_mrs - ent->available_mrs);
347 	if (target < ent->limit || target > ent->limit*2) {
348 		err = -EINVAL;
349 		goto err_unlock;
350 	}
351 	err = resize_available_mrs(ent, target, false);
352 	if (err)
353 		goto err_unlock;
354 	spin_unlock_irq(&ent->lock);
355 
356 	return count;
357 
358 err_unlock:
359 	spin_unlock_irq(&ent->lock);
360 	return err;
361 }
362 
363 static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
364 			 loff_t *pos)
365 {
366 	struct mlx5_cache_ent *ent = filp->private_data;
367 	char lbuf[20];
368 	int err;
369 
370 	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
371 	if (err < 0)
372 		return err;
373 
374 	return simple_read_from_buffer(buf, count, pos, lbuf, err);
375 }
376 
377 static const struct file_operations size_fops = {
378 	.owner	= THIS_MODULE,
379 	.open	= simple_open,
380 	.write	= size_write,
381 	.read	= size_read,
382 };
383 
384 static ssize_t limit_write(struct file *filp, const char __user *buf,
385 			   size_t count, loff_t *pos)
386 {
387 	struct mlx5_cache_ent *ent = filp->private_data;
388 	u32 var;
389 	int err;
390 
391 	err = kstrtou32_from_user(buf, count, 0, &var);
392 	if (err)
393 		return err;
394 
395 	/*
396 	 * Upon set we immediately fill the cache to high water mark implied by
397 	 * the limit.
398 	 */
399 	spin_lock_irq(&ent->lock);
400 	ent->limit = var;
401 	err = resize_available_mrs(ent, 0, true);
402 	spin_unlock_irq(&ent->lock);
403 	if (err)
404 		return err;
405 	return count;
406 }
407 
408 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
409 			  loff_t *pos)
410 {
411 	struct mlx5_cache_ent *ent = filp->private_data;
412 	char lbuf[20];
413 	int err;
414 
415 	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
416 	if (err < 0)
417 		return err;
418 
419 	return simple_read_from_buffer(buf, count, pos, lbuf, err);
420 }
421 
422 static const struct file_operations limit_fops = {
423 	.owner	= THIS_MODULE,
424 	.open	= simple_open,
425 	.write	= limit_write,
426 	.read	= limit_read,
427 };
428 
429 static bool someone_adding(struct mlx5_mr_cache *cache)
430 {
431 	unsigned int i;
432 
433 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
434 		struct mlx5_cache_ent *ent = &cache->ent[i];
435 		bool ret;
436 
437 		spin_lock_irq(&ent->lock);
438 		ret = ent->available_mrs < ent->limit;
439 		spin_unlock_irq(&ent->lock);
440 		if (ret)
441 			return true;
442 	}
443 	return false;
444 }
445 
446 /*
447  * Check if the bucket is outside the high/low water mark and schedule an async
448  * update. The cache refill has hysteresis, once the low water mark is hit it is
449  * refilled up to the high mark.
450  */
451 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
452 {
453 	lockdep_assert_held(&ent->lock);
454 
455 	if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
456 		return;
457 	if (ent->available_mrs < ent->limit) {
458 		ent->fill_to_high_water = true;
459 		queue_work(ent->dev->cache.wq, &ent->work);
460 	} else if (ent->fill_to_high_water &&
461 		   ent->available_mrs + ent->pending < 2 * ent->limit) {
462 		/*
463 		 * Once we start populating due to hitting a low water mark
464 		 * continue until we pass the high water mark.
465 		 */
466 		queue_work(ent->dev->cache.wq, &ent->work);
467 	} else if (ent->available_mrs == 2 * ent->limit) {
468 		ent->fill_to_high_water = false;
469 	} else if (ent->available_mrs > 2 * ent->limit) {
470 		/* Queue deletion of excess entries */
471 		ent->fill_to_high_water = false;
472 		if (ent->pending)
473 			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
474 					   msecs_to_jiffies(1000));
475 		else
476 			queue_work(ent->dev->cache.wq, &ent->work);
477 	}
478 }
479 
480 static void __cache_work_func(struct mlx5_cache_ent *ent)
481 {
482 	struct mlx5_ib_dev *dev = ent->dev;
483 	struct mlx5_mr_cache *cache = &dev->cache;
484 	int err;
485 
486 	spin_lock_irq(&ent->lock);
487 	if (ent->disabled)
488 		goto out;
489 
490 	if (ent->fill_to_high_water &&
491 	    ent->available_mrs + ent->pending < 2 * ent->limit &&
492 	    !READ_ONCE(dev->fill_delay)) {
493 		spin_unlock_irq(&ent->lock);
494 		err = add_keys(ent, 1);
495 		spin_lock_irq(&ent->lock);
496 		if (ent->disabled)
497 			goto out;
498 		if (err) {
499 			/*
500 			 * EAGAIN only happens if pending is positive, so we
501 			 * will be rescheduled from reg_mr_callback(). The only
502 			 * failure path here is ENOMEM.
503 			 */
504 			if (err != -EAGAIN) {
505 				mlx5_ib_warn(
506 					dev,
507 					"command failed order %d, err %d\n",
508 					ent->order, err);
509 				queue_delayed_work(cache->wq, &ent->dwork,
510 						   msecs_to_jiffies(1000));
511 			}
512 		}
513 	} else if (ent->available_mrs > 2 * ent->limit) {
514 		bool need_delay;
515 
516 		/*
517 		 * The remove_cache_mr() logic is performed as garbage
518 		 * collection task. Such task is intended to be run when no
519 		 * other active processes are running.
520 		 *
521 		 * The need_resched() will return TRUE if there are user tasks
522 		 * to be activated in near future.
523 		 *
524 		 * In such case, we don't execute remove_cache_mr() and postpone
525 		 * the garbage collection work to try to run in next cycle, in
526 		 * order to free CPU resources to other tasks.
527 		 */
528 		spin_unlock_irq(&ent->lock);
529 		need_delay = need_resched() || someone_adding(cache) ||
530 			     time_after(jiffies,
531 					READ_ONCE(cache->last_add) + 300 * HZ);
532 		spin_lock_irq(&ent->lock);
533 		if (ent->disabled)
534 			goto out;
535 		if (need_delay)
536 			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
537 		remove_cache_mr_locked(ent);
538 		queue_adjust_cache_locked(ent);
539 	}
540 out:
541 	spin_unlock_irq(&ent->lock);
542 }
543 
544 static void delayed_cache_work_func(struct work_struct *work)
545 {
546 	struct mlx5_cache_ent *ent;
547 
548 	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
549 	__cache_work_func(ent);
550 }
551 
552 static void cache_work_func(struct work_struct *work)
553 {
554 	struct mlx5_cache_ent *ent;
555 
556 	ent = container_of(work, struct mlx5_cache_ent, work);
557 	__cache_work_func(ent);
558 }
559 
560 /* Allocate a special entry from the cache */
561 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
562 				       unsigned int entry, int access_flags)
563 {
564 	struct mlx5_mr_cache *cache = &dev->cache;
565 	struct mlx5_cache_ent *ent;
566 	struct mlx5_ib_mr *mr;
567 
568 	if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY ||
569 		    entry >= ARRAY_SIZE(cache->ent)))
570 		return ERR_PTR(-EINVAL);
571 
572 	/* Matches access in alloc_cache_mr() */
573 	if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
574 		return ERR_PTR(-EOPNOTSUPP);
575 
576 	ent = &cache->ent[entry];
577 	spin_lock_irq(&ent->lock);
578 	if (list_empty(&ent->head)) {
579 		spin_unlock_irq(&ent->lock);
580 		mr = create_cache_mr(ent);
581 		if (IS_ERR(mr))
582 			return mr;
583 	} else {
584 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
585 		list_del(&mr->list);
586 		ent->available_mrs--;
587 		queue_adjust_cache_locked(ent);
588 		spin_unlock_irq(&ent->lock);
589 	}
590 	mr->access_flags = access_flags;
591 	return mr;
592 }
593 
594 /* Return a MR already available in the cache */
595 static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent)
596 {
597 	struct mlx5_ib_dev *dev = req_ent->dev;
598 	struct mlx5_ib_mr *mr = NULL;
599 	struct mlx5_cache_ent *ent = req_ent;
600 
601 	/* Try larger MR pools from the cache to satisfy the allocation */
602 	for (; ent != &dev->cache.ent[MR_CACHE_LAST_STD_ENTRY + 1]; ent++) {
603 		mlx5_ib_dbg(dev, "order %u, cache index %zu\n", ent->order,
604 			    ent - dev->cache.ent);
605 
606 		spin_lock_irq(&ent->lock);
607 		if (!list_empty(&ent->head)) {
608 			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
609 					      list);
610 			list_del(&mr->list);
611 			ent->available_mrs--;
612 			queue_adjust_cache_locked(ent);
613 			spin_unlock_irq(&ent->lock);
614 			break;
615 		}
616 		queue_adjust_cache_locked(ent);
617 		spin_unlock_irq(&ent->lock);
618 	}
619 
620 	if (!mr)
621 		req_ent->miss++;
622 
623 	return mr;
624 }
625 
626 static void detach_mr_from_cache(struct mlx5_ib_mr *mr)
627 {
628 	struct mlx5_cache_ent *ent = mr->cache_ent;
629 
630 	mr->cache_ent = NULL;
631 	spin_lock_irq(&ent->lock);
632 	ent->total_mrs--;
633 	spin_unlock_irq(&ent->lock);
634 }
635 
636 void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
637 {
638 	struct mlx5_cache_ent *ent = mr->cache_ent;
639 
640 	if (!ent)
641 		return;
642 
643 	if (mlx5_mr_cache_invalidate(mr)) {
644 		detach_mr_from_cache(mr);
645 		destroy_mkey(dev, mr);
646 		kfree(mr);
647 		return;
648 	}
649 
650 	spin_lock_irq(&ent->lock);
651 	list_add_tail(&mr->list, &ent->head);
652 	ent->available_mrs++;
653 	queue_adjust_cache_locked(ent);
654 	spin_unlock_irq(&ent->lock);
655 }
656 
657 static void clean_keys(struct mlx5_ib_dev *dev, int c)
658 {
659 	struct mlx5_mr_cache *cache = &dev->cache;
660 	struct mlx5_cache_ent *ent = &cache->ent[c];
661 	struct mlx5_ib_mr *tmp_mr;
662 	struct mlx5_ib_mr *mr;
663 	LIST_HEAD(del_list);
664 
665 	cancel_delayed_work(&ent->dwork);
666 	while (1) {
667 		spin_lock_irq(&ent->lock);
668 		if (list_empty(&ent->head)) {
669 			spin_unlock_irq(&ent->lock);
670 			break;
671 		}
672 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
673 		list_move(&mr->list, &del_list);
674 		ent->available_mrs--;
675 		ent->total_mrs--;
676 		spin_unlock_irq(&ent->lock);
677 		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
678 	}
679 
680 	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
681 		list_del(&mr->list);
682 		kfree(mr);
683 	}
684 }
685 
686 static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
687 {
688 	if (!mlx5_debugfs_root || dev->is_rep)
689 		return;
690 
691 	debugfs_remove_recursive(dev->cache.root);
692 	dev->cache.root = NULL;
693 }
694 
695 static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
696 {
697 	struct mlx5_mr_cache *cache = &dev->cache;
698 	struct mlx5_cache_ent *ent;
699 	struct dentry *dir;
700 	int i;
701 
702 	if (!mlx5_debugfs_root || dev->is_rep)
703 		return;
704 
705 	cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
706 
707 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
708 		ent = &cache->ent[i];
709 		sprintf(ent->name, "%d", ent->order);
710 		dir = debugfs_create_dir(ent->name, cache->root);
711 		debugfs_create_file("size", 0600, dir, ent, &size_fops);
712 		debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
713 		debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
714 		debugfs_create_u32("miss", 0600, dir, &ent->miss);
715 	}
716 }
717 
718 static void delay_time_func(struct timer_list *t)
719 {
720 	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
721 
722 	WRITE_ONCE(dev->fill_delay, 0);
723 }
724 
725 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
726 {
727 	struct mlx5_mr_cache *cache = &dev->cache;
728 	struct mlx5_cache_ent *ent;
729 	int i;
730 
731 	mutex_init(&dev->slow_path_mutex);
732 	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
733 	if (!cache->wq) {
734 		mlx5_ib_warn(dev, "failed to create work queue\n");
735 		return -ENOMEM;
736 	}
737 
738 	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
739 	timer_setup(&dev->delay_timer, delay_time_func, 0);
740 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
741 		ent = &cache->ent[i];
742 		INIT_LIST_HEAD(&ent->head);
743 		spin_lock_init(&ent->lock);
744 		ent->order = i + 2;
745 		ent->dev = dev;
746 		ent->limit = 0;
747 
748 		INIT_WORK(&ent->work, cache_work_func);
749 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
750 
751 		if (i > MR_CACHE_LAST_STD_ENTRY) {
752 			mlx5_odp_init_mr_cache_entry(ent);
753 			continue;
754 		}
755 
756 		if (ent->order > mr_cache_max_order(dev))
757 			continue;
758 
759 		ent->page = PAGE_SHIFT;
760 		ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) /
761 			   MLX5_IB_UMR_OCTOWORD;
762 		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
763 		if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
764 		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
765 		    mlx5_ib_can_load_pas_with_umr(dev, 0))
766 			ent->limit = dev->mdev->profile->mr_cache[i].limit;
767 		else
768 			ent->limit = 0;
769 		spin_lock_irq(&ent->lock);
770 		queue_adjust_cache_locked(ent);
771 		spin_unlock_irq(&ent->lock);
772 	}
773 
774 	mlx5_mr_cache_debugfs_init(dev);
775 
776 	return 0;
777 }
778 
779 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
780 {
781 	unsigned int i;
782 
783 	if (!dev->cache.wq)
784 		return 0;
785 
786 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
787 		struct mlx5_cache_ent *ent = &dev->cache.ent[i];
788 
789 		spin_lock_irq(&ent->lock);
790 		ent->disabled = true;
791 		spin_unlock_irq(&ent->lock);
792 		cancel_work_sync(&ent->work);
793 		cancel_delayed_work_sync(&ent->dwork);
794 	}
795 
796 	mlx5_mr_cache_debugfs_cleanup(dev);
797 	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
798 
799 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
800 		clean_keys(dev, i);
801 
802 	destroy_workqueue(dev->cache.wq);
803 	del_timer_sync(&dev->delay_timer);
804 
805 	return 0;
806 }
807 
808 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
809 {
810 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
811 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
812 	struct mlx5_ib_mr *mr;
813 	void *mkc;
814 	u32 *in;
815 	int err;
816 
817 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
818 	if (!mr)
819 		return ERR_PTR(-ENOMEM);
820 
821 	in = kzalloc(inlen, GFP_KERNEL);
822 	if (!in) {
823 		err = -ENOMEM;
824 		goto err_free;
825 	}
826 
827 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
828 
829 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
830 	MLX5_SET(mkc, mkc, length64, 1);
831 	set_mkc_access_pd_addr_fields(mkc, acc, 0, pd);
832 
833 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
834 	if (err)
835 		goto err_in;
836 
837 	kfree(in);
838 	mr->mmkey.type = MLX5_MKEY_MR;
839 	mr->ibmr.lkey = mr->mmkey.key;
840 	mr->ibmr.rkey = mr->mmkey.key;
841 	mr->umem = NULL;
842 
843 	return &mr->ibmr;
844 
845 err_in:
846 	kfree(in);
847 
848 err_free:
849 	kfree(mr);
850 
851 	return ERR_PTR(err);
852 }
853 
854 static int get_octo_len(u64 addr, u64 len, int page_shift)
855 {
856 	u64 page_size = 1ULL << page_shift;
857 	u64 offset;
858 	int npages;
859 
860 	offset = addr & (page_size - 1);
861 	npages = ALIGN(len + offset, page_size) >> page_shift;
862 	return (npages + 1) / 2;
863 }
864 
865 static int mr_cache_max_order(struct mlx5_ib_dev *dev)
866 {
867 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
868 		return MR_CACHE_LAST_STD_ENTRY + 2;
869 	return MLX5_MAX_UMR_SHIFT;
870 }
871 
872 static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
873 {
874 	struct mlx5_ib_umr_context *context =
875 		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
876 
877 	context->status = wc->status;
878 	complete(&context->done);
879 }
880 
881 static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
882 {
883 	context->cqe.done = mlx5_ib_umr_done;
884 	context->status = -1;
885 	init_completion(&context->done);
886 }
887 
888 static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
889 				  struct mlx5_umr_wr *umrwr)
890 {
891 	struct umr_common *umrc = &dev->umrc;
892 	const struct ib_send_wr *bad;
893 	int err;
894 	struct mlx5_ib_umr_context umr_context;
895 
896 	mlx5_ib_init_umr_context(&umr_context);
897 	umrwr->wr.wr_cqe = &umr_context.cqe;
898 
899 	down(&umrc->sem);
900 	err = ib_post_send(umrc->qp, &umrwr->wr, &bad);
901 	if (err) {
902 		mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
903 	} else {
904 		wait_for_completion(&umr_context.done);
905 		if (umr_context.status != IB_WC_SUCCESS) {
906 			mlx5_ib_warn(dev, "reg umr failed (%u)\n",
907 				     umr_context.status);
908 			err = -EFAULT;
909 		}
910 	}
911 	up(&umrc->sem);
912 	return err;
913 }
914 
915 static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
916 						      unsigned int order)
917 {
918 	struct mlx5_mr_cache *cache = &dev->cache;
919 
920 	if (order < cache->ent[0].order)
921 		return &cache->ent[0];
922 	order = order - cache->ent[0].order;
923 	if (order > MR_CACHE_LAST_STD_ENTRY)
924 		return NULL;
925 	return &cache->ent[order];
926 }
927 
928 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
929 			  u64 length, int access_flags)
930 {
931 	mr->ibmr.lkey = mr->mmkey.key;
932 	mr->ibmr.rkey = mr->mmkey.key;
933 	mr->ibmr.length = length;
934 	mr->ibmr.device = &dev->ib_dev;
935 	mr->access_flags = access_flags;
936 }
937 
938 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
939 					     struct ib_umem *umem, u64 iova,
940 					     int access_flags)
941 {
942 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
943 	struct mlx5_cache_ent *ent;
944 	struct mlx5_ib_mr *mr;
945 	unsigned int page_size;
946 
947 	page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 0, iova);
948 	if (WARN_ON(!page_size))
949 		return ERR_PTR(-EINVAL);
950 	ent = mr_cache_ent_from_order(
951 		dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
952 	/*
953 	 * Matches access in alloc_cache_mr(). If the MR can't come from the
954 	 * cache then synchronously create an uncached one.
955 	 */
956 	if (!ent || ent->limit == 0 ||
957 	    !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) {
958 		mutex_lock(&dev->slow_path_mutex);
959 		mr = reg_create(pd, umem, iova, access_flags, page_size, false);
960 		mutex_unlock(&dev->slow_path_mutex);
961 		return mr;
962 	}
963 
964 	mr = get_cache_mr(ent);
965 	if (!mr) {
966 		mr = create_cache_mr(ent);
967 		/*
968 		 * The above already tried to do the same stuff as reg_create(),
969 		 * no reason to try it again.
970 		 */
971 		if (IS_ERR(mr))
972 			return mr;
973 	}
974 
975 	mr->ibmr.pd = pd;
976 	mr->umem = umem;
977 	mr->access_flags = access_flags;
978 	mr->desc_size = sizeof(struct mlx5_mtt);
979 	mr->mmkey.iova = iova;
980 	mr->mmkey.size = umem->length;
981 	mr->mmkey.pd = to_mpd(pd)->pdn;
982 	mr->page_shift = order_base_2(page_size);
983 	mr->umem = umem;
984 	set_mr_fields(dev, mr, umem->length, access_flags);
985 
986 	return mr;
987 }
988 
/* Upper bound, in bytes, of the first buffer mlx5_ib_alloc_xlt() tries. */
#define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
			    MLX5_UMR_MTT_ALIGNMENT)
/* Size, in bytes, of the smaller second attempt in mlx5_ib_alloc_xlt(). */
#define MLX5_SPARE_UMR_CHUNK 0x10000
992 
993 /*
994  * Allocate a temporary buffer to hold the per-page information to transfer to
995  * HW. For efficiency this should be as large as it can be, but buffer
996  * allocation failure is not allowed, so try smaller sizes.
997  */
998 static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
999 {
1000 	const size_t xlt_chunk_align =
1001 		MLX5_UMR_MTT_ALIGNMENT / sizeof(ent_size);
1002 	size_t size;
1003 	void *res = NULL;
1004 
1005 	static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0);
1006 
1007 	/*
1008 	 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the
1009 	 * allocation can't trigger any kind of reclaim.
1010 	 */
1011 	might_sleep();
1012 
1013 	gfp_mask |= __GFP_ZERO;
1014 
1015 	/*
1016 	 * If the system already has a suitable high order page then just use
1017 	 * that, but don't try hard to create one. This max is about 1M, so a
1018 	 * free x86 huge page will satisfy it.
1019 	 */
1020 	size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
1021 		     MLX5_MAX_UMR_CHUNK);
1022 	*nents = size / ent_size;
1023 	res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
1024 				       get_order(size));
1025 	if (res)
1026 		return res;
1027 
1028 	if (size > MLX5_SPARE_UMR_CHUNK) {
1029 		size = MLX5_SPARE_UMR_CHUNK;
1030 		*nents = get_order(size) / ent_size;
1031 		res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
1032 					       get_order(size));
1033 		if (res)
1034 			return res;
1035 	}
1036 
1037 	*nents = PAGE_SIZE / ent_size;
1038 	res = (void *)__get_free_page(gfp_mask);
1039 	if (res)
1040 		return res;
1041 
1042 	mutex_lock(&xlt_emergency_page_mutex);
1043 	memset(xlt_emergency_page, 0, PAGE_SIZE);
1044 	return xlt_emergency_page;
1045 }
1046 
1047 static void mlx5_ib_free_xlt(void *xlt, size_t length)
1048 {
1049 	if (xlt == xlt_emergency_page) {
1050 		mutex_unlock(&xlt_emergency_page_mutex);
1051 		return;
1052 	}
1053 
1054 	free_pages((unsigned long)xlt, get_order(length));
1055 }
1056 
1057 /*
1058  * Create a MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for
1059  * submission.
1060  */
1061 static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr,
1062 				   struct mlx5_umr_wr *wr, struct ib_sge *sg,
1063 				   size_t nents, size_t ent_size,
1064 				   unsigned int flags)
1065 {
1066 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1067 	struct device *ddev = &dev->mdev->pdev->dev;
1068 	dma_addr_t dma;
1069 	void *xlt;
1070 
1071 	xlt = mlx5_ib_alloc_xlt(&nents, ent_size,
1072 				flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
1073 								 GFP_KERNEL);
1074 	sg->length = nents * ent_size;
1075 	dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
1076 	if (dma_mapping_error(ddev, dma)) {
1077 		mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
1078 		mlx5_ib_free_xlt(xlt, sg->length);
1079 		return NULL;
1080 	}
1081 	sg->addr = dma;
1082 	sg->lkey = dev->umrc.pd->local_dma_lkey;
1083 
1084 	memset(wr, 0, sizeof(*wr));
1085 	wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
1086 	if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
1087 		wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
1088 	wr->wr.sg_list = sg;
1089 	wr->wr.num_sge = 1;
1090 	wr->wr.opcode = MLX5_IB_WR_UMR;
1091 	wr->pd = mr->ibmr.pd;
1092 	wr->mkey = mr->mmkey.key;
1093 	wr->length = mr->mmkey.size;
1094 	wr->virt_addr = mr->mmkey.iova;
1095 	wr->access_flags = mr->access_flags;
1096 	wr->page_shift = mr->page_shift;
1097 	wr->xlt_size = sg->length;
1098 	return xlt;
1099 }
1100 
1101 static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
1102 				   struct ib_sge *sg)
1103 {
1104 	struct device *ddev = &dev->mdev->pdev->dev;
1105 
1106 	dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
1107 	mlx5_ib_free_xlt(xlt, sg->length);
1108 }
1109 
1110 static unsigned int xlt_wr_final_send_flags(unsigned int flags)
1111 {
1112 	unsigned int res = 0;
1113 
1114 	if (flags & MLX5_IB_UPD_XLT_ENABLE)
1115 		res |= MLX5_IB_SEND_UMR_ENABLE_MR |
1116 		       MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
1117 		       MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1118 	if (flags & MLX5_IB_UPD_XLT_PD || flags & MLX5_IB_UPD_XLT_ACCESS)
1119 		res |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1120 	if (flags & MLX5_IB_UPD_XLT_ADDR)
1121 		res |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1122 	return res;
1123 }
1124 
1125 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
1126 		       int page_shift, int flags)
1127 {
1128 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1129 	struct device *ddev = &dev->mdev->pdev->dev;
1130 	void *xlt;
1131 	struct mlx5_umr_wr wr;
1132 	struct ib_sge sg;
1133 	int err = 0;
1134 	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
1135 			       ? sizeof(struct mlx5_klm)
1136 			       : sizeof(struct mlx5_mtt);
1137 	const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
1138 	const int page_mask = page_align - 1;
1139 	size_t pages_mapped = 0;
1140 	size_t pages_to_map = 0;
1141 	size_t pages_iter;
1142 	size_t size_to_map = 0;
1143 	size_t orig_sg_length;
1144 
1145 	if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
1146 	    !umr_can_use_indirect_mkey(dev))
1147 		return -EPERM;
1148 
1149 	if (WARN_ON(!mr->umem->is_odp))
1150 		return -EINVAL;
1151 
1152 	/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
1153 	 * so we need to align the offset and length accordingly
1154 	 */
1155 	if (idx & page_mask) {
1156 		npages += idx & page_mask;
1157 		idx &= ~page_mask;
1158 	}
1159 	pages_to_map = ALIGN(npages, page_align);
1160 
1161 	xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags);
1162 	if (!xlt)
1163 		return -ENOMEM;
1164 	pages_iter = sg.length / desc_size;
1165 	orig_sg_length = sg.length;
1166 
1167 	if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
1168 		struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
1169 		size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
1170 
1171 		pages_to_map = min_t(size_t, pages_to_map, max_pages);
1172 	}
1173 
1174 	wr.page_shift = page_shift;
1175 
1176 	for (pages_mapped = 0;
1177 	     pages_mapped < pages_to_map && !err;
1178 	     pages_mapped += pages_iter, idx += pages_iter) {
1179 		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
1180 		size_to_map = npages * desc_size;
1181 		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
1182 					DMA_TO_DEVICE);
1183 		mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
1184 		dma_sync_single_for_device(ddev, sg.addr, sg.length,
1185 					   DMA_TO_DEVICE);
1186 
1187 		sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
1188 
1189 		if (pages_mapped + pages_iter >= pages_to_map)
1190 			wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
1191 
1192 		wr.offset = idx * desc_size;
1193 		wr.xlt_size = sg.length;
1194 
1195 		err = mlx5_ib_post_send_wait(dev, &wr);
1196 	}
1197 	sg.length = orig_sg_length;
1198 	mlx5_ib_unmap_free_xlt(dev, xlt, &sg);
1199 	return err;
1200 }
1201 
1202 /*
1203  * Send the DMA list to the HW for a normal MR using UMR.
1204  */
1205 static int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
1206 {
1207 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1208 	struct device *ddev = &dev->mdev->pdev->dev;
1209 	struct ib_block_iter biter;
1210 	struct mlx5_mtt *cur_mtt;
1211 	struct mlx5_umr_wr wr;
1212 	size_t orig_sg_length;
1213 	struct mlx5_mtt *mtt;
1214 	size_t final_size;
1215 	struct ib_sge sg;
1216 	int err = 0;
1217 
1218 	if (WARN_ON(mr->umem->is_odp))
1219 		return -EINVAL;
1220 
1221 	mtt = mlx5_ib_create_xlt_wr(mr, &wr, &sg,
1222 				    ib_umem_num_dma_blocks(mr->umem,
1223 							   1 << mr->page_shift),
1224 				    sizeof(*mtt), flags);
1225 	if (!mtt)
1226 		return -ENOMEM;
1227 	orig_sg_length = sg.length;
1228 
1229 	cur_mtt = mtt;
1230 	rdma_for_each_block (mr->umem->sg_head.sgl, &biter, mr->umem->nmap,
1231 			     BIT(mr->page_shift)) {
1232 		if (cur_mtt == (void *)mtt + sg.length) {
1233 			dma_sync_single_for_device(ddev, sg.addr, sg.length,
1234 						   DMA_TO_DEVICE);
1235 			err = mlx5_ib_post_send_wait(dev, &wr);
1236 			if (err)
1237 				goto err;
1238 			dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
1239 						DMA_TO_DEVICE);
1240 			wr.offset += sg.length;
1241 			cur_mtt = mtt;
1242 		}
1243 
1244 		cur_mtt->ptag =
1245 			cpu_to_be64(rdma_block_iter_dma_address(&biter) |
1246 				    MLX5_IB_MTT_PRESENT);
1247 		cur_mtt++;
1248 	}
1249 
1250 	final_size = (void *)cur_mtt - (void *)mtt;
1251 	sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT);
1252 	memset(cur_mtt, 0, sg.length - final_size);
1253 	wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
1254 	wr.xlt_size = sg.length;
1255 
1256 	dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
1257 	err = mlx5_ib_post_send_wait(dev, &wr);
1258 
1259 err:
1260 	sg.length = orig_sg_length;
1261 	mlx5_ib_unmap_free_xlt(dev, mtt, &sg);
1262 	return err;
1263 }
1264 
1265 /*
1266  * If ibmr is NULL it will be allocated by reg_create.
1267  * Else, the given ibmr will be used.
1268  */
1269 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
1270 				     u64 iova, int access_flags,
1271 				     unsigned int page_size, bool populate)
1272 {
1273 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1274 	struct mlx5_ib_mr *mr;
1275 	__be64 *pas;
1276 	void *mkc;
1277 	int inlen;
1278 	u32 *in;
1279 	int err;
1280 	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
1281 
1282 	if (!page_size)
1283 		return ERR_PTR(-EINVAL);
1284 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1285 	if (!mr)
1286 		return ERR_PTR(-ENOMEM);
1287 
1288 	mr->ibmr.pd = pd;
1289 	mr->access_flags = access_flags;
1290 	mr->page_shift = order_base_2(page_size);
1291 
1292 	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1293 	if (populate)
1294 		inlen += sizeof(*pas) *
1295 			 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
1296 	in = kvzalloc(inlen, GFP_KERNEL);
1297 	if (!in) {
1298 		err = -ENOMEM;
1299 		goto err_1;
1300 	}
1301 	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1302 	if (populate) {
1303 		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
1304 			err = -EINVAL;
1305 			goto err_2;
1306 		}
1307 		mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
1308 				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1309 	}
1310 
1311 	/* The pg_access bit allows setting the access flags
1312 	 * in the page list submitted with the command. */
1313 	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1314 
1315 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1316 	set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
1317 				      populate ? pd : dev->umrc.pd);
1318 	MLX5_SET(mkc, mkc, free, !populate);
1319 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
1320 	MLX5_SET(mkc, mkc, umr_en, 1);
1321 
1322 	MLX5_SET64(mkc, mkc, len, umem->length);
1323 	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1324 	MLX5_SET(mkc, mkc, translations_octword_size,
1325 		 get_octo_len(iova, umem->length, mr->page_shift));
1326 	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
1327 	if (populate) {
1328 		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1329 			 get_octo_len(iova, umem->length, mr->page_shift));
1330 	}
1331 
1332 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1333 	if (err) {
1334 		mlx5_ib_warn(dev, "create mkey failed\n");
1335 		goto err_2;
1336 	}
1337 	mr->mmkey.type = MLX5_MKEY_MR;
1338 	mr->desc_size = sizeof(struct mlx5_mtt);
1339 	mr->umem = umem;
1340 	set_mr_fields(dev, mr, umem->length, access_flags);
1341 	kvfree(in);
1342 
1343 	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1344 
1345 	return mr;
1346 
1347 err_2:
1348 	kvfree(in);
1349 err_1:
1350 	kfree(mr);
1351 	return ERR_PTR(err);
1352 }
1353 
1354 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1355 				       u64 length, int acc, int mode)
1356 {
1357 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1358 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1359 	struct mlx5_ib_mr *mr;
1360 	void *mkc;
1361 	u32 *in;
1362 	int err;
1363 
1364 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1365 	if (!mr)
1366 		return ERR_PTR(-ENOMEM);
1367 
1368 	in = kzalloc(inlen, GFP_KERNEL);
1369 	if (!in) {
1370 		err = -ENOMEM;
1371 		goto err_free;
1372 	}
1373 
1374 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1375 
1376 	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1377 	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1378 	MLX5_SET64(mkc, mkc, len, length);
1379 	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1380 
1381 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1382 	if (err)
1383 		goto err_in;
1384 
1385 	kfree(in);
1386 
1387 	set_mr_fields(dev, mr, length, acc);
1388 
1389 	return &mr->ibmr;
1390 
1391 err_in:
1392 	kfree(in);
1393 
1394 err_free:
1395 	kfree(mr);
1396 
1397 	return ERR_PTR(err);
1398 }
1399 
1400 int mlx5_ib_advise_mr(struct ib_pd *pd,
1401 		      enum ib_uverbs_advise_mr_advice advice,
1402 		      u32 flags,
1403 		      struct ib_sge *sg_list,
1404 		      u32 num_sge,
1405 		      struct uverbs_attr_bundle *attrs)
1406 {
1407 	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1408 	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1409 	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1410 		return -EOPNOTSUPP;
1411 
1412 	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1413 					 sg_list, num_sge);
1414 }
1415 
1416 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1417 				struct ib_dm_mr_attr *attr,
1418 				struct uverbs_attr_bundle *attrs)
1419 {
1420 	struct mlx5_ib_dm *mdm = to_mdm(dm);
1421 	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1422 	u64 start_addr = mdm->dev_addr + attr->offset;
1423 	int mode;
1424 
1425 	switch (mdm->type) {
1426 	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1427 		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1428 			return ERR_PTR(-EINVAL);
1429 
1430 		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1431 		start_addr -= pci_resource_start(dev->pdev, 0);
1432 		break;
1433 	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1434 	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1435 		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1436 			return ERR_PTR(-EINVAL);
1437 
1438 		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1439 		break;
1440 	default:
1441 		return ERR_PTR(-EINVAL);
1442 	}
1443 
1444 	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1445 				 attr->access_flags, mode);
1446 }
1447 
1448 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
1449 				    u64 iova, int access_flags)
1450 {
1451 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1452 	struct mlx5_ib_mr *mr = NULL;
1453 	bool xlt_with_umr;
1454 	int err;
1455 
1456 	xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, umem->length);
1457 	if (xlt_with_umr) {
1458 		mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
1459 	} else {
1460 		unsigned int page_size = mlx5_umem_find_best_pgsz(
1461 			umem, mkc, log_page_size, 0, iova);
1462 
1463 		mutex_lock(&dev->slow_path_mutex);
1464 		mr = reg_create(pd, umem, iova, access_flags, page_size, true);
1465 		mutex_unlock(&dev->slow_path_mutex);
1466 	}
1467 	if (IS_ERR(mr)) {
1468 		ib_umem_release(umem);
1469 		return ERR_CAST(mr);
1470 	}
1471 
1472 	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1473 
1474 	atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1475 
1476 	if (xlt_with_umr) {
1477 		/*
1478 		 * If the MR was created with reg_create then it will be
1479 		 * configured properly but left disabled. It is safe to go ahead
1480 		 * and configure it again via UMR while enabling it.
1481 		 */
1482 		err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
1483 		if (err) {
1484 			dereg_mr(dev, mr);
1485 			return ERR_PTR(err);
1486 		}
1487 	}
1488 	return &mr->ibmr;
1489 }
1490 
1491 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
1492 					u64 iova, int access_flags,
1493 					struct ib_udata *udata)
1494 {
1495 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1496 	struct ib_umem_odp *odp;
1497 	struct mlx5_ib_mr *mr;
1498 	int err;
1499 
1500 	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1501 		return ERR_PTR(-EOPNOTSUPP);
1502 
1503 	if (!start && length == U64_MAX) {
1504 		if (iova != 0)
1505 			return ERR_PTR(-EINVAL);
1506 		if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1507 			return ERR_PTR(-EINVAL);
1508 
1509 		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), udata, access_flags);
1510 		if (IS_ERR(mr))
1511 			return ERR_CAST(mr);
1512 		return &mr->ibmr;
1513 	}
1514 
1515 	/* ODP requires xlt update via umr to work. */
1516 	if (!mlx5_ib_can_load_pas_with_umr(dev, length))
1517 		return ERR_PTR(-EINVAL);
1518 
1519 	odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
1520 			      &mlx5_mn_ops);
1521 	if (IS_ERR(odp))
1522 		return ERR_CAST(odp);
1523 
1524 	mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
1525 	if (IS_ERR(mr)) {
1526 		ib_umem_release(&odp->umem);
1527 		return ERR_CAST(mr);
1528 	}
1529 
1530 	odp->private = mr;
1531 	init_waitqueue_head(&mr->q_deferred_work);
1532 	atomic_set(&mr->num_deferred_work, 0);
1533 	err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
1534 			      &mr->mmkey, GFP_KERNEL));
1535 	if (err)
1536 		goto err_dereg_mr;
1537 
1538 	err = mlx5_ib_init_odp_mr(mr);
1539 	if (err)
1540 		goto err_dereg_mr;
1541 	return &mr->ibmr;
1542 
1543 err_dereg_mr:
1544 	dereg_mr(dev, mr);
1545 	return ERR_PTR(err);
1546 }
1547 
1548 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1549 				  u64 iova, int access_flags,
1550 				  struct ib_udata *udata)
1551 {
1552 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1553 	struct ib_umem *umem;
1554 
1555 	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1556 		return ERR_PTR(-EOPNOTSUPP);
1557 
1558 	mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1559 		    start, iova, length, access_flags);
1560 
1561 	if (access_flags & IB_ACCESS_ON_DEMAND)
1562 		return create_user_odp_mr(pd, start, length, iova, access_flags,
1563 					  udata);
1564 	umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
1565 	if (IS_ERR(umem))
1566 		return ERR_CAST(umem);
1567 	return create_real_mr(pd, umem, iova, access_flags);
1568 }
1569 
1570 /**
1571  * mlx5_mr_cache_invalidate - Fence all DMA on the MR
1572  * @mr: The MR to fence
1573  *
1574  * Upon return the NIC will not be doing any DMA to the pages under the MR,
1575  * and any DMA inprogress will be completed. Failure of this function
1576  * indicates the HW has failed catastrophically.
1577  */
1578 int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr)
1579 {
1580 	struct mlx5_umr_wr umrwr = {};
1581 
1582 	if (mr_to_mdev(mr)->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
1583 		return 0;
1584 
1585 	umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
1586 			      MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1587 	umrwr.wr.opcode = MLX5_IB_WR_UMR;
1588 	umrwr.pd = mr_to_mdev(mr)->umrc.pd;
1589 	umrwr.mkey = mr->mmkey.key;
1590 	umrwr.ignore_free_state = 1;
1591 
1592 	return mlx5_ib_post_send_wait(mr_to_mdev(mr), &umrwr);
1593 }
1594 
1595 /*
1596  * True if the change in access flags can be done via UMR, only some access
1597  * flags can be updated.
1598  */
1599 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
1600 				     unsigned int current_access_flags,
1601 				     unsigned int target_access_flags)
1602 {
1603 	unsigned int diffs = current_access_flags ^ target_access_flags;
1604 
1605 	if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
1606 		      IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING))
1607 		return false;
1608 	return mlx5_ib_can_reconfig_with_umr(dev, current_access_flags,
1609 					     target_access_flags);
1610 }
1611 
1612 static int umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1613 			       int access_flags)
1614 {
1615 	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1616 	struct mlx5_umr_wr umrwr = {
1617 		.wr = {
1618 			.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
1619 				      MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS,
1620 			.opcode = MLX5_IB_WR_UMR,
1621 		},
1622 		.mkey = mr->mmkey.key,
1623 		.pd = pd,
1624 		.access_flags = access_flags,
1625 	};
1626 	int err;
1627 
1628 	err = mlx5_ib_post_send_wait(dev, &umrwr);
1629 	if (err)
1630 		return err;
1631 
1632 	mr->access_flags = access_flags;
1633 	mr->mmkey.pd = to_mpd(pd)->pdn;
1634 	return 0;
1635 }
1636 
1637 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
1638 				  struct ib_umem *new_umem,
1639 				  int new_access_flags, u64 iova,
1640 				  unsigned long *page_size)
1641 {
1642 	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1643 
1644 	/* We only track the allocated sizes of MRs from the cache */
1645 	if (!mr->cache_ent)
1646 		return false;
1647 	if (!mlx5_ib_can_load_pas_with_umr(dev, new_umem->length))
1648 		return false;
1649 
1650 	*page_size =
1651 		mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
1652 	if (WARN_ON(!*page_size))
1653 		return false;
1654 	return (1ULL << mr->cache_ent->order) >=
1655 	       ib_umem_num_dma_blocks(new_umem, *page_size);
1656 }
1657 
1658 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1659 			 int access_flags, int flags, struct ib_umem *new_umem,
1660 			 u64 iova, unsigned long page_size)
1661 {
1662 	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1663 	int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
1664 	struct ib_umem *old_umem = mr->umem;
1665 	int err;
1666 
1667 	/*
1668 	 * To keep everything simple the MR is revoked before we start to mess
1669 	 * with it. This ensure the change is atomic relative to any use of the
1670 	 * MR.
1671 	 */
1672 	err = mlx5_mr_cache_invalidate(mr);
1673 	if (err)
1674 		return err;
1675 
1676 	if (flags & IB_MR_REREG_PD) {
1677 		mr->ibmr.pd = pd;
1678 		mr->mmkey.pd = to_mpd(pd)->pdn;
1679 		upd_flags |= MLX5_IB_UPD_XLT_PD;
1680 	}
1681 	if (flags & IB_MR_REREG_ACCESS) {
1682 		mr->access_flags = access_flags;
1683 		upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1684 	}
1685 
1686 	mr->ibmr.length = new_umem->length;
1687 	mr->mmkey.iova = iova;
1688 	mr->mmkey.size = new_umem->length;
1689 	mr->page_shift = order_base_2(page_size);
1690 	mr->umem = new_umem;
1691 	err = mlx5_ib_update_mr_pas(mr, upd_flags);
1692 	if (err) {
1693 		/*
1694 		 * The MR is revoked at this point so there is no issue to free
1695 		 * new_umem.
1696 		 */
1697 		mr->umem = old_umem;
1698 		return err;
1699 	}
1700 
1701 	atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
1702 	ib_umem_release(old_umem);
1703 	atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
1704 	return 0;
1705 }
1706 
1707 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1708 				    u64 length, u64 iova, int new_access_flags,
1709 				    struct ib_pd *new_pd,
1710 				    struct ib_udata *udata)
1711 {
1712 	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1713 	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1714 	int err;
1715 
1716 	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1717 		return ERR_PTR(-EOPNOTSUPP);
1718 
1719 	mlx5_ib_dbg(
1720 		dev,
1721 		"start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1722 		start, iova, length, new_access_flags);
1723 
1724 	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
1725 		return ERR_PTR(-EOPNOTSUPP);
1726 
1727 	if (!(flags & IB_MR_REREG_ACCESS))
1728 		new_access_flags = mr->access_flags;
1729 	if (!(flags & IB_MR_REREG_PD))
1730 		new_pd = ib_mr->pd;
1731 
1732 	if (!(flags & IB_MR_REREG_TRANS)) {
1733 		struct ib_umem *umem;
1734 
1735 		/* Fast path for PD/access change */
1736 		if (can_use_umr_rereg_access(dev, mr->access_flags,
1737 					     new_access_flags)) {
1738 			err = umr_rereg_pd_access(mr, new_pd, new_access_flags);
1739 			if (err)
1740 				return ERR_PTR(err);
1741 			return NULL;
1742 		}
1743 		/* DM or ODP MR's don't have a umem so we can't re-use it */
1744 		if (!mr->umem || is_odp_mr(mr))
1745 			goto recreate;
1746 
1747 		/*
1748 		 * Only one active MR can refer to a umem at one time, revoke
1749 		 * the old MR before assigning the umem to the new one.
1750 		 */
1751 		err = mlx5_mr_cache_invalidate(mr);
1752 		if (err)
1753 			return ERR_PTR(err);
1754 		umem = mr->umem;
1755 		mr->umem = NULL;
1756 		atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1757 
1758 		return create_real_mr(new_pd, umem, mr->mmkey.iova,
1759 				      new_access_flags);
1760 	}
1761 
1762 	/*
1763 	 * DM doesn't have a PAS list so we can't re-use it, odp does but the
1764 	 * logic around releasing the umem is different
1765 	 */
1766 	if (!mr->umem || is_odp_mr(mr))
1767 		goto recreate;
1768 
1769 	if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
1770 	    can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
1771 		struct ib_umem *new_umem;
1772 		unsigned long page_size;
1773 
1774 		new_umem = ib_umem_get(&dev->ib_dev, start, length,
1775 				       new_access_flags);
1776 		if (IS_ERR(new_umem))
1777 			return ERR_CAST(new_umem);
1778 
1779 		/* Fast path for PAS change */
1780 		if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
1781 					  &page_size)) {
1782 			err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
1783 					    new_umem, iova, page_size);
1784 			if (err) {
1785 				ib_umem_release(new_umem);
1786 				return ERR_PTR(err);
1787 			}
1788 			return NULL;
1789 		}
1790 		return create_real_mr(new_pd, new_umem, iova, new_access_flags);
1791 	}
1792 
1793 	/*
1794 	 * Everything else has no state we can preserve, just create a new MR
1795 	 * from scratch
1796 	 */
1797 recreate:
1798 	return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
1799 				   new_access_flags, udata);
1800 }
1801 
1802 static int
1803 mlx5_alloc_priv_descs(struct ib_device *device,
1804 		      struct mlx5_ib_mr *mr,
1805 		      int ndescs,
1806 		      int desc_size)
1807 {
1808 	struct mlx5_ib_dev *dev = to_mdev(device);
1809 	struct device *ddev = &dev->mdev->pdev->dev;
1810 	int size = ndescs * desc_size;
1811 	int add_size;
1812 	int ret;
1813 
1814 	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1815 
1816 	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1817 	if (!mr->descs_alloc)
1818 		return -ENOMEM;
1819 
1820 	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1821 
1822 	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
1823 	if (dma_mapping_error(ddev, mr->desc_map)) {
1824 		ret = -ENOMEM;
1825 		goto err;
1826 	}
1827 
1828 	return 0;
1829 err:
1830 	kfree(mr->descs_alloc);
1831 
1832 	return ret;
1833 }
1834 
1835 static void
1836 mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1837 {
1838 	if (mr->descs) {
1839 		struct ib_device *device = mr->ibmr.device;
1840 		int size = mr->max_descs * mr->desc_size;
1841 		struct mlx5_ib_dev *dev = to_mdev(device);
1842 
1843 		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
1844 				 DMA_TO_DEVICE);
1845 		kfree(mr->descs_alloc);
1846 		mr->descs = NULL;
1847 	}
1848 }
1849 
1850 static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1851 {
1852 	if (mr->sig) {
1853 		if (mlx5_core_destroy_psv(dev->mdev,
1854 					  mr->sig->psv_memory.psv_idx))
1855 			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1856 				     mr->sig->psv_memory.psv_idx);
1857 		if (mlx5_core_destroy_psv(dev->mdev,
1858 					  mr->sig->psv_wire.psv_idx))
1859 			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1860 				     mr->sig->psv_wire.psv_idx);
1861 		xa_erase(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key));
1862 		kfree(mr->sig);
1863 		mr->sig = NULL;
1864 	}
1865 
1866 	if (!mr->cache_ent) {
1867 		destroy_mkey(dev, mr);
1868 		mlx5_free_priv_descs(mr);
1869 	}
1870 }
1871 
1872 static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1873 {
1874 	struct ib_umem *umem = mr->umem;
1875 
1876 	/* Stop all DMA */
1877 	if (is_odp_mr(mr))
1878 		mlx5_ib_fence_odp_mr(mr);
1879 	else
1880 		clean_mr(dev, mr);
1881 
1882 	if (umem) {
1883 		if (!is_odp_mr(mr))
1884 			atomic_sub(ib_umem_num_pages(umem),
1885 				   &dev->mdev->priv.reg_pages);
1886 		ib_umem_release(umem);
1887 	}
1888 
1889 	if (mr->cache_ent)
1890 		mlx5_mr_cache_free(dev, mr);
1891 	else
1892 		kfree(mr);
1893 }
1894 
1895 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
1896 {
1897 	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
1898 
1899 	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
1900 		dereg_mr(to_mdev(mmr->mtt_mr->ibmr.device), mmr->mtt_mr);
1901 		dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr);
1902 	}
1903 
1904 	if (is_odp_mr(mmr) && to_ib_umem_odp(mmr->umem)->is_implicit_odp) {
1905 		mlx5_ib_free_implicit_mr(mmr);
1906 		return 0;
1907 	}
1908 
1909 	dereg_mr(to_mdev(ibmr->device), mmr);
1910 
1911 	return 0;
1912 }
1913 
1914 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
1915 				   int access_mode, int page_shift)
1916 {
1917 	void *mkc;
1918 
1919 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1920 
1921 	/* This is only used from the kernel, so setting the PD is OK. */
1922 	set_mkc_access_pd_addr_fields(mkc, 0, 0, pd);
1923 	MLX5_SET(mkc, mkc, free, 1);
1924 	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1925 	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
1926 	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
1927 	MLX5_SET(mkc, mkc, umr_en, 1);
1928 	MLX5_SET(mkc, mkc, log_page_size, page_shift);
1929 }
1930 
1931 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1932 				  int ndescs, int desc_size, int page_shift,
1933 				  int access_mode, u32 *in, int inlen)
1934 {
1935 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1936 	int err;
1937 
1938 	mr->access_mode = access_mode;
1939 	mr->desc_size = desc_size;
1940 	mr->max_descs = ndescs;
1941 
1942 	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
1943 	if (err)
1944 		return err;
1945 
1946 	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
1947 
1948 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1949 	if (err)
1950 		goto err_free_descs;
1951 
1952 	mr->mmkey.type = MLX5_MKEY_MR;
1953 	mr->ibmr.lkey = mr->mmkey.key;
1954 	mr->ibmr.rkey = mr->mmkey.key;
1955 
1956 	return 0;
1957 
1958 err_free_descs:
1959 	mlx5_free_priv_descs(mr);
1960 	return err;
1961 }
1962 
1963 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
1964 				u32 max_num_sg, u32 max_num_meta_sg,
1965 				int desc_size, int access_mode)
1966 {
1967 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1968 	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
1969 	int page_shift = 0;
1970 	struct mlx5_ib_mr *mr;
1971 	u32 *in;
1972 	int err;
1973 
1974 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1975 	if (!mr)
1976 		return ERR_PTR(-ENOMEM);
1977 
1978 	mr->ibmr.pd = pd;
1979 	mr->ibmr.device = pd->device;
1980 
1981 	in = kzalloc(inlen, GFP_KERNEL);
1982 	if (!in) {
1983 		err = -ENOMEM;
1984 		goto err_free;
1985 	}
1986 
1987 	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
1988 		page_shift = PAGE_SHIFT;
1989 
1990 	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
1991 				     access_mode, in, inlen);
1992 	if (err)
1993 		goto err_free_in;
1994 
1995 	mr->umem = NULL;
1996 	kfree(in);
1997 
1998 	return mr;
1999 
2000 err_free_in:
2001 	kfree(in);
2002 err_free:
2003 	kfree(mr);
2004 	return ERR_PTR(err);
2005 }
2006 
2007 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2008 				    int ndescs, u32 *in, int inlen)
2009 {
2010 	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
2011 				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
2012 				      inlen);
2013 }
2014 
2015 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2016 				    int ndescs, u32 *in, int inlen)
2017 {
2018 	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
2019 				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2020 }
2021 
2022 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2023 				      int max_num_sg, int max_num_meta_sg,
2024 				      u32 *in, int inlen)
2025 {
2026 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
2027 	u32 psv_index[2];
2028 	void *mkc;
2029 	int err;
2030 
2031 	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
2032 	if (!mr->sig)
2033 		return -ENOMEM;
2034 
2035 	/* create mem & wire PSVs */
2036 	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
2037 	if (err)
2038 		goto err_free_sig;
2039 
2040 	mr->sig->psv_memory.psv_idx = psv_index[0];
2041 	mr->sig->psv_wire.psv_idx = psv_index[1];
2042 
2043 	mr->sig->sig_status_checked = true;
2044 	mr->sig->sig_err_exists = false;
2045 	/* Next UMR, Arm SIGERR */
2046 	++mr->sig->sigerr_count;
2047 	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2048 					 sizeof(struct mlx5_klm),
2049 					 MLX5_MKC_ACCESS_MODE_KLMS);
2050 	if (IS_ERR(mr->klm_mr)) {
2051 		err = PTR_ERR(mr->klm_mr);
2052 		goto err_destroy_psv;
2053 	}
2054 	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2055 					 sizeof(struct mlx5_mtt),
2056 					 MLX5_MKC_ACCESS_MODE_MTT);
2057 	if (IS_ERR(mr->mtt_mr)) {
2058 		err = PTR_ERR(mr->mtt_mr);
2059 		goto err_free_klm_mr;
2060 	}
2061 
2062 	/* Set bsf descriptors for mkey */
2063 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2064 	MLX5_SET(mkc, mkc, bsf_en, 1);
2065 	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
2066 
2067 	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
2068 				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2069 	if (err)
2070 		goto err_free_mtt_mr;
2071 
2072 	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2073 			      mr->sig, GFP_KERNEL));
2074 	if (err)
2075 		goto err_free_descs;
2076 	return 0;
2077 
2078 err_free_descs:
2079 	destroy_mkey(dev, mr);
2080 	mlx5_free_priv_descs(mr);
2081 err_free_mtt_mr:
2082 	dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr);
2083 	mr->mtt_mr = NULL;
2084 err_free_klm_mr:
2085 	dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr);
2086 	mr->klm_mr = NULL;
2087 err_destroy_psv:
2088 	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
2089 		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2090 			     mr->sig->psv_memory.psv_idx);
2091 	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2092 		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2093 			     mr->sig->psv_wire.psv_idx);
2094 err_free_sig:
2095 	kfree(mr->sig);
2096 
2097 	return err;
2098 }
2099 
2100 static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
2101 					enum ib_mr_type mr_type, u32 max_num_sg,
2102 					u32 max_num_meta_sg)
2103 {
2104 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
2105 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2106 	int ndescs = ALIGN(max_num_sg, 4);
2107 	struct mlx5_ib_mr *mr;
2108 	u32 *in;
2109 	int err;
2110 
2111 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2112 	if (!mr)
2113 		return ERR_PTR(-ENOMEM);
2114 
2115 	in = kzalloc(inlen, GFP_KERNEL);
2116 	if (!in) {
2117 		err = -ENOMEM;
2118 		goto err_free;
2119 	}
2120 
2121 	mr->ibmr.device = pd->device;
2122 	mr->umem = NULL;
2123 
2124 	switch (mr_type) {
2125 	case IB_MR_TYPE_MEM_REG:
2126 		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
2127 		break;
2128 	case IB_MR_TYPE_SG_GAPS:
2129 		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
2130 		break;
2131 	case IB_MR_TYPE_INTEGRITY:
2132 		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
2133 						 max_num_meta_sg, in, inlen);
2134 		break;
2135 	default:
2136 		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
2137 		err = -EINVAL;
2138 	}
2139 
2140 	if (err)
2141 		goto err_free_in;
2142 
2143 	kfree(in);
2144 
2145 	return &mr->ibmr;
2146 
2147 err_free_in:
2148 	kfree(in);
2149 err_free:
2150 	kfree(mr);
2151 	return ERR_PTR(err);
2152 }
2153 
2154 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
2155 			       u32 max_num_sg)
2156 {
2157 	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
2158 }
2159 
2160 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
2161 					 u32 max_num_sg, u32 max_num_meta_sg)
2162 {
2163 	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
2164 				  max_num_meta_sg);
2165 }
2166 
2167 int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
2168 {
2169 	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
2170 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2171 	struct mlx5_ib_mw *mw = to_mmw(ibmw);
2172 	u32 *in = NULL;
2173 	void *mkc;
2174 	int ndescs;
2175 	int err;
2176 	struct mlx5_ib_alloc_mw req = {};
2177 	struct {
2178 		__u32	comp_mask;
2179 		__u32	response_length;
2180 	} resp = {};
2181 
2182 	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
2183 	if (err)
2184 		return err;
2185 
2186 	if (req.comp_mask || req.reserved1 || req.reserved2)
2187 		return -EOPNOTSUPP;
2188 
2189 	if (udata->inlen > sizeof(req) &&
2190 	    !ib_is_udata_cleared(udata, sizeof(req),
2191 				 udata->inlen - sizeof(req)))
2192 		return -EOPNOTSUPP;
2193 
2194 	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
2195 
2196 	in = kzalloc(inlen, GFP_KERNEL);
2197 	if (!in) {
2198 		err = -ENOMEM;
2199 		goto free;
2200 	}
2201 
2202 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2203 
2204 	MLX5_SET(mkc, mkc, free, 1);
2205 	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2206 	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
2207 	MLX5_SET(mkc, mkc, umr_en, 1);
2208 	MLX5_SET(mkc, mkc, lr, 1);
2209 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
2210 	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
2211 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
2212 
2213 	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
2214 	if (err)
2215 		goto free;
2216 
2217 	mw->mmkey.type = MLX5_MKEY_MW;
2218 	ibmw->rkey = mw->mmkey.key;
2219 	mw->ndescs = ndescs;
2220 
2221 	resp.response_length =
2222 		min(offsetofend(typeof(resp), response_length), udata->outlen);
2223 	if (resp.response_length) {
2224 		err = ib_copy_to_udata(udata, &resp, resp.response_length);
2225 		if (err)
2226 			goto free_mkey;
2227 	}
2228 
2229 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2230 		err = xa_err(xa_store(&dev->odp_mkeys,
2231 				      mlx5_base_mkey(mw->mmkey.key), &mw->mmkey,
2232 				      GFP_KERNEL));
2233 		if (err)
2234 			goto free_mkey;
2235 	}
2236 
2237 	kfree(in);
2238 	return 0;
2239 
2240 free_mkey:
2241 	mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
2242 free:
2243 	kfree(in);
2244 	return err;
2245 }
2246 
2247 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
2248 {
2249 	struct mlx5_ib_dev *dev = to_mdev(mw->device);
2250 	struct mlx5_ib_mw *mmw = to_mmw(mw);
2251 
2252 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2253 		xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key));
2254 		/*
2255 		 * pagefault_single_data_segment() may be accessing mmw under
2256 		 * SRCU if the user bound an ODP MR to this MW.
2257 		 */
2258 		synchronize_srcu(&dev->odp_srcu);
2259 	}
2260 
2261 	return mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
2262 }
2263 
2264 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
2265 			    struct ib_mr_status *mr_status)
2266 {
2267 	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
2268 	int ret = 0;
2269 
2270 	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
2271 		pr_err("Invalid status check mask\n");
2272 		ret = -EINVAL;
2273 		goto done;
2274 	}
2275 
2276 	mr_status->fail_status = 0;
2277 	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
2278 		if (!mmr->sig) {
2279 			ret = -EINVAL;
2280 			pr_err("signature status check requested on a non-signature enabled MR\n");
2281 			goto done;
2282 		}
2283 
2284 		mmr->sig->sig_status_checked = true;
2285 		if (!mmr->sig->sig_err_exists)
2286 			goto done;
2287 
2288 		if (ibmr->lkey == mmr->sig->err_item.key)
2289 			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
2290 			       sizeof(mr_status->sig_err));
2291 		else {
2292 			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
2293 			mr_status->sig_err.sig_err_offset = 0;
2294 			mr_status->sig_err.key = mmr->sig->err_item.key;
2295 		}
2296 
2297 		mmr->sig->sig_err_exists = false;
2298 		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2299 	}
2300 
2301 done:
2302 	return ret;
2303 }
2304 
2305 static int
2306 mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2307 			int data_sg_nents, unsigned int *data_sg_offset,
2308 			struct scatterlist *meta_sg, int meta_sg_nents,
2309 			unsigned int *meta_sg_offset)
2310 {
2311 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2312 	unsigned int sg_offset = 0;
2313 	int n = 0;
2314 
2315 	mr->meta_length = 0;
2316 	if (data_sg_nents == 1) {
2317 		n++;
2318 		mr->ndescs = 1;
2319 		if (data_sg_offset)
2320 			sg_offset = *data_sg_offset;
2321 		mr->data_length = sg_dma_len(data_sg) - sg_offset;
2322 		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2323 		if (meta_sg_nents == 1) {
2324 			n++;
2325 			mr->meta_ndescs = 1;
2326 			if (meta_sg_offset)
2327 				sg_offset = *meta_sg_offset;
2328 			else
2329 				sg_offset = 0;
2330 			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2331 			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2332 		}
2333 		ibmr->length = mr->data_length + mr->meta_length;
2334 	}
2335 
2336 	return n;
2337 }
2338 
2339 static int
2340 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2341 		   struct scatterlist *sgl,
2342 		   unsigned short sg_nents,
2343 		   unsigned int *sg_offset_p,
2344 		   struct scatterlist *meta_sgl,
2345 		   unsigned short meta_sg_nents,
2346 		   unsigned int *meta_sg_offset_p)
2347 {
2348 	struct scatterlist *sg = sgl;
2349 	struct mlx5_klm *klms = mr->descs;
2350 	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2351 	u32 lkey = mr->ibmr.pd->local_dma_lkey;
2352 	int i, j = 0;
2353 
2354 	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2355 	mr->ibmr.length = 0;
2356 
2357 	for_each_sg(sgl, sg, sg_nents, i) {
2358 		if (unlikely(i >= mr->max_descs))
2359 			break;
2360 		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2361 		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2362 		klms[i].key = cpu_to_be32(lkey);
2363 		mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2364 
2365 		sg_offset = 0;
2366 	}
2367 
2368 	if (sg_offset_p)
2369 		*sg_offset_p = sg_offset;
2370 
2371 	mr->ndescs = i;
2372 	mr->data_length = mr->ibmr.length;
2373 
2374 	if (meta_sg_nents) {
2375 		sg = meta_sgl;
2376 		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2377 		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2378 			if (unlikely(i + j >= mr->max_descs))
2379 				break;
2380 			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2381 						     sg_offset);
2382 			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2383 							 sg_offset);
2384 			klms[i + j].key = cpu_to_be32(lkey);
2385 			mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2386 
2387 			sg_offset = 0;
2388 		}
2389 		if (meta_sg_offset_p)
2390 			*meta_sg_offset_p = sg_offset;
2391 
2392 		mr->meta_ndescs = j;
2393 		mr->meta_length = mr->ibmr.length - mr->data_length;
2394 	}
2395 
2396 	return i + j;
2397 }
2398 
2399 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2400 {
2401 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2402 	__be64 *descs;
2403 
2404 	if (unlikely(mr->ndescs == mr->max_descs))
2405 		return -ENOMEM;
2406 
2407 	descs = mr->descs;
2408 	descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2409 
2410 	return 0;
2411 }
2412 
2413 static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2414 {
2415 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2416 	__be64 *descs;
2417 
2418 	if (unlikely(mr->ndescs + mr->meta_ndescs == mr->max_descs))
2419 		return -ENOMEM;
2420 
2421 	descs = mr->descs;
2422 	descs[mr->ndescs + mr->meta_ndescs++] =
2423 		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2424 
2425 	return 0;
2426 }
2427 
2428 static int
2429 mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2430 			 int data_sg_nents, unsigned int *data_sg_offset,
2431 			 struct scatterlist *meta_sg, int meta_sg_nents,
2432 			 unsigned int *meta_sg_offset)
2433 {
2434 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2435 	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2436 	int n;
2437 
2438 	pi_mr->ndescs = 0;
2439 	pi_mr->meta_ndescs = 0;
2440 	pi_mr->meta_length = 0;
2441 
2442 	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2443 				   pi_mr->desc_size * pi_mr->max_descs,
2444 				   DMA_TO_DEVICE);
2445 
2446 	pi_mr->ibmr.page_size = ibmr->page_size;
2447 	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2448 			   mlx5_set_page);
2449 	if (n != data_sg_nents)
2450 		return n;
2451 
2452 	pi_mr->data_iova = pi_mr->ibmr.iova;
2453 	pi_mr->data_length = pi_mr->ibmr.length;
2454 	pi_mr->ibmr.length = pi_mr->data_length;
2455 	ibmr->length = pi_mr->data_length;
2456 
2457 	if (meta_sg_nents) {
2458 		u64 page_mask = ~((u64)ibmr->page_size - 1);
2459 		u64 iova = pi_mr->data_iova;
2460 
2461 		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2462 				    meta_sg_offset, mlx5_set_page_pi);
2463 
2464 		pi_mr->meta_length = pi_mr->ibmr.length;
2465 		/*
2466 		 * PI address for the HW is the offset of the metadata address
2467 		 * relative to the first data page address.
2468 		 * It equals to first data page address + size of data pages +
2469 		 * metadata offset at the first metadata page
2470 		 */
2471 		pi_mr->pi_iova = (iova & page_mask) +
2472 				 pi_mr->ndescs * ibmr->page_size +
2473 				 (pi_mr->ibmr.iova & ~page_mask);
2474 		/*
2475 		 * In order to use one MTT MR for data and metadata, we register
2476 		 * also the gaps between the end of the data and the start of
2477 		 * the metadata (the sig MR will verify that the HW will access
2478 		 * to right addresses). This mapping is safe because we use
2479 		 * internal mkey for the registration.
2480 		 */
2481 		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2482 		pi_mr->ibmr.iova = iova;
2483 		ibmr->length += pi_mr->meta_length;
2484 	}
2485 
2486 	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2487 				      pi_mr->desc_size * pi_mr->max_descs,
2488 				      DMA_TO_DEVICE);
2489 
2490 	return n;
2491 }
2492 
2493 static int
2494 mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2495 			 int data_sg_nents, unsigned int *data_sg_offset,
2496 			 struct scatterlist *meta_sg, int meta_sg_nents,
2497 			 unsigned int *meta_sg_offset)
2498 {
2499 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2500 	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2501 	int n;
2502 
2503 	pi_mr->ndescs = 0;
2504 	pi_mr->meta_ndescs = 0;
2505 	pi_mr->meta_length = 0;
2506 
2507 	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2508 				   pi_mr->desc_size * pi_mr->max_descs,
2509 				   DMA_TO_DEVICE);
2510 
2511 	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2512 			       meta_sg, meta_sg_nents, meta_sg_offset);
2513 
2514 	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2515 				      pi_mr->desc_size * pi_mr->max_descs,
2516 				      DMA_TO_DEVICE);
2517 
2518 	/* This is zero-based memory region */
2519 	pi_mr->data_iova = 0;
2520 	pi_mr->ibmr.iova = 0;
2521 	pi_mr->pi_iova = pi_mr->data_length;
2522 	ibmr->length = pi_mr->ibmr.length;
2523 
2524 	return n;
2525 }
2526 
2527 int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2528 			 int data_sg_nents, unsigned int *data_sg_offset,
2529 			 struct scatterlist *meta_sg, int meta_sg_nents,
2530 			 unsigned int *meta_sg_offset)
2531 {
2532 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2533 	struct mlx5_ib_mr *pi_mr = NULL;
2534 	int n;
2535 
2536 	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2537 
2538 	mr->ndescs = 0;
2539 	mr->data_length = 0;
2540 	mr->data_iova = 0;
2541 	mr->meta_ndescs = 0;
2542 	mr->pi_iova = 0;
2543 	/*
2544 	 * As a performance optimization, if possible, there is no need to
2545 	 * perform UMR operation to register the data/metadata buffers.
2546 	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
2547 	 * Fallback to UMR only in case of a failure.
2548 	 */
2549 	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2550 				    data_sg_offset, meta_sg, meta_sg_nents,
2551 				    meta_sg_offset);
2552 	if (n == data_sg_nents + meta_sg_nents)
2553 		goto out;
2554 	/*
2555 	 * As a performance optimization, if possible, there is no need to map
2556 	 * the sg lists to KLM descriptors. First try to map the sg lists to MTT
2557 	 * descriptors and fallback to KLM only in case of a failure.
2558 	 * It's more efficient for the HW to work with MTT descriptors
2559 	 * (especially in high load).
2560 	 * Use KLM (indirect access) only if it's mandatory.
2561 	 */
2562 	pi_mr = mr->mtt_mr;
2563 	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2564 				     data_sg_offset, meta_sg, meta_sg_nents,
2565 				     meta_sg_offset);
2566 	if (n == data_sg_nents + meta_sg_nents)
2567 		goto out;
2568 
2569 	pi_mr = mr->klm_mr;
2570 	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2571 				     data_sg_offset, meta_sg, meta_sg_nents,
2572 				     meta_sg_offset);
2573 	if (unlikely(n != data_sg_nents + meta_sg_nents))
2574 		return -ENOMEM;
2575 
2576 out:
2577 	/* This is zero-based memory region */
2578 	ibmr->iova = 0;
2579 	mr->pi_mr = pi_mr;
2580 	if (pi_mr)
2581 		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2582 	else
2583 		ibmr->sig_attrs->meta_length = mr->meta_length;
2584 
2585 	return 0;
2586 }
2587 
2588 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2589 		      unsigned int *sg_offset)
2590 {
2591 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2592 	int n;
2593 
2594 	mr->ndescs = 0;
2595 
2596 	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2597 				   mr->desc_size * mr->max_descs,
2598 				   DMA_TO_DEVICE);
2599 
2600 	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2601 		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2602 				       NULL);
2603 	else
2604 		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2605 				mlx5_set_page);
2606 
2607 	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2608 				      mr->desc_size * mr->max_descs,
2609 				      DMA_TO_DEVICE);
2610 
2611 	return n;
2612 }
2613