xref: /linux/drivers/infiniband/hw/mlx5/umr.c (revision 260f6f4fda93c8485c8037865c941b42b9cba5d2)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. */
3 
4 #include <rdma/ib_umem_odp.h>
5 #include "mlx5_ib.h"
6 #include "umr.h"
7 #include "wr.h"
8 
9 /*
10  * We can't use an array for xlt_emergency_page because dma_map_single()
11  * doesn't work on kernel module memory.
12  */
13 void *xlt_emergency_page;
14 static DEFINE_MUTEX(xlt_emergency_page_mutex);
15 
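/*
 * The helpers below build the big-endian mkey_mask for a UMR control
 * segment. Each bit selects an mkey field that the WQE is allowed to
 * modify; callers OR the masks together, e.g. (illustrative only):
 *
 *	wqe.ctrl_seg.mkey_mask = get_umr_update_access_mask(dev) |
 *				 get_umr_update_pd_mask();
 */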
16 static __be64 get_umr_enable_mr_mask(void)
17 {
18 	u64 result;
19 
20 	result = MLX5_MKEY_MASK_KEY |
21 		 MLX5_MKEY_MASK_FREE;
22 
23 	return cpu_to_be64(result);
24 }
25 
26 static __be64 get_umr_disable_mr_mask(void)
27 {
28 	u64 result;
29 
30 	result = MLX5_MKEY_MASK_FREE;
31 
32 	return cpu_to_be64(result);
33 }
34 
35 static __be64 get_umr_update_translation_mask(struct mlx5_ib_dev *dev)
36 {
37 	u64 result;
38 
39 	result = MLX5_MKEY_MASK_LEN |
40 		 MLX5_MKEY_MASK_PAGE_SIZE |
41 		 MLX5_MKEY_MASK_START_ADDR;
42 	if (MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5))
43 		result |= MLX5_MKEY_MASK_PAGE_SIZE_5;
44 
45 	return cpu_to_be64(result);
46 }
47 
48 static __be64 get_umr_update_access_mask(struct mlx5_ib_dev *dev)
49 {
50 	u64 result;
51 
52 	result = MLX5_MKEY_MASK_LR |
53 		 MLX5_MKEY_MASK_LW |
54 		 MLX5_MKEY_MASK_RR |
55 		 MLX5_MKEY_MASK_RW;
56 
57 	if (MLX5_CAP_GEN(dev->mdev, atomic))
58 		result |= MLX5_MKEY_MASK_A;
59 
60 	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
61 		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE;
62 
63 	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
64 		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_READ;
65 
66 	return cpu_to_be64(result);
67 }
68 
69 static __be64 get_umr_update_pd_mask(void)
70 {
71 	u64 result;
72 
73 	result = MLX5_MKEY_MASK_PD;
74 
75 	return cpu_to_be64(result);
76 }
77 
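/*
 * Reject mkey_mask bits that the device capabilities do not allow a UMR
 * WQE to modify; callers treat a failure here as a driver bug (WARN_ON).
 */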
78 static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask)
79 {
80 	if (mask & MLX5_MKEY_MASK_PAGE_SIZE &&
81 	    MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled))
82 		return -EPERM;
83 
84 	if (mask & MLX5_MKEY_MASK_A &&
85 	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
86 		return -EPERM;
87 
88 	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE &&
89 	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
90 		return -EPERM;
91 
92 	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_READ &&
93 	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
94 		return -EPERM;
95 
96 	return 0;
97 }
98 
99 enum {
100 	MAX_UMR_WR = 128,
101 };
102 
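/*
 * Drive a freshly created UMR QP through the RESET -> INIT -> RTR -> RTS
 * sequence. MLX5_IB_QPT_REG_UMR QPs carry no address/path information,
 * so only the QP state (plus pkey index and port on the INIT step) is
 * specified.
 */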
103 static int mlx5r_umr_qp_rst2rts(struct mlx5_ib_dev *dev, struct ib_qp *qp)
104 {
105 	struct ib_qp_attr attr = {};
106 	int ret;
107 
108 	attr.qp_state = IB_QPS_INIT;
109 	attr.port_num = 1;
110 	ret = ib_modify_qp(qp, &attr,
111 			   IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT);
112 	if (ret) {
113 		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
114 		return ret;
115 	}
116 
117 	memset(&attr, 0, sizeof(attr));
118 	attr.qp_state = IB_QPS_RTR;
119 
120 	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
121 	if (ret) {
122 		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
123 		return ret;
124 	}
125 
126 	memset(&attr, 0, sizeof(attr));
127 	attr.qp_state = IB_QPS_RTS;
128 	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
129 	if (ret) {
130 		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
131 		return ret;
132 	}
133 
134 	return 0;
135 }
136 
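/*
 * Lazily create the UMR CQ/QP on first use. Concurrent callers are
 * handled with double-checked locking: the fast path tests umrc.qp
 * without a lock and the slow path re-checks it under umrc.init_lock.
 */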
137 int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
138 {
139 	struct ib_qp_init_attr init_attr = {};
140 	struct ib_cq *cq;
141 	struct ib_qp *qp;
142 	int ret = 0;
143 
144 
145 	/*
146 	 * The UMR QP is set up once and never changed until device unload.
147 	 * Avoid taking the mutex if initialization is already done.
148 	 */
149 	if (dev->umrc.qp)
150 		return 0;
151 
152 	mutex_lock(&dev->umrc.init_lock);
153 	/* First user allocates the UMR resources. Skip if already allocated. */
154 	if (dev->umrc.qp)
155 		goto unlock;
156 
157 	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
158 	if (IS_ERR(cq)) {
159 		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
160 		ret = PTR_ERR(cq);
161 		goto unlock;
162 	}
163 
164 	init_attr.send_cq = cq;
165 	init_attr.recv_cq = cq;
166 	init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
167 	init_attr.cap.max_send_wr = MAX_UMR_WR;
168 	init_attr.cap.max_send_sge = 1;
169 	init_attr.qp_type = MLX5_IB_QPT_REG_UMR;
170 	init_attr.port_num = 1;
171 	qp = ib_create_qp(dev->umrc.pd, &init_attr);
172 	if (IS_ERR(qp)) {
173 		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
174 		ret = PTR_ERR(qp);
175 		goto destroy_cq;
176 	}
177 
178 	ret = mlx5r_umr_qp_rst2rts(dev, qp);
179 	if (ret)
180 		goto destroy_qp;
181 
182 	dev->umrc.cq = cq;
183 
184 	sema_init(&dev->umrc.sem, MAX_UMR_WR);
185 	mutex_init(&dev->umrc.lock);
186 	dev->umrc.state = MLX5_UMR_STATE_ACTIVE;
187 	dev->umrc.qp = qp;
188 
189 	mutex_unlock(&dev->umrc.init_lock);
190 	return 0;
191 
192 destroy_qp:
193 	ib_destroy_qp(qp);
194 destroy_cq:
195 	ib_free_cq(cq);
196 unlock:
197 	mutex_unlock(&dev->umrc.init_lock);
198 	return ret;
199 }
200 
201 void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
202 {
203 	if (dev->umrc.state == MLX5_UMR_STATE_UNINIT)
204 		return;
205 	mutex_destroy(&dev->umrc.lock);
206 	/* After device init, the UMR CQ/QP are not unset during the lifetime. */
207 	ib_destroy_qp(dev->umrc.qp);
208 	ib_free_cq(dev->umrc.cq);
209 }
210 
211 int mlx5r_umr_init(struct mlx5_ib_dev *dev)
212 {
213 	struct ib_pd *pd;
214 
215 	pd = ib_alloc_pd(&dev->ib_dev, 0);
216 	if (IS_ERR(pd)) {
217 		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
218 		return PTR_ERR(pd);
219 	}
220 	dev->umrc.pd = pd;
221 
222 	mutex_init(&dev->umrc.init_lock);
223 
224 	return 0;
225 }
226 
227 void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev)
228 {
229 	if (!dev->umrc.pd)
230 		return;
231 
232 	mutex_destroy(&dev->umrc.init_lock);
233 	ib_dealloc_pd(dev->umrc.pd);
234 }
235 
236 
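/*
 * Post a single UMR WQE on the dedicated UMR QP. When with_data is
 * false the trailing data segment is omitted, i.e. roughly:
 *
 *	wqe_size = sizeof(*wqe) - sizeof(struct mlx5_wqe_data_seg);
 *
 * Callers serialize posting and wait for the CQE themselves (see
 * mlx5r_umr_post_send_wait()).
 */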
237 static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
238 			       struct mlx5r_umr_wqe *wqe, bool with_data)
239 {
240 	unsigned int wqe_size =
241 		with_data ? sizeof(struct mlx5r_umr_wqe) :
242 			    sizeof(struct mlx5r_umr_wqe) -
243 				    sizeof(struct mlx5_wqe_data_seg);
244 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
245 	struct mlx5_core_dev *mdev = dev->mdev;
246 	struct mlx5_ib_qp *qp = to_mqp(ibqp);
247 	struct mlx5_wqe_ctrl_seg *ctrl;
248 	union {
249 		struct ib_cqe *ib_cqe;
250 		u64 wr_id;
251 	} id;
252 	void *cur_edge, *seg;
253 	unsigned long flags;
254 	unsigned int idx;
255 	int size, err;
256 
257 	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR))
258 		return -EIO;
259 
260 	spin_lock_irqsave(&qp->sq.lock, flags);
261 
262 	err = mlx5r_begin_wqe(qp, &seg, &ctrl, &idx, &size, &cur_edge, 0,
263 			      cpu_to_be32(mkey), false, false);
264 	if (WARN_ON(err))
265 		goto out;
266 
267 	qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;
268 
269 	mlx5r_memcpy_send_wqe(&qp->sq, &cur_edge, &seg, &size, wqe, wqe_size);
270 
271 	id.ib_cqe = cqe;
272 	mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
273 			 MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR);
274 
275 	mlx5r_ring_db(qp, 1, ctrl);
276 
277 out:
278 	spin_unlock_irqrestore(&qp->sq.lock, flags);
279 
280 	return err;
281 }
282 
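/*
 * Recover the UMR QP after a fatal completion: flush the outstanding
 * WRs with a barrier WR, cycle the QP through RESET back to RTS and
 * return the UMR machinery to MLX5_UMR_STATE_ACTIVE.
 */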
283 static int mlx5r_umr_recover(struct mlx5_ib_dev *dev, u32 mkey,
284 			     struct mlx5r_umr_context *umr_context,
285 			     struct mlx5r_umr_wqe *wqe, bool with_data)
286 {
287 	struct umr_common *umrc = &dev->umrc;
288 	struct ib_qp_attr attr;
289 	int err;
290 
291 	mutex_lock(&umrc->lock);
292 	/* Prevent any further WRs from being sent from now on */
293 	if (umrc->state != MLX5_UMR_STATE_RECOVER) {
294 		mlx5_ib_warn(dev, "UMR recovery encountered an unexpected state=%d\n",
295 			     umrc->state);
296 		umrc->state = MLX5_UMR_STATE_RECOVER;
297 	}
298 	mutex_unlock(&umrc->lock);
299 
300 	/* Send a final/barrier WR (the failed one) and wait for its completion.
301 	 * This ensures that all the previous WRs have completed before we
302 	 * set the QP state to RESET.
303 	 */
304 	err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context->cqe, wqe,
305 				  with_data);
306 	if (err) {
307 		mlx5_ib_warn(dev, "UMR recovery post send failed, err %d\n", err);
308 		goto err;
309 	}
310 
311 	/* Since the QP is in an error state, it will only receive
312 	 * IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier,
313 	 * we don't care about its status.
314 	 */
315 	wait_for_completion(&umr_context->done);
316 
317 	attr.qp_state = IB_QPS_RESET;
318 	err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
319 	if (err) {
320 		mlx5_ib_warn(dev, "Couldn't modify UMR QP to RESET, err=%d\n", err);
321 		goto err;
322 	}
323 
324 	err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
325 	if (err) {
326 		mlx5_ib_warn(dev, "Couldn't modify UMR QP to RTS, err=%d\n", err);
327 		goto err;
328 	}
329 
330 	umrc->state = MLX5_UMR_STATE_ACTIVE;
331 	return 0;
332 
333 err:
334 	umrc->state = MLX5_UMR_STATE_ERR;
335 	return err;
336 }
337 
338 static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc)
339 {
340 	struct mlx5_ib_umr_context *context =
341 		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
342 
343 	context->status = wc->status;
344 	complete(&context->done);
345 }
346 
347 static inline void mlx5r_umr_init_context(struct mlx5r_umr_context *context)
348 {
349 	context->cqe.done = mlx5r_umr_done;
350 	init_completion(&context->done);
351 }
352 
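/*
 * Post a UMR WQE and sleep until its completion arrives. Concurrency is
 * throttled by umrc->sem (at most MAX_UMR_WR WQEs in flight); flushed
 * WQEs are retried and any other failure triggers QP recovery.
 */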
353 static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
354 				   struct mlx5r_umr_wqe *wqe, bool with_data)
355 {
356 	struct umr_common *umrc = &dev->umrc;
357 	struct mlx5r_umr_context umr_context;
358 	int err;
359 
360 	err = umr_check_mkey_mask(dev, be64_to_cpu(wqe->ctrl_seg.mkey_mask));
361 	if (WARN_ON(err))
362 		return err;
363 
364 	mlx5r_umr_init_context(&umr_context);
365 
366 	down(&umrc->sem);
367 	while (true) {
368 		mutex_lock(&umrc->lock);
369 		if (umrc->state == MLX5_UMR_STATE_ERR) {
370 			mutex_unlock(&umrc->lock);
371 			err = -EFAULT;
372 			break;
373 		}
374 
375 		if (umrc->state == MLX5_UMR_STATE_RECOVER) {
376 			mutex_unlock(&umrc->lock);
377 			usleep_range(3000, 5000);
378 			continue;
379 		}
380 
381 		err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
382 					  with_data);
383 		mutex_unlock(&umrc->lock);
384 		if (err) {
385 			mlx5_ib_warn(dev, "UMR post send failed, err %d\n",
386 				     err);
387 			break;
388 		}
389 
390 		wait_for_completion(&umr_context.done);
391 
392 		if (umr_context.status == IB_WC_SUCCESS)
393 			break;
394 
395 		if (umr_context.status == IB_WC_WR_FLUSH_ERR)
396 			continue;
397 
398 		WARN_ON_ONCE(1);
399 		mlx5_ib_warn(dev,
400 			"reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n",
401 			umr_context.status, mkey);
402 		err = mlx5r_umr_recover(dev, mkey, &umr_context, wqe, with_data);
403 		if (err)
404 			mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
405 				     err);
406 		err = -EFAULT;
407 		break;
408 	}
409 	up(&umrc->sem);
410 	return err;
411 }
412 
413 /**
414  * mlx5r_umr_revoke_mr - Fence all DMA on the MR
415  * @mr: The MR to fence
416  *
417  * Upon return the NIC will not be doing any DMA to the pages under the MR,
418  * and any DMA in progress will be completed. Failure of this function
419  * indicates the HW has failed catastrophically.
420  */
421 int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr)
422 {
423 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
424 	struct mlx5r_umr_wqe wqe = {};
425 
426 	if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
427 		return 0;
428 
429 	wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
430 	wqe.ctrl_seg.mkey_mask |= get_umr_disable_mr_mask();
431 	wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;
432 
433 	MLX5_SET(mkc, &wqe.mkey_seg, free, 1);
434 	MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(dev->umrc.pd)->pdn);
435 	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
436 	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
437 		 mlx5_mkey_variant(mr->mmkey.key));
438 
439 	return mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
440 }
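/*
 * Illustrative caller sketch for mlx5r_umr_revoke_mr() above (not a
 * real call site): MR teardown and cache-return paths revoke the mkey
 * before the pages it maps are released, roughly:
 *
 *	err = mlx5r_umr_revoke_mr(mr);
 *	if (err)
 *		return err;
 */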
441 
442 static void mlx5r_umr_set_access_flags(struct mlx5_ib_dev *dev,
443 				       struct mlx5_mkey_seg *seg,
444 				       unsigned int access_flags)
445 {
446 	bool ro_read = (access_flags & IB_ACCESS_RELAXED_ORDERING) &&
447 		       (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
448 			pcie_relaxed_ordering_enabled(dev->mdev->pdev));
449 
450 	MLX5_SET(mkc, seg, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
451 	MLX5_SET(mkc, seg, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
452 	MLX5_SET(mkc, seg, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
453 	MLX5_SET(mkc, seg, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
454 	MLX5_SET(mkc, seg, lr, 1);
455 	MLX5_SET(mkc, seg, relaxed_ordering_write,
456 		 !!(access_flags & IB_ACCESS_RELAXED_ORDERING));
457 	MLX5_SET(mkc, seg, relaxed_ordering_read, ro_read);
458 }
459 
460 int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
461 			      int access_flags)
462 {
463 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
464 	struct mlx5r_umr_wqe wqe = {};
465 	int err;
466 
467 	wqe.ctrl_seg.mkey_mask = get_umr_update_access_mask(dev);
468 	wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
469 	wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE;
470 	wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;
471 
472 	mlx5r_umr_set_access_flags(dev, &wqe.mkey_seg, access_flags);
473 	MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(pd)->pdn);
474 	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
475 	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
476 		 mlx5_mkey_variant(mr->mmkey.key));
477 
478 	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
479 	if (err)
480 		return err;
481 
482 	mr->access_flags = access_flags;
483 	return 0;
484 }
485 
486 #define MLX5_MAX_UMR_CHUNK                                                     \
487 	((1 << (MLX5_MAX_UMR_SHIFT + 4)) - MLX5_UMR_FLEX_ALIGNMENT)
488 #define MLX5_SPARE_UMR_CHUNK 0x10000
489 
490 /*
491  * Allocate a temporary buffer to hold the per-page information to transfer to
492  * HW. For efficiency this should be as large as possible, but buffer
493  * allocation failure is not allowed, so fall back to smaller sizes.
494  */
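/*
 * The fallback ladder, in order: a high-order allocation of up to
 * MLX5_MAX_UMR_CHUNK bytes, then MLX5_SPARE_UMR_CHUNK (64K), then a
 * single page, and finally the preallocated xlt_emergency_page, which
 * is serialized by xlt_emergency_page_mutex and cannot fail.
 */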
495 static void *mlx5r_umr_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
496 {
497 	const size_t xlt_chunk_align = MLX5_UMR_FLEX_ALIGNMENT / ent_size;
498 	size_t size;
499 	void *res = NULL;
500 
501 	static_assert(PAGE_SIZE % MLX5_UMR_FLEX_ALIGNMENT == 0);
502 
503 	/*
504 	 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context, just that
505 	 * the allocation can't trigger any kind of reclaim.
506 	 */
507 	might_sleep();
508 
509 	gfp_mask |= __GFP_ZERO | __GFP_NORETRY;
510 
511 	/*
512 	 * If the system already has a suitable high order page then just use
513 	 * that, but don't try hard to create one. This max is about 1M, so a
514 	 * free x86 huge page will satisfy it.
515 	 */
516 	size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
517 		     MLX5_MAX_UMR_CHUNK);
518 	*nents = size / ent_size;
519 	res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
520 				       get_order(size));
521 	if (res)
522 		return res;
523 
524 	if (size > MLX5_SPARE_UMR_CHUNK) {
525 		size = MLX5_SPARE_UMR_CHUNK;
526 		*nents = size / ent_size;
527 		res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
528 					       get_order(size));
529 		if (res)
530 			return res;
531 	}
532 
533 	*nents = PAGE_SIZE / ent_size;
534 	res = (void *)__get_free_page(gfp_mask);
535 	if (res)
536 		return res;
537 
538 	mutex_lock(&xlt_emergency_page_mutex);
539 	memset(xlt_emergency_page, 0, PAGE_SIZE);
540 	return xlt_emergency_page;
541 }
542 
543 static void mlx5r_umr_free_xlt(void *xlt, size_t length)
544 {
545 	if (xlt == xlt_emergency_page) {
546 		mutex_unlock(&xlt_emergency_page_mutex);
547 		return;
548 	}
549 
550 	free_pages((unsigned long)xlt, get_order(length));
551 }
552 
553 static void mlx5r_umr_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
554 				     struct ib_sge *sg)
555 {
556 	struct device *ddev = &dev->mdev->pdev->dev;
557 
558 	dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
559 	mlx5r_umr_free_xlt(xlt, sg->length);
560 }
561 
562 /*
563  * Create an XLT buffer ready for submission (allocated and DMA-mapped).
564  */
565 static void *mlx5r_umr_create_xlt(struct mlx5_ib_dev *dev, struct ib_sge *sg,
566 				  size_t nents, size_t ent_size,
567 				  unsigned int flags)
568 {
569 	struct device *ddev = &dev->mdev->pdev->dev;
570 	dma_addr_t dma;
571 	void *xlt;
572 
573 	xlt = mlx5r_umr_alloc_xlt(&nents, ent_size,
574 				 flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
575 								  GFP_KERNEL);
576 	sg->length = nents * ent_size;
577 	dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
578 	if (dma_mapping_error(ddev, dma)) {
579 		mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
580 		mlx5r_umr_free_xlt(xlt, sg->length);
581 		return NULL;
582 	}
583 	sg->addr = dma;
584 	sg->lkey = dev->umrc.pd->local_dma_lkey;
585 
586 	return xlt;
587 }
588 
589 static void
590 mlx5r_umr_set_update_xlt_ctrl_seg(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
591 				  unsigned int flags, struct ib_sge *sg)
592 {
593 	if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
594 		/* fail if free */
595 		ctrl_seg->flags = MLX5_UMR_CHECK_FREE;
596 	else
597 		/* fail if not free */
598 		ctrl_seg->flags = MLX5_UMR_CHECK_NOT_FREE;
599 	ctrl_seg->xlt_octowords =
600 		cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
601 }
602 
603 static void mlx5r_umr_set_update_xlt_mkey_seg(struct mlx5_ib_dev *dev,
604 					      struct mlx5_mkey_seg *mkey_seg,
605 					      struct mlx5_ib_mr *mr,
606 					      unsigned int page_shift)
607 {
608 	mlx5r_umr_set_access_flags(dev, mkey_seg, mr->access_flags);
609 	MLX5_SET(mkc, mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn);
610 	MLX5_SET64(mkc, mkey_seg, start_addr, mr->ibmr.iova);
611 	MLX5_SET64(mkc, mkey_seg, len, mr->ibmr.length);
612 	MLX5_SET(mkc, mkey_seg, log_page_size, page_shift);
613 	MLX5_SET(mkc, mkey_seg, qpn, 0xffffff);
614 	MLX5_SET(mkc, mkey_seg, mkey_7_0, mlx5_mkey_variant(mr->mmkey.key));
615 }
616 
617 static void
618 mlx5r_umr_set_update_xlt_data_seg(struct mlx5_wqe_data_seg *data_seg,
619 				  struct ib_sge *sg)
620 {
621 	data_seg->byte_count = cpu_to_be32(sg->length);
622 	data_seg->lkey = cpu_to_be32(sg->lkey);
623 	data_seg->addr = cpu_to_be64(sg->addr);
624 }
625 
626 static void mlx5r_umr_update_offset(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
627 				    u64 offset)
628 {
629 	u64 octo_offset = mlx5r_umr_get_xlt_octo(offset);
630 
631 	ctrl_seg->xlt_offset = cpu_to_be16(octo_offset & 0xffff);
632 	ctrl_seg->xlt_offset_47_16 = cpu_to_be32(octo_offset >> 16);
633 	ctrl_seg->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
634 }
635 
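/*
 * The last UMR WQE of a series also carries the mkey_mask bits that
 * enable the mkey and/or update its PD, access flags and translation,
 * depending on the MLX5_IB_UPD_XLT_* flags.
 */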
636 static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
637 				       struct mlx5r_umr_wqe *wqe,
638 				       struct mlx5_ib_mr *mr, struct ib_sge *sg,
639 				       unsigned int flags)
640 {
641 	bool update_pd_access, update_translation;
642 
643 	if (flags & MLX5_IB_UPD_XLT_ENABLE)
644 		wqe->ctrl_seg.mkey_mask |= get_umr_enable_mr_mask();
645 
646 	update_pd_access = flags & MLX5_IB_UPD_XLT_ENABLE ||
647 			   flags & MLX5_IB_UPD_XLT_PD ||
648 			   flags & MLX5_IB_UPD_XLT_ACCESS;
649 
650 	if (update_pd_access) {
651 		wqe->ctrl_seg.mkey_mask |= get_umr_update_access_mask(dev);
652 		wqe->ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
653 	}
654 
655 	update_translation =
656 		flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR;
657 
658 	if (update_translation) {
659 		wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask(dev);
660 		if (!mr->ibmr.length)
661 			MLX5_SET(mkc, &wqe->mkey_seg, length64, 1);
662 	}
663 
664 	wqe->ctrl_seg.xlt_octowords =
665 		cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
666 	wqe->data_seg.byte_count = cpu_to_be32(sg->length);
667 }
668 
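/*
 * Walk the umem in page_shift-sized DMA blocks and pack them into the
 * XLT buffer as KSMs (data-direct) or MTTs. Whenever the buffer fills
 * up, post a UMR WQE and continue from the next xlt offset.
 */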
669 static int
670 _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd)
671 {
672 	size_t ent_size = dd ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt);
673 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
674 	struct device *ddev = &dev->mdev->pdev->dev;
675 	struct mlx5r_umr_wqe wqe = {};
676 	struct ib_block_iter biter;
677 	struct mlx5_ksm *cur_ksm;
678 	struct mlx5_mtt *cur_mtt;
679 	size_t orig_sg_length;
680 	size_t final_size;
681 	void *curr_entry;
682 	struct ib_sge sg;
683 	void *entry;
684 	u64 offset = 0;
685 	int err = 0;
686 
687 	entry = mlx5r_umr_create_xlt(dev, &sg,
688 				     ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift),
689 				     ent_size, flags);
690 	if (!entry)
691 		return -ENOMEM;
692 
693 	orig_sg_length = sg.length;
694 	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
695 	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr,
696 					  mr->page_shift);
697 	if (dd) {
698 		/* Use the data direct internal kernel PD */
699 		MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn);
700 		cur_ksm = entry;
701 	} else {
702 		cur_mtt = entry;
703 	}
704 
705 	mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);
706 
707 	curr_entry = entry;
708 	rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) {
709 		if (curr_entry == entry + sg.length) {
710 			dma_sync_single_for_device(ddev, sg.addr, sg.length,
711 						   DMA_TO_DEVICE);
712 
713 			err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe,
714 						       true);
715 			if (err)
716 				goto err;
717 			dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
718 						DMA_TO_DEVICE);
719 			offset += sg.length;
720 			mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);
721 			if (dd)
722 				cur_ksm = entry;
723 			else
724 				cur_mtt = entry;
725 		}
726 
727 		if (dd) {
728 			cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter));
729 			cur_ksm->key = cpu_to_be32(dev->ddr.mkey);
730 			cur_ksm++;
731 			curr_entry = cur_ksm;
732 		} else {
733 			cur_mtt->ptag =
734 				cpu_to_be64(rdma_block_iter_dma_address(&biter) |
735 					    MLX5_IB_MTT_PRESENT);
736 			if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
737 				cur_mtt->ptag = 0;
738 			cur_mtt++;
739 			curr_entry = cur_mtt;
740 		}
741 	}
742 
743 	final_size = curr_entry - entry;
744 	sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT);
745 	memset(curr_entry, 0, sg.length - final_size);
746 	mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
747 
748 	dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
749 	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);
750 
751 err:
752 	sg.length = orig_sg_length;
753 	mlx5r_umr_unmap_free_xlt(dev, entry, &sg);
754 	return err;
755 }
756 
757 int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags)
758 {
759 	/* No invalidation flow is expected */
760 	if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP))
761 		return -EINVAL;
762 
763 	return _mlx5r_umr_update_mr_pas(mr, flags, true);
764 }
765 
766 /*
767  * Send the DMA list to the HW for a normal MR using UMR.
768  * A dmabuf MR is handled in a similar way, except that the
769  * MLX5_IB_UPD_XLT_ZAP flag may be used.
770  */
771 int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
772 {
773 	if (WARN_ON(mr->umem->is_odp))
774 		return -EINVAL;
775 
776 	return _mlx5r_umr_update_mr_pas(mr, flags, false);
777 }
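/*
 * Illustrative use of mlx5r_umr_update_mr_pas() above (see mr.c for the
 * real call sites): a UMR-based registration typically enables the mkey
 * and programs its PAS in one call, roughly:
 *
 *	err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
 */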
778 
779 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
780 {
781 	return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
782 }
783 
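/*
 * Update a window of the XLT of an ODP MR: npages entries starting at
 * index idx, stored as MTTs or, with MLX5_IB_UPD_XLT_INDIRECT, as KLMs.
 * The window is aligned to MLX5_UMR_FLEX_ALIGNMENT and posted in
 * sg.length-sized chunks.
 */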
784 int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
785 			 int page_shift, int flags)
786 {
787 	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
788 			       ? sizeof(struct mlx5_klm)
789 			       : sizeof(struct mlx5_mtt);
790 	const int page_align = MLX5_UMR_FLEX_ALIGNMENT / desc_size;
791 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
792 	struct device *ddev = &dev->mdev->pdev->dev;
793 	const int page_mask = page_align - 1;
794 	struct mlx5r_umr_wqe wqe = {};
795 	size_t pages_mapped = 0;
796 	size_t pages_to_map = 0;
797 	size_t size_to_map = 0;
798 	size_t orig_sg_length;
799 	size_t pages_iter;
800 	struct ib_sge sg;
801 	int err = 0;
802 	void *xlt;
803 
804 	if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
805 	    !umr_can_use_indirect_mkey(dev))
806 		return -EPERM;
807 
808 	if (WARN_ON(!mr->umem->is_odp))
809 		return -EINVAL;
810 
811 	/* UMR copies MTTs in units of MLX5_UMR_FLEX_ALIGNMENT bytes,
812 	 * so we need to align the offset and length accordingly
813 	 */
814 	if (idx & page_mask) {
815 		npages += idx & page_mask;
816 		idx &= ~page_mask;
817 	}
818 	pages_to_map = ALIGN(npages, page_align);
819 
820 	xlt = mlx5r_umr_create_xlt(dev, &sg, npages, desc_size, flags);
821 	if (!xlt)
822 		return -ENOMEM;
823 
824 	pages_iter = sg.length / desc_size;
825 	orig_sg_length = sg.length;
826 
827 	if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
828 		struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
829 		size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
830 
831 		pages_to_map = min_t(size_t, pages_to_map, max_pages);
832 	}
833 
834 	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
835 	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, page_shift);
836 	mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);
837 
838 	for (pages_mapped = 0;
839 	     pages_mapped < pages_to_map && !err;
840 	     pages_mapped += pages_iter, idx += pages_iter) {
841 		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
842 		size_to_map = npages * desc_size;
843 		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
844 					DMA_TO_DEVICE);
845 		/*
846 		 * npages is the maximum number of pages to map, but we
847 		 * can't guarantee that all pages are actually mapped.
848 		 *
849 		 * For example, if a page is a p2p page of a type that is not
850 		 * supported for mapping, the number of pages mapped will be
851 		 * less than requested.
852 		 */
853 		err = mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
854 		if (err)
855 			return err;
856 		dma_sync_single_for_device(ddev, sg.addr, sg.length,
857 					   DMA_TO_DEVICE);
858 		sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT);
859 
860 		if (pages_mapped + pages_iter >= pages_to_map)
861 			mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
862 		mlx5r_umr_update_offset(&wqe.ctrl_seg, idx * desc_size);
863 		err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);
864 	}
865 	sg.length = orig_sg_length;
866 	mlx5r_umr_unmap_free_xlt(dev, xlt, &sg);
867 	return err;
868 }
869