1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. */
3
4 #include <rdma/ib_umem_odp.h>
5 #include "mlx5_ib.h"
6 #include "umr.h"
7 #include "wr.h"
8
9 /*
10 * We can't use an array for xlt_emergency_page because dma_map_single doesn't
11 * work on kernel modules memory
12 */
13 void *xlt_emergency_page;
14 static DEFINE_MUTEX(xlt_emergency_page_mutex);
15
get_umr_enable_mr_mask(void)16 static __be64 get_umr_enable_mr_mask(void)
17 {
18 u64 result;
19
20 result = MLX5_MKEY_MASK_KEY |
21 MLX5_MKEY_MASK_FREE;
22
23 return cpu_to_be64(result);
24 }
25
get_umr_disable_mr_mask(void)26 static __be64 get_umr_disable_mr_mask(void)
27 {
28 u64 result;
29
30 result = MLX5_MKEY_MASK_FREE;
31
32 return cpu_to_be64(result);
33 }
34
get_umr_update_translation_mask(void)35 static __be64 get_umr_update_translation_mask(void)
36 {
37 u64 result;
38
39 result = MLX5_MKEY_MASK_LEN |
40 MLX5_MKEY_MASK_PAGE_SIZE |
41 MLX5_MKEY_MASK_START_ADDR;
42
43 return cpu_to_be64(result);
44 }
45
get_umr_update_access_mask(struct mlx5_ib_dev * dev)46 static __be64 get_umr_update_access_mask(struct mlx5_ib_dev *dev)
47 {
48 u64 result;
49
50 result = MLX5_MKEY_MASK_LR |
51 MLX5_MKEY_MASK_LW |
52 MLX5_MKEY_MASK_RR |
53 MLX5_MKEY_MASK_RW;
54
55 if (MLX5_CAP_GEN(dev->mdev, atomic))
56 result |= MLX5_MKEY_MASK_A;
57
58 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
59 result |= MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE;
60
61 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
62 result |= MLX5_MKEY_MASK_RELAXED_ORDERING_READ;
63
64 return cpu_to_be64(result);
65 }
66
get_umr_update_pd_mask(void)67 static __be64 get_umr_update_pd_mask(void)
68 {
69 u64 result;
70
71 result = MLX5_MKEY_MASK_PD;
72
73 return cpu_to_be64(result);
74 }
75
/*
 * Reject a UMR mkey mask that asks HW to modify fields which the device
 * capabilities say UMR modification is disabled for. Returns 0 when the
 * mask is permitted, -EPERM otherwise.
 */
static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask)
{
	struct mlx5_core_dev *mdev = dev->mdev;

	if ((mask & MLX5_MKEY_MASK_PAGE_SIZE) &&
	    MLX5_CAP_GEN(mdev, umr_modify_entity_size_disabled))
		return -EPERM;

	if ((mask & MLX5_MKEY_MASK_A) &&
	    MLX5_CAP_GEN(mdev, umr_modify_atomic_disabled))
		return -EPERM;

	if ((mask & MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE) &&
	    !MLX5_CAP_GEN(mdev, relaxed_ordering_write_umr))
		return -EPERM;

	if ((mask & MLX5_MKEY_MASK_RELAXED_ORDERING_READ) &&
	    !MLX5_CAP_GEN(mdev, relaxed_ordering_read_umr))
		return -EPERM;

	return 0;
}
96
/*
 * Depth of the UMR QP send queue; also used to initialize the semaphore
 * that bounds the number of in-flight UMR work requests.
 */
enum {
	MAX_UMR_WR = 128,
};
100
/*
 * Walk the UMR QP through RESET -> INIT -> RTR -> RTS so it can post send
 * WRs. The transitions must be done in this exact order per the IB spec.
 * Returns 0 on success or the ib_modify_qp() error.
 */
static int mlx5r_umr_qp_rst2rts(struct mlx5_ib_dev *dev, struct ib_qp *qp)
{
	struct ib_qp_attr attr = {};
	int ret;

	/* RESET -> INIT: pkey index 0 (zeroed attr) on port 1 */
	attr.qp_state = IB_QPS_INIT;
	attr.port_num = 1;
	ret = ib_modify_qp(qp, &attr,
			   IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT);
	if (ret) {
		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
		return ret;
	}

	/* INIT -> RTR: loopback QP, no remote attributes needed */
	memset(&attr, 0, sizeof(attr));
	attr.qp_state = IB_QPS_RTR;

	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
	if (ret) {
		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
		return ret;
	}

	/* RTR -> RTS: QP is now ready to post UMR WQEs */
	memset(&attr, 0, sizeof(attr));
	attr.qp_state = IB_QPS_RTS;
	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
	if (ret) {
		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
		return ret;
	}

	return 0;
}
134
/*
 * Lazily allocate the shared UMR resources (CQ + QP) on first use.
 *
 * Uses double-checked locking on umrc.qp: the unlocked fast-path check
 * avoids the mutex once initialization is complete, and the check is
 * repeated under umrc.init_lock so only the first caller allocates.
 * umrc.qp is assigned last, after all other umrc fields are set up, so a
 * non-NULL qp implies a fully initialized umrc.
 *
 * NOTE(review): the unlocked read of dev->umrc.qp is a benign data race
 * by design here — presumably acceptable since qp is only ever set once;
 * confirm against the driver's memory-ordering expectations.
 *
 * Returns 0 on success (including when already initialized), or a
 * negative errno; partially created resources are torn down on failure.
 */
int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
{
	struct ib_qp_init_attr init_attr = {};
	struct ib_cq *cq;
	struct ib_qp *qp;
	int ret = 0;


	/*
	 * UMR qp is set once, never changed until device unload.
	 * Avoid taking the mutex if initialization is already done.
	 */
	if (dev->umrc.qp)
		return 0;

	mutex_lock(&dev->umrc.init_lock);
	/* First user allocates the UMR resources. Skip if already allocated. */
	if (dev->umrc.qp)
		goto unlock;

	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
	if (IS_ERR(cq)) {
		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
		ret = PTR_ERR(cq);
		goto unlock;
	}

	init_attr.send_cq = cq;
	init_attr.recv_cq = cq;
	init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
	init_attr.cap.max_send_wr = MAX_UMR_WR;
	init_attr.cap.max_send_sge = 1;
	init_attr.qp_type = MLX5_IB_QPT_REG_UMR;
	init_attr.port_num = 1;
	qp = ib_create_qp(dev->umrc.pd, &init_attr);
	if (IS_ERR(qp)) {
		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
		ret = PTR_ERR(qp);
		goto destroy_cq;
	}

	/* Bring the QP to RTS so it can accept UMR work requests */
	ret = mlx5r_umr_qp_rst2rts(dev, qp);
	if (ret)
		goto destroy_qp;

	dev->umrc.cq = cq;

	sema_init(&dev->umrc.sem, MAX_UMR_WR);
	mutex_init(&dev->umrc.lock);
	dev->umrc.state = MLX5_UMR_STATE_ACTIVE;
	/* Publish last: readers treat a non-NULL qp as "fully initialized" */
	dev->umrc.qp = qp;

	mutex_unlock(&dev->umrc.init_lock);
	return 0;

destroy_qp:
	ib_destroy_qp(qp);
destroy_cq:
	ib_free_cq(cq);
unlock:
	mutex_unlock(&dev->umrc.init_lock);
	return ret;
}
198
/*
 * Tear down the UMR CQ/QP created by mlx5r_umr_resource_init(). A no-op
 * when the lazy initialization never ran (state still UNINIT).
 */
void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
{
	if (dev->umrc.state == MLX5_UMR_STATE_UNINIT)
		return;
	mutex_destroy(&dev->umrc.lock);
	/* After device init, UMR cp/qp are not unset during the lifetime. */
	ib_destroy_qp(dev->umrc.qp);
	ib_free_cq(dev->umrc.cq);
}
208
mlx5r_umr_init(struct mlx5_ib_dev * dev)209 int mlx5r_umr_init(struct mlx5_ib_dev *dev)
210 {
211 struct ib_pd *pd;
212
213 pd = ib_alloc_pd(&dev->ib_dev, 0);
214 if (IS_ERR(pd)) {
215 mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
216 return PTR_ERR(pd);
217 }
218 dev->umrc.pd = pd;
219
220 mutex_init(&dev->umrc.init_lock);
221
222 return 0;
223 }
224
mlx5r_umr_cleanup(struct mlx5_ib_dev * dev)225 void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev)
226 {
227 if (!dev->umrc.pd)
228 return;
229
230 mutex_destroy(&dev->umrc.init_lock);
231 ib_dealloc_pd(dev->umrc.pd);
232 }
233
234
/*
 * Build and post a single UMR WQE on the UMR QP.
 *
 * @ibqp:      the UMR QP
 * @mkey:      mkey the UMR operates on (placed in the ctrl segment)
 * @cqe:       completion cookie; its ->done callback fires on completion
 * @wqe:       pre-built ctrl/mkey(/data) segments to copy into the SQ
 * @with_data: when false, the trailing data segment is omitted from the
 *             copied WQE size
 *
 * The caller's cqe pointer is smuggled through wr_id (see the union) so
 * the completion handler can recover it. Returns 0 on success, -EIO if
 * the device is in internal error, or the mlx5r_begin_wqe() error.
 */
static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
			       struct mlx5r_umr_wqe *wqe, bool with_data)
{
	unsigned int wqe_size =
		with_data ? sizeof(struct mlx5r_umr_wqe) :
			    sizeof(struct mlx5r_umr_wqe) -
				    sizeof(struct mlx5_wqe_data_seg);
	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
	struct mlx5_core_dev *mdev = dev->mdev;
	struct mlx5_ib_qp *qp = to_mqp(ibqp);
	struct mlx5_wqe_ctrl_seg *ctrl;
	/* Carry the ib_cqe pointer in the 64-bit wr_id slot of the WQE */
	union {
		struct ib_cqe *ib_cqe;
		u64 wr_id;
	} id;
	void *cur_edge, *seg;
	unsigned long flags;
	unsigned int idx;
	int size, err;

	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR))
		return -EIO;

	/* The send queue is shared; serialize WQE construction + doorbell */
	spin_lock_irqsave(&qp->sq.lock, flags);

	err = mlx5r_begin_wqe(qp, &seg, &ctrl, &idx, &size, &cur_edge, 0,
			      cpu_to_be32(mkey), false, false);
	/* Callers throttle via umrc.sem, so the SQ should never be full */
	if (WARN_ON(err))
		goto out;

	qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;

	mlx5r_memcpy_send_wqe(&qp->sq, &cur_edge, &seg, &size, wqe, wqe_size);

	id.ib_cqe = cqe;
	mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
			 MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR);

	/* Ring the doorbell to hand the WQE to HW */
	mlx5r_ring_db(qp, 1, ctrl);

out:
	spin_unlock_irqrestore(&qp->sq.lock, flags);

	return err;
}
280
/*
 * Recover the UMR QP after a fatal WR completion error.
 *
 * Moves umrc into RECOVER state (blocking new submitters), posts the
 * failed WR again as a flush barrier, waits for it, then cycles the QP
 * through RESET and back to RTS. On success umrc returns to ACTIVE; on
 * any failure umrc is left in ERR and all future UMR posts will fail.
 *
 * Returns 0 on successful recovery or a negative errno.
 */
static int mlx5r_umr_recover(struct mlx5_ib_dev *dev, u32 mkey,
			     struct mlx5r_umr_context *umr_context,
			     struct mlx5r_umr_wqe *wqe, bool with_data)
{
	struct umr_common *umrc = &dev->umrc;
	struct ib_qp_attr attr;
	int err;

	mutex_lock(&umrc->lock);
	/* Preventing any further WRs to be sent now */
	if (umrc->state != MLX5_UMR_STATE_RECOVER) {
		mlx5_ib_warn(dev, "UMR recovery encountered an unexpected state=%d\n",
			     umrc->state);
		umrc->state = MLX5_UMR_STATE_RECOVER;
	}
	mutex_unlock(&umrc->lock);

	/* Sending a final/barrier WR (the failed one) and wait for its completion.
	 * This will ensure that all the previous WRs got a completion before
	 * we set the QP state to RESET.
	 */
	err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context->cqe, wqe,
				  with_data);
	if (err) {
		mlx5_ib_warn(dev, "UMR recovery post send failed, err %d\n", err);
		goto err;
	}

	/* Since the QP is in an error state, it will only receive
	 * IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier
	 * we don't care about its status.
	 */
	wait_for_completion(&umr_context->done);

	/* any->RESET reads only qp_state, so the rest of attr may stay unset */
	attr.qp_state = IB_QPS_RESET;
	err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
	if (err) {
		mlx5_ib_warn(dev, "Couldn't modify UMR QP to RESET, err=%d\n", err);
		goto err;
	}

	err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
	if (err) {
		mlx5_ib_warn(dev, "Couldn't modify UMR QP to RTS, err=%d\n", err);
		goto err;
	}

	umrc->state = MLX5_UMR_STATE_ACTIVE;
	return 0;

err:
	umrc->state = MLX5_UMR_STATE_ERR;
	return err;
}
335
/*
 * UMR completion handler: record the WC status and wake the waiter.
 *
 * NOTE(review): recovers the context via container_of() on the embedded
 * cqe using struct mlx5_ib_umr_context, while posters pass a
 * struct mlx5r_umr_context — this relies on both types placing cqe,
 * status and done identically; confirm the two layouts stay in sync.
 */
static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct mlx5_ib_umr_context *context =
		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);

	context->status = wc->status;
	complete(&context->done);
}
344
mlx5r_umr_init_context(struct mlx5r_umr_context * context)345 static inline void mlx5r_umr_init_context(struct mlx5r_umr_context *context)
346 {
347 context->cqe.done = mlx5r_umr_done;
348 init_completion(&context->done);
349 }
350
/*
 * Post a UMR WQE and synchronously wait for its completion.
 *
 * Concurrency: umrc->sem bounds in-flight WRs to the SQ depth
 * (MAX_UMR_WR); umrc->lock serializes the state check with the post so a
 * WR cannot slip in after recovery has started.
 *
 * The loop handles the umrc states:
 *  - ERR:     give up immediately with -EFAULT;
 *  - RECOVER: another thread is recovering the QP — back off and retry;
 *  - ACTIVE:  post and wait. IB_WC_WR_FLUSH_ERR means our WR was flushed
 *             by someone else's recovery, so simply resubmit. Any other
 *             failed status triggers mlx5r_umr_recover() and returns
 *             -EFAULT.
 *
 * Returns 0 when the WR completed with IB_WC_SUCCESS, negative errno
 * otherwise.
 */
static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
				    struct mlx5r_umr_wqe *wqe, bool with_data)
{
	struct umr_common *umrc = &dev->umrc;
	struct mlx5r_umr_context umr_context;
	int err;

	/* Reject masks touching fields this device forbids UMR to modify */
	err = umr_check_mkey_mask(dev, be64_to_cpu(wqe->ctrl_seg.mkey_mask));
	if (WARN_ON(err))
		return err;

	mlx5r_umr_init_context(&umr_context);

	down(&umrc->sem);
	while (true) {
		mutex_lock(&umrc->lock);
		if (umrc->state == MLX5_UMR_STATE_ERR) {
			mutex_unlock(&umrc->lock);
			err = -EFAULT;
			break;
		}

		if (umrc->state == MLX5_UMR_STATE_RECOVER) {
			mutex_unlock(&umrc->lock);
			usleep_range(3000, 5000);
			continue;
		}

		err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
					  with_data);
		mutex_unlock(&umrc->lock);
		if (err) {
			mlx5_ib_warn(dev, "UMR post send failed, err %d\n",
				     err);
			break;
		}

		wait_for_completion(&umr_context.done);

		if (umr_context.status == IB_WC_SUCCESS)
			break;

		/* Flushed by a concurrent recovery — safe to resubmit */
		if (umr_context.status == IB_WC_WR_FLUSH_ERR)
			continue;

		WARN_ON_ONCE(1);
		mlx5_ib_warn(dev,
			"reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n",
			umr_context.status, mkey);
		err = mlx5r_umr_recover(dev, mkey, &umr_context, wqe, with_data);
		if (err)
			mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
				     err);
		/* The original WR failed even if recovery succeeded */
		err = -EFAULT;
		break;
	}
	up(&umrc->sem);
	return err;
}
410
411 /**
412 * mlx5r_umr_revoke_mr - Fence all DMA on the MR
413 * @mr: The MR to fence
414 *
415 * Upon return the NIC will not be doing any DMA to the pages under the MR,
416 * and any DMA in progress will be completed. Failure of this function
417 * indicates the HW has failed catastrophically.
418 */
int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	struct mlx5r_umr_wqe wqe = {};

	/* In internal error the HW does no DMA anyway; nothing to fence */
	if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
		return 0;

	/* Free the mkey and repoint it at the driver-owned UMR PD */
	wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
	wqe.ctrl_seg.mkey_mask |= get_umr_disable_mr_mask();
	wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;

	MLX5_SET(mkc, &wqe.mkey_seg, free, 1);
	MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(dev->umrc.pd)->pdn);
	/* qpn of all-ones = not bound to a specific QP */
	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
		 mlx5_mkey_variant(mr->mmkey.key));

	return mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
}
439
/*
 * Translate IB access flags into the mkey context access bits.
 *
 * Relaxed-ordering reads are only enabled when requested AND the device
 * supports them (either via the mkc capability or PCIe relaxed ordering
 * being enabled on the PCI device). Local read is always granted.
 */
static void mlx5r_umr_set_access_flags(struct mlx5_ib_dev *dev,
				       struct mlx5_mkey_seg *seg,
				       unsigned int access_flags)
{
	bool ro_read = (access_flags & IB_ACCESS_RELAXED_ORDERING) &&
		       (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
			pcie_relaxed_ordering_enabled(dev->mdev->pdev));

	MLX5_SET(mkc, seg, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, seg, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, seg, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, seg, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, seg, lr, 1);
	MLX5_SET(mkc, seg, relaxed_ordering_write,
		 !!(access_flags & IB_ACCESS_RELAXED_ORDERING));
	MLX5_SET(mkc, seg, relaxed_ordering_read, ro_read);
}
457
/*
 * Re-register an MR's protection domain and access flags in place via a
 * UMR (no translation change). mr->access_flags is updated only after
 * the HW operation succeeds. Returns 0 or a negative errno.
 */
int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
			      int access_flags)
{
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	struct mlx5r_umr_wqe wqe = {};
	int err;

	wqe.ctrl_seg.mkey_mask = get_umr_update_access_mask(dev);
	wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
	/* CHECK_FREE: fail if the mkey is free (must be an active MR) */
	wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE;
	wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;

	mlx5r_umr_set_access_flags(dev, &wqe.mkey_seg, access_flags);
	MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
		 mlx5_mkey_variant(mr->mmkey.key));

	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
	if (err)
		return err;

	mr->access_flags = access_flags;
	return 0;
}
483
484 #define MLX5_MAX_UMR_CHUNK \
485 ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - MLX5_UMR_FLEX_ALIGNMENT)
486 #define MLX5_SPARE_UMR_CHUNK 0x10000
487
488 /*
489 * Allocate a temporary buffer to hold the per-page information to transfer to
490 * HW. For efficiency this should be as large as it can be, but buffer
491 * allocation failure is not allowed, so try smaller sizes.
492 */
static void *mlx5r_umr_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
{
	const size_t xlt_chunk_align = MLX5_UMR_FLEX_ALIGNMENT / ent_size;
	size_t size;
	void *res = NULL;

	static_assert(PAGE_SIZE % MLX5_UMR_FLEX_ALIGNMENT == 0);

	/*
	 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the
	 * allocation can't trigger any kind of reclaim.
	 */
	might_sleep();

	gfp_mask |= __GFP_ZERO | __GFP_NORETRY;

	/*
	 * If the system already has a suitable high order page then just use
	 * that, but don't try hard to create one. This max is about 1M, so a
	 * free x86 huge page will satisfy it.
	 */
	size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
		     MLX5_MAX_UMR_CHUNK);
	/* *nents is an in/out parameter: report how many entries fit */
	*nents = size / ent_size;
	res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
				       get_order(size));
	if (res)
		return res;

	/* Fall back to a smaller (64K) chunk before giving up on high order */
	if (size > MLX5_SPARE_UMR_CHUNK) {
		size = MLX5_SPARE_UMR_CHUNK;
		*nents = size / ent_size;
		res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
					       get_order(size));
		if (res)
			return res;
	}

	/* Last regular attempt: a single page */
	*nents = PAGE_SIZE / ent_size;
	res = (void *)__get_free_page(gfp_mask);
	if (res)
		return res;

	/*
	 * Ultimate fallback: the pre-allocated emergency page. The mutex is
	 * taken here and released only in mlx5r_umr_free_xlt(), so at most
	 * one user holds the emergency page at a time. Never returns NULL.
	 */
	mutex_lock(&xlt_emergency_page_mutex);
	memset(xlt_emergency_page, 0, PAGE_SIZE);
	return xlt_emergency_page;
}
540
/*
 * Release an XLT buffer from mlx5r_umr_alloc_xlt(). The emergency page is
 * never freed — returning it just drops the mutex taken at allocation.
 */
static void mlx5r_umr_free_xlt(void *xlt, size_t length)
{
	if (xlt != xlt_emergency_page) {
		free_pages((unsigned long)xlt, get_order(length));
		return;
	}

	mutex_unlock(&xlt_emergency_page_mutex);
}
550
/*
 * Undo mlx5r_umr_create_xlt(): unmap the DMA mapping, then free (or
 * release) the buffer. The caller must restore sg->length to the original
 * mapped length before calling (see users of orig_sg_length).
 */
static void mlx5r_umr_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
				     struct ib_sge *sg)
{
	struct device *ddev = &dev->mdev->pdev->dev;

	dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
	mlx5r_umr_free_xlt(xlt, sg->length);
}
559
560 /*
561 * Create an XLT buffer ready for submission.
562 */
static void *mlx5r_umr_create_xlt(struct mlx5_ib_dev *dev, struct ib_sge *sg,
				  size_t nents, size_t ent_size,
				  unsigned int flags)
{
	struct device *ddev = &dev->mdev->pdev->dev;
	dma_addr_t dma;
	void *xlt;

	/* May return fewer entries than requested; never returns NULL */
	xlt = mlx5r_umr_alloc_xlt(&nents, ent_size,
				  flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
								   GFP_KERNEL);
	sg->length = nents * ent_size;
	dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, dma)) {
		mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
		mlx5r_umr_free_xlt(xlt, sg->length);
		return NULL;
	}
	/* Fill the SGE the UMR data segment will point at */
	sg->addr = dma;
	sg->lkey = dev->umrc.pd->local_dma_lkey;

	return xlt;
}
586
587 static void
mlx5r_umr_set_update_xlt_ctrl_seg(struct mlx5_wqe_umr_ctrl_seg * ctrl_seg,unsigned int flags,struct ib_sge * sg)588 mlx5r_umr_set_update_xlt_ctrl_seg(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
589 unsigned int flags, struct ib_sge *sg)
590 {
591 if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
592 /* fail if free */
593 ctrl_seg->flags = MLX5_UMR_CHECK_FREE;
594 else
595 /* fail if not free */
596 ctrl_seg->flags = MLX5_UMR_CHECK_NOT_FREE;
597 ctrl_seg->xlt_octowords =
598 cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
599 }
600
/*
 * Fill the mkey context segment for an XLT update from the MR's current
 * properties: access flags, PD, IOVA, length, page size and mkey variant.
 */
static void mlx5r_umr_set_update_xlt_mkey_seg(struct mlx5_ib_dev *dev,
					      struct mlx5_mkey_seg *mkey_seg,
					      struct mlx5_ib_mr *mr,
					      unsigned int page_shift)
{
	mlx5r_umr_set_access_flags(dev, mkey_seg, mr->access_flags);
	MLX5_SET(mkc, mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn);
	MLX5_SET64(mkc, mkey_seg, start_addr, mr->ibmr.iova);
	MLX5_SET64(mkc, mkey_seg, len, mr->ibmr.length);
	MLX5_SET(mkc, mkey_seg, log_page_size, page_shift);
	/* qpn of all-ones = not bound to a specific QP */
	MLX5_SET(mkc, mkey_seg, qpn, 0xffffff);
	MLX5_SET(mkc, mkey_seg, mkey_7_0, mlx5_mkey_variant(mr->mmkey.key));
}
614
615 static void
mlx5r_umr_set_update_xlt_data_seg(struct mlx5_wqe_data_seg * data_seg,struct ib_sge * sg)616 mlx5r_umr_set_update_xlt_data_seg(struct mlx5_wqe_data_seg *data_seg,
617 struct ib_sge *sg)
618 {
619 data_seg->byte_count = cpu_to_be32(sg->length);
620 data_seg->lkey = cpu_to_be32(sg->lkey);
621 data_seg->addr = cpu_to_be64(sg->addr);
622 }
623
/*
 * Set the XLT byte offset (converted to octowords) for a partial update,
 * splitting the 48-bit octoword offset across the two ctrl-segment fields,
 * and enable translation-offset mode.
 */
static void mlx5r_umr_update_offset(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
				    u64 offset)
{
	u64 octo_offset = mlx5r_umr_get_xlt_octo(offset);

	ctrl_seg->xlt_offset = cpu_to_be16(octo_offset & 0xffff);
	ctrl_seg->xlt_offset_47_16 = cpu_to_be32(octo_offset >> 16);
	ctrl_seg->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
}
633
/*
 * Patch the WQE for the final chunk of an XLT update: widen the mkey mask
 * with the enable/access/PD/translation fields implied by @flags, and set
 * the octoword/byte counts to the (possibly shorter) last chunk length.
 */
static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
				       struct mlx5r_umr_wqe *wqe,
				       struct mlx5_ib_mr *mr, struct ib_sge *sg,
				       unsigned int flags)
{
	bool update_pd_access, update_translation;

	if (flags & MLX5_IB_UPD_XLT_ENABLE)
		wqe->ctrl_seg.mkey_mask |= get_umr_enable_mr_mask();

	update_pd_access = flags & MLX5_IB_UPD_XLT_ENABLE ||
			   flags & MLX5_IB_UPD_XLT_PD ||
			   flags & MLX5_IB_UPD_XLT_ACCESS;

	if (update_pd_access) {
		wqe->ctrl_seg.mkey_mask |= get_umr_update_access_mask(dev);
		wqe->ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
	}

	update_translation =
		flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR;

	if (update_translation) {
		wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask();
		/* A zero-length MR is expressed via the length64 flag */
		if (!mr->ibmr.length)
			MLX5_SET(mkc, &wqe->mkey_seg, length64, 1);
	}

	/* Final chunk may be shorter than the mapped buffer */
	wqe->ctrl_seg.xlt_octowords =
		cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
	wqe->data_seg.byte_count = cpu_to_be32(sg->length);
}
666
667 static int
_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr * mr,unsigned int flags,bool dd)668 _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd)
669 {
670 size_t ent_size = dd ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt);
671 struct mlx5_ib_dev *dev = mr_to_mdev(mr);
672 struct device *ddev = &dev->mdev->pdev->dev;
673 struct mlx5r_umr_wqe wqe = {};
674 struct ib_block_iter biter;
675 struct mlx5_ksm *cur_ksm;
676 struct mlx5_mtt *cur_mtt;
677 size_t orig_sg_length;
678 size_t final_size;
679 void *curr_entry;
680 struct ib_sge sg;
681 void *entry;
682 u64 offset = 0;
683 int err = 0;
684
685 entry = mlx5r_umr_create_xlt(dev, &sg,
686 ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift),
687 ent_size, flags);
688 if (!entry)
689 return -ENOMEM;
690
691 orig_sg_length = sg.length;
692 mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
693 mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr,
694 mr->page_shift);
695 if (dd) {
696 /* Use the data direct internal kernel PD */
697 MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn);
698 cur_ksm = entry;
699 } else {
700 cur_mtt = entry;
701 }
702
703 mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);
704
705 curr_entry = entry;
706 rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) {
707 if (curr_entry == entry + sg.length) {
708 dma_sync_single_for_device(ddev, sg.addr, sg.length,
709 DMA_TO_DEVICE);
710
711 err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe,
712 true);
713 if (err)
714 goto err;
715 dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
716 DMA_TO_DEVICE);
717 offset += sg.length;
718 mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);
719 if (dd)
720 cur_ksm = entry;
721 else
722 cur_mtt = entry;
723 }
724
725 if (dd) {
726 cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter));
727 cur_ksm->key = cpu_to_be32(dev->ddr.mkey);
728 cur_ksm++;
729 curr_entry = cur_ksm;
730 } else {
731 cur_mtt->ptag =
732 cpu_to_be64(rdma_block_iter_dma_address(&biter) |
733 MLX5_IB_MTT_PRESENT);
734 if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
735 cur_mtt->ptag = 0;
736 cur_mtt++;
737 curr_entry = cur_mtt;
738 }
739 }
740
741 final_size = curr_entry - entry;
742 sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT);
743 memset(curr_entry, 0, sg.length - final_size);
744 mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
745
746 dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
747 err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);
748
749 err:
750 sg.length = orig_sg_length;
751 mlx5r_umr_unmap_free_xlt(dev, entry, &sg);
752 return err;
753 }
754
mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr * mr,unsigned int flags)755 int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags)
756 {
757 /* No invalidation flow is expected */
758 if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP))
759 return -EINVAL;
760
761 return _mlx5r_umr_update_mr_pas(mr, flags, true);
762 }
763
764 /*
765 * Send the DMA list to the HW for a normal MR using UMR.
766 * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
767 * flag may be used.
768 */
mlx5r_umr_update_mr_pas(struct mlx5_ib_mr * mr,unsigned int flags)769 int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
770 {
771 if (WARN_ON(mr->umem->is_odp))
772 return -EINVAL;
773
774 return _mlx5r_umr_update_mr_pas(mr, flags, false);
775 }
776
umr_can_use_indirect_mkey(struct mlx5_ib_dev * dev)777 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
778 {
779 return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
780 }
781
/*
 * Update a window of an ODP MR's translation table via UMR.
 *
 * @mr:         the ODP MR (WARNs and fails otherwise)
 * @idx:        first XLT entry to update
 * @npages:     number of entries to update
 * @page_shift: log2 page size written into the mkey context
 * @flags:      MLX5_IB_UPD_XLT_* flags; INDIRECT selects KLM entries
 *              (requires device support) instead of MTT entries
 *
 * The [idx, idx+npages) window is widened to MLX5_UMR_FLEX_ALIGNMENT
 * granularity, then processed in bounce-buffer-sized chunks: each chunk
 * is populated by mlx5_odp_populate_xlt(), synced, and posted with the
 * matching translation offset. The last chunk gets the finalizing
 * mask/length fixups. Returns 0 or a negative errno.
 */
int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
			 int page_shift, int flags)
{
	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
			       ? sizeof(struct mlx5_klm)
			       : sizeof(struct mlx5_mtt);
	const int page_align = MLX5_UMR_FLEX_ALIGNMENT / desc_size;
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	struct device *ddev = &dev->mdev->pdev->dev;
	const int page_mask = page_align - 1;
	struct mlx5r_umr_wqe wqe = {};
	size_t pages_mapped = 0;
	size_t pages_to_map = 0;
	size_t size_to_map = 0;
	size_t orig_sg_length;
	size_t pages_iter;
	struct ib_sge sg;
	int err = 0;
	void *xlt;

	if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
	    !umr_can_use_indirect_mkey(dev))
		return -EPERM;

	if (WARN_ON(!mr->umem->is_odp))
		return -EINVAL;

	/* UMR copies MTTs in units of MLX5_UMR_FLEX_ALIGNMENT bytes,
	 * so we need to align the offset and length accordingly
	 */
	if (idx & page_mask) {
		npages += idx & page_mask;
		idx &= ~page_mask;
	}
	pages_to_map = ALIGN(npages, page_align);

	xlt = mlx5r_umr_create_xlt(dev, &sg, npages, desc_size, flags);
	if (!xlt)
		return -ENOMEM;

	/* Entries per chunk, as dictated by the buffer we actually got */
	pages_iter = sg.length / desc_size;
	orig_sg_length = sg.length;

	if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
		struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
		size_t max_pages = ib_umem_odp_num_pages(odp) - idx;

		/* Clamp to the end of the ODP mapping */
		pages_to_map = min_t(size_t, pages_to_map, max_pages);
	}

	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, page_shift);
	mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);

	for (pages_mapped = 0;
	     pages_mapped < pages_to_map && !err;
	     pages_mapped += pages_iter, idx += pages_iter) {
		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
		size_to_map = npages * desc_size;
		/* CPU fills the bounce buffer between the two syncs */
		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
					DMA_TO_DEVICE);
		mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
		dma_sync_single_for_device(ddev, sg.addr, sg.length,
					   DMA_TO_DEVICE);
		sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT);

		if (pages_mapped + pages_iter >= pages_to_map)
			mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
		mlx5r_umr_update_offset(&wqe.ctrl_seg, idx * desc_size);
		err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);
	}
	/* Restore the full mapped length so the unmap matches the map */
	sg.length = orig_sg_length;
	mlx5r_umr_unmap_free_xlt(dev, xlt, &sg);
	return err;
}
857