1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB 2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. */ 3 4 #include <rdma/ib_umem_odp.h> 5 #include "mlx5_ib.h" 6 #include "umr.h" 7 #include "wr.h" 8 9 /* 10 * We can't use an array for xlt_emergency_page because dma_map_single doesn't 11 * work on kernel modules memory 12 */ 13 void *xlt_emergency_page; 14 static DEFINE_MUTEX(xlt_emergency_page_mutex); 15 16 static __be64 get_umr_enable_mr_mask(void) 17 { 18 u64 result; 19 20 result = MLX5_MKEY_MASK_KEY | 21 MLX5_MKEY_MASK_FREE; 22 23 return cpu_to_be64(result); 24 } 25 26 static __be64 get_umr_disable_mr_mask(void) 27 { 28 u64 result; 29 30 result = MLX5_MKEY_MASK_FREE; 31 32 return cpu_to_be64(result); 33 } 34 35 static __be64 get_umr_update_translation_mask(struct mlx5_ib_dev *dev) 36 { 37 u64 result; 38 39 result = MLX5_MKEY_MASK_LEN | 40 MLX5_MKEY_MASK_PAGE_SIZE | 41 MLX5_MKEY_MASK_START_ADDR; 42 if (MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5)) 43 result |= MLX5_MKEY_MASK_PAGE_SIZE_5; 44 45 return cpu_to_be64(result); 46 } 47 48 static __be64 get_umr_update_access_mask(struct mlx5_ib_dev *dev) 49 { 50 u64 result; 51 52 result = MLX5_MKEY_MASK_LR | 53 MLX5_MKEY_MASK_LW | 54 MLX5_MKEY_MASK_RR | 55 MLX5_MKEY_MASK_RW; 56 57 if (MLX5_CAP_GEN(dev->mdev, atomic)) 58 result |= MLX5_MKEY_MASK_A; 59 60 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)) 61 result |= MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE; 62 63 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)) 64 result |= MLX5_MKEY_MASK_RELAXED_ORDERING_READ; 65 66 return cpu_to_be64(result); 67 } 68 69 static __be64 get_umr_update_pd_mask(void) 70 { 71 u64 result; 72 73 result = MLX5_MKEY_MASK_PD; 74 75 return cpu_to_be64(result); 76 } 77 78 static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask) 79 { 80 if (mask & MLX5_MKEY_MASK_PAGE_SIZE && 81 MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) 82 return -EPERM; 83 84 if (mask & MLX5_MKEY_MASK_A && 85 MLX5_CAP_GEN(dev->mdev, 
umr_modify_atomic_disabled)) 86 return -EPERM; 87 88 if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE && 89 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)) 90 return -EPERM; 91 92 if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_READ && 93 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)) 94 return -EPERM; 95 96 return 0; 97 } 98 99 enum { 100 MAX_UMR_WR = 128, 101 }; 102 103 static int mlx5r_umr_qp_rst2rts(struct mlx5_ib_dev *dev, struct ib_qp *qp) 104 { 105 struct ib_qp_attr attr = {}; 106 int ret; 107 108 attr.qp_state = IB_QPS_INIT; 109 attr.port_num = 1; 110 ret = ib_modify_qp(qp, &attr, 111 IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT); 112 if (ret) { 113 mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); 114 return ret; 115 } 116 117 memset(&attr, 0, sizeof(attr)); 118 attr.qp_state = IB_QPS_RTR; 119 120 ret = ib_modify_qp(qp, &attr, IB_QP_STATE); 121 if (ret) { 122 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); 123 return ret; 124 } 125 126 memset(&attr, 0, sizeof(attr)); 127 attr.qp_state = IB_QPS_RTS; 128 ret = ib_modify_qp(qp, &attr, IB_QP_STATE); 129 if (ret) { 130 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); 131 return ret; 132 } 133 134 return 0; 135 } 136 137 int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) 138 { 139 struct ib_qp_init_attr init_attr = {}; 140 struct ib_cq *cq; 141 struct ib_qp *qp; 142 int ret = 0; 143 144 145 /* 146 * UMR qp is set once, never changed until device unload. 147 * Avoid taking the mutex if initialization is already done. 148 */ 149 if (dev->umrc.qp) 150 return 0; 151 152 mutex_lock(&dev->umrc.init_lock); 153 /* First user allocates the UMR resources. Skip if already allocated. 
*/ 154 if (dev->umrc.qp) 155 goto unlock; 156 157 cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); 158 if (IS_ERR(cq)) { 159 mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); 160 ret = PTR_ERR(cq); 161 goto unlock; 162 } 163 164 init_attr.send_cq = cq; 165 init_attr.recv_cq = cq; 166 init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; 167 init_attr.cap.max_send_wr = MAX_UMR_WR; 168 init_attr.cap.max_send_sge = 1; 169 init_attr.qp_type = MLX5_IB_QPT_REG_UMR; 170 init_attr.port_num = 1; 171 qp = ib_create_qp(dev->umrc.pd, &init_attr); 172 if (IS_ERR(qp)) { 173 mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); 174 ret = PTR_ERR(qp); 175 goto destroy_cq; 176 } 177 178 ret = mlx5r_umr_qp_rst2rts(dev, qp); 179 if (ret) 180 goto destroy_qp; 181 182 dev->umrc.cq = cq; 183 184 sema_init(&dev->umrc.sem, MAX_UMR_WR); 185 mutex_init(&dev->umrc.lock); 186 dev->umrc.state = MLX5_UMR_STATE_ACTIVE; 187 dev->umrc.qp = qp; 188 189 mutex_unlock(&dev->umrc.init_lock); 190 return 0; 191 192 destroy_qp: 193 ib_destroy_qp(qp); 194 destroy_cq: 195 ib_free_cq(cq); 196 unlock: 197 mutex_unlock(&dev->umrc.init_lock); 198 return ret; 199 } 200 201 void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev) 202 { 203 if (dev->umrc.state == MLX5_UMR_STATE_UNINIT) 204 return; 205 mutex_destroy(&dev->umrc.lock); 206 /* After device init, UMR cp/qp are not unset during the lifetime. 
*/ 207 ib_destroy_qp(dev->umrc.qp); 208 ib_free_cq(dev->umrc.cq); 209 } 210 211 int mlx5r_umr_init(struct mlx5_ib_dev *dev) 212 { 213 struct ib_pd *pd; 214 215 pd = ib_alloc_pd(&dev->ib_dev, 0); 216 if (IS_ERR(pd)) { 217 mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); 218 return PTR_ERR(pd); 219 } 220 dev->umrc.pd = pd; 221 222 mutex_init(&dev->umrc.init_lock); 223 224 return 0; 225 } 226 227 void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev) 228 { 229 if (!dev->umrc.pd) 230 return; 231 232 mutex_destroy(&dev->umrc.init_lock); 233 ib_dealloc_pd(dev->umrc.pd); 234 } 235 236 237 static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, 238 struct mlx5r_umr_wqe *wqe, bool with_data) 239 { 240 unsigned int wqe_size = 241 with_data ? sizeof(struct mlx5r_umr_wqe) : 242 sizeof(struct mlx5r_umr_wqe) - 243 sizeof(struct mlx5_wqe_data_seg); 244 struct mlx5_ib_dev *dev = to_mdev(ibqp->device); 245 struct mlx5_core_dev *mdev = dev->mdev; 246 struct mlx5_ib_qp *qp = to_mqp(ibqp); 247 struct mlx5_wqe_ctrl_seg *ctrl; 248 union { 249 struct ib_cqe *ib_cqe; 250 u64 wr_id; 251 } id; 252 void *cur_edge, *seg; 253 unsigned long flags; 254 unsigned int idx; 255 int size, err; 256 257 if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)) 258 return -EIO; 259 260 spin_lock_irqsave(&qp->sq.lock, flags); 261 262 err = mlx5r_begin_wqe(qp, &seg, &ctrl, &idx, &size, &cur_edge, 0, 263 cpu_to_be32(mkey), false, false); 264 if (WARN_ON(err)) 265 goto out; 266 267 qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; 268 269 mlx5r_memcpy_send_wqe(&qp->sq, &cur_edge, &seg, &size, wqe, wqe_size); 270 271 id.ib_cqe = cqe; 272 mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0, 273 MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR); 274 275 mlx5r_ring_db(qp, 1, ctrl); 276 277 out: 278 spin_unlock_irqrestore(&qp->sq.lock, flags); 279 280 return err; 281 } 282 283 static int mlx5r_umr_recover(struct mlx5_ib_dev *dev, u32 mkey, 284 struct mlx5r_umr_context 
*umr_context, 285 struct mlx5r_umr_wqe *wqe, bool with_data) 286 { 287 struct umr_common *umrc = &dev->umrc; 288 struct ib_qp_attr attr; 289 int err; 290 291 mutex_lock(&umrc->lock); 292 /* Preventing any further WRs to be sent now */ 293 if (umrc->state != MLX5_UMR_STATE_RECOVER) { 294 mlx5_ib_warn(dev, "UMR recovery encountered an unexpected state=%d\n", 295 umrc->state); 296 umrc->state = MLX5_UMR_STATE_RECOVER; 297 } 298 mutex_unlock(&umrc->lock); 299 300 /* Sending a final/barrier WR (the failed one) and wait for its completion. 301 * This will ensure that all the previous WRs got a completion before 302 * we set the QP state to RESET. 303 */ 304 err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context->cqe, wqe, 305 with_data); 306 if (err) { 307 mlx5_ib_warn(dev, "UMR recovery post send failed, err %d\n", err); 308 goto err; 309 } 310 311 /* Since the QP is in an error state, it will only receive 312 * IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier 313 * we don't care about its status. 
314 */ 315 wait_for_completion(&umr_context->done); 316 317 attr.qp_state = IB_QPS_RESET; 318 err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); 319 if (err) { 320 mlx5_ib_warn(dev, "Couldn't modify UMR QP to RESET, err=%d\n", err); 321 goto err; 322 } 323 324 err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); 325 if (err) { 326 mlx5_ib_warn(dev, "Couldn't modify UMR QP to RTS, err=%d\n", err); 327 goto err; 328 } 329 330 umrc->state = MLX5_UMR_STATE_ACTIVE; 331 return 0; 332 333 err: 334 umrc->state = MLX5_UMR_STATE_ERR; 335 return err; 336 } 337 338 static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc) 339 { 340 struct mlx5_ib_umr_context *context = 341 container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe); 342 343 context->status = wc->status; 344 complete(&context->done); 345 } 346 347 static inline void mlx5r_umr_init_context(struct mlx5r_umr_context *context) 348 { 349 context->cqe.done = mlx5r_umr_done; 350 init_completion(&context->done); 351 } 352 353 static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey, 354 struct mlx5r_umr_wqe *wqe, bool with_data) 355 { 356 struct umr_common *umrc = &dev->umrc; 357 struct mlx5r_umr_context umr_context; 358 int err; 359 360 err = umr_check_mkey_mask(dev, be64_to_cpu(wqe->ctrl_seg.mkey_mask)); 361 if (WARN_ON(err)) 362 return err; 363 364 mlx5r_umr_init_context(&umr_context); 365 366 down(&umrc->sem); 367 while (true) { 368 mutex_lock(&umrc->lock); 369 if (umrc->state == MLX5_UMR_STATE_ERR) { 370 mutex_unlock(&umrc->lock); 371 err = -EFAULT; 372 break; 373 } 374 375 if (umrc->state == MLX5_UMR_STATE_RECOVER) { 376 mutex_unlock(&umrc->lock); 377 usleep_range(3000, 5000); 378 continue; 379 } 380 381 err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe, 382 with_data); 383 mutex_unlock(&umrc->lock); 384 if (err) { 385 mlx5_ib_warn(dev, "UMR post send failed, err %d\n", 386 err); 387 break; 388 } 389 390 wait_for_completion(&umr_context.done); 391 392 if (umr_context.status == IB_WC_SUCCESS) 
393 break; 394 395 if (umr_context.status == IB_WC_WR_FLUSH_ERR) 396 continue; 397 398 WARN_ON_ONCE(1); 399 mlx5_ib_warn(dev, 400 "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n", 401 umr_context.status, mkey); 402 err = mlx5r_umr_recover(dev, mkey, &umr_context, wqe, with_data); 403 if (err) 404 mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n", 405 err); 406 err = -EFAULT; 407 break; 408 } 409 up(&umrc->sem); 410 return err; 411 } 412 413 /** 414 * mlx5r_umr_revoke_mr - Fence all DMA on the MR 415 * @mr: The MR to fence 416 * 417 * Upon return the NIC will not be doing any DMA to the pages under the MR, 418 * and any DMA in progress will be completed. Failure of this function 419 * indicates the HW has failed catastrophically. 420 */ 421 int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr) 422 { 423 struct mlx5_ib_dev *dev = mr_to_mdev(mr); 424 struct mlx5r_umr_wqe wqe = {}; 425 426 if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) 427 return 0; 428 429 wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask(); 430 wqe.ctrl_seg.mkey_mask |= get_umr_disable_mr_mask(); 431 wqe.ctrl_seg.flags |= MLX5_UMR_INLINE; 432 433 MLX5_SET(mkc, &wqe.mkey_seg, free, 1); 434 MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(dev->umrc.pd)->pdn); 435 MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff); 436 MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0, 437 mlx5_mkey_variant(mr->mmkey.key)); 438 439 return mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false); 440 } 441 442 static void mlx5r_umr_set_access_flags(struct mlx5_ib_dev *dev, 443 struct mlx5_mkey_seg *seg, 444 unsigned int access_flags) 445 { 446 bool ro_read = (access_flags & IB_ACCESS_RELAXED_ORDERING) && 447 (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) || 448 pcie_relaxed_ordering_enabled(dev->mdev->pdev)); 449 450 MLX5_SET(mkc, seg, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC)); 451 MLX5_SET(mkc, seg, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE)); 452 MLX5_SET(mkc, seg, rr, !!(access_flags & 
IB_ACCESS_REMOTE_READ)); 453 MLX5_SET(mkc, seg, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE)); 454 MLX5_SET(mkc, seg, lr, 1); 455 MLX5_SET(mkc, seg, relaxed_ordering_write, 456 !!(access_flags & IB_ACCESS_RELAXED_ORDERING)); 457 MLX5_SET(mkc, seg, relaxed_ordering_read, ro_read); 458 } 459 460 int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, 461 int access_flags) 462 { 463 struct mlx5_ib_dev *dev = mr_to_mdev(mr); 464 struct mlx5r_umr_wqe wqe = {}; 465 int err; 466 467 wqe.ctrl_seg.mkey_mask = get_umr_update_access_mask(dev); 468 wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask(); 469 wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE; 470 wqe.ctrl_seg.flags |= MLX5_UMR_INLINE; 471 472 mlx5r_umr_set_access_flags(dev, &wqe.mkey_seg, access_flags); 473 MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(pd)->pdn); 474 MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff); 475 MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0, 476 mlx5_mkey_variant(mr->mmkey.key)); 477 478 err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false); 479 if (err) 480 return err; 481 482 mr->access_flags = access_flags; 483 return 0; 484 } 485 486 #define MLX5_MAX_UMR_CHUNK \ 487 ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - MLX5_UMR_FLEX_ALIGNMENT) 488 #define MLX5_SPARE_UMR_CHUNK 0x10000 489 490 /* 491 * Allocate a temporary buffer to hold the per-page information to transfer to 492 * HW. For efficiency this should be as large as it can be, but buffer 493 * allocation failure is not allowed, so try smaller sizes. 494 */ 495 static void *mlx5r_umr_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask) 496 { 497 const size_t xlt_chunk_align = MLX5_UMR_FLEX_ALIGNMENT / ent_size; 498 size_t size; 499 void *res = NULL; 500 501 static_assert(PAGE_SIZE % MLX5_UMR_FLEX_ALIGNMENT == 0); 502 503 /* 504 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the 505 * allocation can't trigger any kind of reclaim. 
506 */ 507 might_sleep(); 508 509 gfp_mask |= __GFP_ZERO | __GFP_NORETRY; 510 511 /* 512 * If the system already has a suitable high order page then just use 513 * that, but don't try hard to create one. This max is about 1M, so a 514 * free x86 huge page will satisfy it. 515 */ 516 size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align), 517 MLX5_MAX_UMR_CHUNK); 518 *nents = size / ent_size; 519 res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, 520 get_order(size)); 521 if (res) 522 return res; 523 524 if (size > MLX5_SPARE_UMR_CHUNK) { 525 size = MLX5_SPARE_UMR_CHUNK; 526 *nents = size / ent_size; 527 res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, 528 get_order(size)); 529 if (res) 530 return res; 531 } 532 533 *nents = PAGE_SIZE / ent_size; 534 res = (void *)__get_free_page(gfp_mask); 535 if (res) 536 return res; 537 538 mutex_lock(&xlt_emergency_page_mutex); 539 memset(xlt_emergency_page, 0, PAGE_SIZE); 540 return xlt_emergency_page; 541 } 542 543 static void mlx5r_umr_free_xlt(void *xlt, size_t length) 544 { 545 if (xlt == xlt_emergency_page) { 546 mutex_unlock(&xlt_emergency_page_mutex); 547 return; 548 } 549 550 free_pages((unsigned long)xlt, get_order(length)); 551 } 552 553 static void mlx5r_umr_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt, 554 struct ib_sge *sg) 555 { 556 struct device *ddev = &dev->mdev->pdev->dev; 557 558 dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE); 559 mlx5r_umr_free_xlt(xlt, sg->length); 560 } 561 562 /* 563 * Create an XLT buffer ready for submission. 564 */ 565 static void *mlx5r_umr_create_xlt(struct mlx5_ib_dev *dev, struct ib_sge *sg, 566 size_t nents, size_t ent_size, 567 unsigned int flags) 568 { 569 struct device *ddev = &dev->mdev->pdev->dev; 570 dma_addr_t dma; 571 void *xlt; 572 573 xlt = mlx5r_umr_alloc_xlt(&nents, ent_size, 574 flags & MLX5_IB_UPD_XLT_ATOMIC ? 
GFP_ATOMIC : 575 GFP_KERNEL); 576 sg->length = nents * ent_size; 577 dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE); 578 if (dma_mapping_error(ddev, dma)) { 579 mlx5_ib_err(dev, "unable to map DMA during XLT update.\n"); 580 mlx5r_umr_free_xlt(xlt, sg->length); 581 return NULL; 582 } 583 sg->addr = dma; 584 sg->lkey = dev->umrc.pd->local_dma_lkey; 585 586 return xlt; 587 } 588 589 static void 590 mlx5r_umr_set_update_xlt_ctrl_seg(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg, 591 unsigned int flags, struct ib_sge *sg) 592 { 593 if (!(flags & MLX5_IB_UPD_XLT_ENABLE)) 594 /* fail if free */ 595 ctrl_seg->flags = MLX5_UMR_CHECK_FREE; 596 else 597 /* fail if not free */ 598 ctrl_seg->flags = MLX5_UMR_CHECK_NOT_FREE; 599 ctrl_seg->xlt_octowords = 600 cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length)); 601 } 602 603 static void mlx5r_umr_set_update_xlt_mkey_seg(struct mlx5_ib_dev *dev, 604 struct mlx5_mkey_seg *mkey_seg, 605 struct mlx5_ib_mr *mr, 606 unsigned int page_shift) 607 { 608 mlx5r_umr_set_access_flags(dev, mkey_seg, mr->access_flags); 609 MLX5_SET(mkc, mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn); 610 MLX5_SET64(mkc, mkey_seg, start_addr, mr->ibmr.iova); 611 MLX5_SET64(mkc, mkey_seg, len, mr->ibmr.length); 612 MLX5_SET(mkc, mkey_seg, log_page_size, page_shift); 613 MLX5_SET(mkc, mkey_seg, qpn, 0xffffff); 614 MLX5_SET(mkc, mkey_seg, mkey_7_0, mlx5_mkey_variant(mr->mmkey.key)); 615 } 616 617 static void 618 mlx5r_umr_set_update_xlt_data_seg(struct mlx5_wqe_data_seg *data_seg, 619 struct ib_sge *sg) 620 { 621 data_seg->byte_count = cpu_to_be32(sg->length); 622 data_seg->lkey = cpu_to_be32(sg->lkey); 623 data_seg->addr = cpu_to_be64(sg->addr); 624 } 625 626 static void mlx5r_umr_update_offset(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg, 627 u64 offset) 628 { 629 u64 octo_offset = mlx5r_umr_get_xlt_octo(offset); 630 631 ctrl_seg->xlt_offset = cpu_to_be16(octo_offset & 0xffff); 632 ctrl_seg->xlt_offset_47_16 = cpu_to_be32(octo_offset >> 16); 633 ctrl_seg->flags |= 
MLX5_UMR_TRANSLATION_OFFSET_EN; 634 } 635 636 static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev, 637 struct mlx5r_umr_wqe *wqe, 638 struct mlx5_ib_mr *mr, struct ib_sge *sg, 639 unsigned int flags) 640 { 641 bool update_pd_access, update_translation; 642 643 if (flags & MLX5_IB_UPD_XLT_ENABLE) 644 wqe->ctrl_seg.mkey_mask |= get_umr_enable_mr_mask(); 645 646 update_pd_access = flags & MLX5_IB_UPD_XLT_ENABLE || 647 flags & MLX5_IB_UPD_XLT_PD || 648 flags & MLX5_IB_UPD_XLT_ACCESS; 649 650 if (update_pd_access) { 651 wqe->ctrl_seg.mkey_mask |= get_umr_update_access_mask(dev); 652 wqe->ctrl_seg.mkey_mask |= get_umr_update_pd_mask(); 653 } 654 655 update_translation = 656 flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR; 657 658 if (update_translation) { 659 wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask(dev); 660 if (!mr->ibmr.length) 661 MLX5_SET(mkc, &wqe->mkey_seg, length64, 1); 662 if (flags & MLX5_IB_UPD_XLT_KEEP_PGSZ) 663 wqe->ctrl_seg.mkey_mask &= 664 cpu_to_be64(~MLX5_MKEY_MASK_PAGE_SIZE); 665 } 666 667 wqe->ctrl_seg.xlt_octowords = 668 cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length)); 669 wqe->data_seg.byte_count = cpu_to_be32(sg->length); 670 } 671 672 static void 673 _mlx5r_umr_init_wqe(struct mlx5_ib_mr *mr, struct mlx5r_umr_wqe *wqe, 674 struct ib_sge *sg, unsigned int flags, 675 unsigned int page_shift, bool dd) 676 { 677 struct mlx5_ib_dev *dev = mr_to_mdev(mr); 678 679 mlx5r_umr_set_update_xlt_ctrl_seg(&wqe->ctrl_seg, flags, sg); 680 mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe->mkey_seg, mr, page_shift); 681 if (dd) /* Use the data direct internal kernel PD */ 682 MLX5_SET(mkc, &wqe->mkey_seg, pd, dev->ddr.pdn); 683 mlx5r_umr_set_update_xlt_data_seg(&wqe->data_seg, sg); 684 } 685 686 static int 687 _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd, 688 size_t start_block, size_t nblocks) 689 { 690 size_t ent_size = dd ? 
sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt); 691 struct mlx5_ib_dev *dev = mr_to_mdev(mr); 692 struct device *ddev = &dev->mdev->pdev->dev; 693 struct mlx5r_umr_wqe wqe = {}; 694 size_t processed_blocks = 0; 695 struct ib_block_iter biter; 696 size_t cur_block_idx = 0; 697 struct mlx5_ksm *cur_ksm; 698 struct mlx5_mtt *cur_mtt; 699 size_t orig_sg_length; 700 size_t total_blocks; 701 size_t final_size; 702 void *curr_entry; 703 struct ib_sge sg; 704 void *entry; 705 u64 offset; 706 int err = 0; 707 708 total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift); 709 if (start_block > total_blocks) 710 return -EINVAL; 711 712 /* nblocks 0 means update all blocks starting from start_block */ 713 if (nblocks) 714 total_blocks = nblocks; 715 716 entry = mlx5r_umr_create_xlt(dev, &sg, total_blocks, ent_size, flags); 717 if (!entry) 718 return -ENOMEM; 719 720 orig_sg_length = sg.length; 721 722 _mlx5r_umr_init_wqe(mr, &wqe, &sg, flags, mr->page_shift, dd); 723 724 /* Set initial translation offset to start_block */ 725 offset = (u64)start_block * ent_size; 726 mlx5r_umr_update_offset(&wqe.ctrl_seg, offset); 727 728 if (dd) 729 cur_ksm = entry; 730 else 731 cur_mtt = entry; 732 733 curr_entry = entry; 734 735 rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) { 736 if (cur_block_idx < start_block) { 737 cur_block_idx++; 738 continue; 739 } 740 741 if (nblocks && processed_blocks >= nblocks) 742 break; 743 744 if (curr_entry == entry + sg.length) { 745 dma_sync_single_for_device(ddev, sg.addr, sg.length, 746 DMA_TO_DEVICE); 747 748 err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, 749 true); 750 if (err) 751 goto err; 752 dma_sync_single_for_cpu(ddev, sg.addr, sg.length, 753 DMA_TO_DEVICE); 754 offset += sg.length; 755 mlx5r_umr_update_offset(&wqe.ctrl_seg, offset); 756 if (dd) 757 cur_ksm = entry; 758 else 759 cur_mtt = entry; 760 } 761 762 if (dd) { 763 cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter)); 764 if 
(mr->access_flags & IB_ACCESS_RELAXED_ORDERING && 765 dev->ddr.mkey_ro_valid) 766 cur_ksm->key = cpu_to_be32(dev->ddr.mkey_ro); 767 else 768 cur_ksm->key = cpu_to_be32(dev->ddr.mkey); 769 if (mr->umem->is_dmabuf && 770 (flags & MLX5_IB_UPD_XLT_ZAP)) { 771 cur_ksm->va = 0; 772 cur_ksm->key = 0; 773 } 774 cur_ksm++; 775 curr_entry = cur_ksm; 776 } else { 777 cur_mtt->ptag = 778 cpu_to_be64(rdma_block_iter_dma_address(&biter) | 779 MLX5_IB_MTT_PRESENT); 780 if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) 781 cur_mtt->ptag = 0; 782 cur_mtt++; 783 curr_entry = cur_mtt; 784 } 785 786 processed_blocks++; 787 } 788 789 final_size = curr_entry - entry; 790 sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT); 791 memset(curr_entry, 0, sg.length - final_size); 792 mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags); 793 794 dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); 795 err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true); 796 797 err: 798 sg.length = orig_sg_length; 799 mlx5r_umr_unmap_free_xlt(dev, entry, &sg); 800 return err; 801 } 802 803 int mlx5r_umr_update_data_direct_ksm_pas_range(struct mlx5_ib_mr *mr, 804 unsigned int flags, 805 size_t start_block, 806 size_t nblocks) 807 { 808 /* No invalidation flow is expected */ 809 if (WARN_ON(!mr->umem->is_dmabuf) || ((flags & MLX5_IB_UPD_XLT_ZAP) && 810 !(flags & MLX5_IB_UPD_XLT_KEEP_PGSZ))) 811 return -EINVAL; 812 813 return _mlx5r_umr_update_mr_pas(mr, flags, true, start_block, nblocks); 814 } 815 816 int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, 817 unsigned int flags) 818 { 819 return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags, 0, 0); 820 } 821 822 int mlx5r_umr_update_mr_pas_range(struct mlx5_ib_mr *mr, unsigned int flags, 823 size_t start_block, size_t nblocks) 824 { 825 if (WARN_ON(mr->umem->is_odp)) 826 return -EINVAL; 827 828 return _mlx5r_umr_update_mr_pas(mr, flags, false, start_block, nblocks); 829 } 830 831 /* 832 * Send the DMA list 
to the HW for a normal MR using UMR. 833 * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP 834 * flag may be used. 835 */ 836 int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) 837 { 838 return mlx5r_umr_update_mr_pas_range(mr, flags, 0, 0); 839 } 840 841 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) 842 { 843 return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled); 844 } 845 846 int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, 847 int page_shift, int flags) 848 { 849 int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT) 850 ? sizeof(struct mlx5_klm) 851 : sizeof(struct mlx5_mtt); 852 const int page_align = MLX5_UMR_FLEX_ALIGNMENT / desc_size; 853 struct mlx5_ib_dev *dev = mr_to_mdev(mr); 854 struct device *ddev = &dev->mdev->pdev->dev; 855 const int page_mask = page_align - 1; 856 struct mlx5r_umr_wqe wqe = {}; 857 size_t pages_mapped = 0; 858 size_t pages_to_map = 0; 859 size_t size_to_map = 0; 860 size_t orig_sg_length; 861 size_t pages_iter; 862 struct ib_sge sg; 863 int err = 0; 864 void *xlt; 865 866 if ((flags & MLX5_IB_UPD_XLT_INDIRECT) && 867 !umr_can_use_indirect_mkey(dev)) 868 return -EPERM; 869 870 if (WARN_ON(!mr->umem->is_odp)) 871 return -EINVAL; 872 873 /* UMR copies MTTs in units of MLX5_UMR_FLEX_ALIGNMENT bytes, 874 * so we need to align the offset and length accordingly 875 */ 876 if (idx & page_mask) { 877 npages += idx & page_mask; 878 idx &= ~page_mask; 879 } 880 pages_to_map = ALIGN(npages, page_align); 881 882 xlt = mlx5r_umr_create_xlt(dev, &sg, npages, desc_size, flags); 883 if (!xlt) 884 return -ENOMEM; 885 886 pages_iter = sg.length / desc_size; 887 orig_sg_length = sg.length; 888 889 if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) { 890 struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); 891 size_t max_pages = ib_umem_odp_num_pages(odp) - idx; 892 893 pages_to_map = min_t(size_t, pages_to_map, max_pages); 894 } 895 896 
mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg); 897 mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, page_shift); 898 mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg); 899 900 for (pages_mapped = 0; 901 pages_mapped < pages_to_map && !err; 902 pages_mapped += pages_iter, idx += pages_iter) { 903 npages = min_t(int, pages_iter, pages_to_map - pages_mapped); 904 size_to_map = npages * desc_size; 905 dma_sync_single_for_cpu(ddev, sg.addr, sg.length, 906 DMA_TO_DEVICE); 907 /* 908 * npages is the maximum number of pages to map, but we 909 * can't guarantee that all pages are actually mapped. 910 * 911 * For example, if page is p2p of type which is not supported 912 * for mapping, the number of pages mapped will be less than 913 * requested. 914 */ 915 err = mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); 916 if (err) 917 return err; 918 dma_sync_single_for_device(ddev, sg.addr, sg.length, 919 DMA_TO_DEVICE); 920 sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT); 921 922 if (pages_mapped + pages_iter >= pages_to_map) 923 mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags); 924 mlx5r_umr_update_offset(&wqe.ctrl_seg, idx * desc_size); 925 err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true); 926 } 927 sg.length = orig_sg_length; 928 mlx5r_umr_unmap_free_xlt(dev, xlt, &sg); 929 return err; 930 } 931 932 /* 933 * Update only the page-size (log_page_size) field of an existing memory key 934 * using UMR. This is useful when the MR's physical layout stays the same 935 * but the optimal page shift has changed (e.g. dmabuf after pages are 936 * pinned and the HW can switch from 4K to huge-page alignment). 
937 */ 938 int mlx5r_umr_update_mr_page_shift(struct mlx5_ib_mr *mr, 939 unsigned int page_shift, 940 bool dd) 941 { 942 struct mlx5_ib_dev *dev = mr_to_mdev(mr); 943 struct mlx5r_umr_wqe wqe = {}; 944 int err; 945 946 /* Build UMR wqe: we touch only PAGE_SIZE, so use the dedicated mask */ 947 wqe.ctrl_seg.mkey_mask = get_umr_update_translation_mask(dev); 948 949 /* MR must be free while page size is modified */ 950 wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE | MLX5_UMR_INLINE; 951 952 /* Fill mkey segment with the new page size, keep the rest unchanged */ 953 MLX5_SET(mkc, &wqe.mkey_seg, log_page_size, page_shift); 954 955 if (dd) 956 MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn); 957 else 958 MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn); 959 960 MLX5_SET64(mkc, &wqe.mkey_seg, start_addr, mr->ibmr.iova); 961 MLX5_SET64(mkc, &wqe.mkey_seg, len, mr->ibmr.length); 962 MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff); 963 MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0, 964 mlx5_mkey_variant(mr->mmkey.key)); 965 966 err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false); 967 if (!err) 968 mr->page_shift = page_shift; 969 970 return err; 971 } 972 973 static inline int 974 _mlx5r_dmabuf_umr_update_pas(struct mlx5_ib_mr *mr, unsigned int flags, 975 size_t start_block, size_t nblocks, bool dd) 976 { 977 if (dd) 978 return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags, 979 start_block, 980 nblocks); 981 else 982 return mlx5r_umr_update_mr_pas_range(mr, flags, start_block, 983 nblocks); 984 } 985 986 /** 987 * This function makes an mkey non-present by zapping the translation entries of 988 * the mkey by zapping (zeroing out) the first N entries, where N is determined 989 * by the largest page size supported by the device and the MR length. 990 * It then updates the mkey's page size to the largest possible value, ensuring 991 * the MR is completely non-present and safe for further updates. 
 * It is useful to update the page size of a dmabuf MR on a page fault.
 *
 * On success mr->page_shift is left at the large page shift chosen here when
 * only a prefix of the entries was zapped; on failure it is restored to the
 * value it had on entry.
 *
 * Return: 0 on success, with *nblocks set to the number of leading entries
 * that were zapped, or to 0 when the whole mkey was walked at its current
 * page size instead. On error, returns a negative error code.
 */
static int _mlx5r_umr_zap_mkey(struct mlx5_ib_mr *mr,
			       unsigned int flags,
			       unsigned int page_shift,
			       size_t *nblocks,
			       bool dd)
{
	unsigned int old_page_shift = mr->page_shift;
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	unsigned int max_page_shift;
	size_t page_shift_nblocks;
	unsigned int max_log_size;
	int access_mode;
	int err;

	access_mode = dd ? MLX5_MKC_ACCESS_MODE_KSM : MLX5_MKC_ACCESS_MODE_MTT;
	flags |= MLX5_IB_UPD_XLT_KEEP_PGSZ | MLX5_IB_UPD_XLT_ZAP |
		 MLX5_IB_UPD_XLT_ATOMIC;
	max_log_size = get_max_log_entity_size_cap(dev, access_mode);
	/*
	 * Largest useful shift: enough to cover the MR length, never below the
	 * requested page_shift, capped by what the device supports.
	 */
	max_page_shift = order_base_2(mr->ibmr.length);
	max_page_shift = min(max(max_page_shift, page_shift), max_log_size);
	/* Count blocks in units of max_page_shift, we will zap exactly this
	 * many to make the whole MR non-present.
	 * Block size must be aligned to MLX5_UMR_FLEX_ALIGNMENT since it may
	 * be used as offset into the XLT later on.
	 */
	*nblocks = ib_umem_num_dma_blocks(mr->umem, 1UL << max_page_shift);
	if (dd)
		*nblocks = ALIGN(*nblocks, MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT);
	else
		*nblocks = ALIGN(*nblocks, MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT);
	page_shift_nblocks = ib_umem_num_dma_blocks(mr->umem,
						    1UL << page_shift);
	/* If the number of blocks at max possible page shift is greater than
	 * the number of blocks at the new page size, we should just go over the
	 * whole mkey entries.
	 */
	if (*nblocks >= page_shift_nblocks)
		*nblocks = 0;

	/* Make the first nblocks entries non-present without changing
	 * page size yet.
	 */
	/*
	 * The PAS update below walks the umem in units of mr->page_shift, so
	 * the shift has to be switched before it is called.
	 */
	if (*nblocks)
		mr->page_shift = max_page_shift;
	err = _mlx5r_dmabuf_umr_update_pas(mr, flags, 0, *nblocks, dd);
	if (err) {
		mr->page_shift = old_page_shift;
		return err;
	}

	/* Change page size to the max page size now that the MR is completely
	 * non-present.
	 */
	if (*nblocks) {
		err = mlx5r_umr_update_mr_page_shift(mr, max_page_shift, dd);
		if (err) {
			mr->page_shift = old_page_shift;
			return err;
		}
	}

	return 0;
}

/**
 * mlx5r_umr_dmabuf_update_pgsz - Safely update DMABUF MR page size and its
 * entries accordingly
 * @mr:        The memory region to update
 * @xlt_flags: Translation table update flags
 * @page_shift: The new (optimized) page shift to use
 *
 * This function updates the page size and mkey translation entries for a DMABUF
 * MR in a safe, multi-step process to avoid exposing partially updated mappings.
 * The update is performed in 5 steps:
 *   1. Make the first X entries non-present, while X is calculated to be
 *        minimal according to a large page shift that can be used to cover the
 *        MR length.
 *   2. Update the page size to the large supported page size
 *   3. Load the remaining N-X entries according to the (optimized) page_shift
 *   4. Update the page size according to the (optimized) page_shift
 *   5. Load the first X entries with the correct translations
 *
 * This ensures that at no point is the MR accessible with a partially updated
 * translation table, maintaining correctness and preventing access to stale or
 * inconsistent mappings.
 *
 * Returns 0 on success or a negative error code on failure.
1084 */ 1085 int mlx5r_umr_dmabuf_update_pgsz(struct mlx5_ib_mr *mr, u32 xlt_flags, 1086 unsigned int page_shift) 1087 { 1088 unsigned int old_page_shift = mr->page_shift; 1089 size_t zapped_blocks; 1090 size_t total_blocks; 1091 int err; 1092 1093 err = _mlx5r_umr_zap_mkey(mr, xlt_flags, page_shift, &zapped_blocks, 1094 mr->data_direct); 1095 if (err) 1096 return err; 1097 1098 /* _mlx5r_umr_zap_mkey already enables the mkey */ 1099 xlt_flags &= ~MLX5_IB_UPD_XLT_ENABLE; 1100 mr->page_shift = page_shift; 1101 total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift); 1102 if (zapped_blocks && zapped_blocks < total_blocks) { 1103 /* Update PAS according to the new page size but don't update 1104 * the page size in the mkey yet. 1105 */ 1106 err = _mlx5r_dmabuf_umr_update_pas( 1107 mr, 1108 xlt_flags | MLX5_IB_UPD_XLT_KEEP_PGSZ, 1109 zapped_blocks, 1110 total_blocks - zapped_blocks, 1111 mr->data_direct); 1112 if (err) 1113 goto err; 1114 } 1115 1116 err = mlx5r_umr_update_mr_page_shift(mr, mr->page_shift, 1117 mr->data_direct); 1118 if (err) 1119 goto err; 1120 err = _mlx5r_dmabuf_umr_update_pas(mr, xlt_flags, 0, zapped_blocks, 1121 mr->data_direct); 1122 if (err) 1123 goto err; 1124 1125 return 0; 1126 err: 1127 mr->page_shift = old_page_shift; 1128 return err; 1129 } 1130