/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <linux/kernel.h>

#include "mlx5_ib.h"
#include "cmd.h"

#include <linux/mlx5/eq.h>

/* Contains the details of a pagefault. */
struct mlx5_pagefault {
	u32			bytes_committed;
	u32			token;
	u8			event_subtype;
	u8			type;
	union {
		/* Initiator or send message responder pagefault details. */
		struct {
			/* Received packet size, only valid for responders. */
			u32	packet_size;
			/*
			 * Number of resource holding WQE, depends on type.
			 */
			u32	wq_num;
			/*
			 * WQE index. Refers to either the send queue or
			 * receive queue, according to event_subtype.
			 */
			u16	wqe_index;
		} wqe;
		/* RDMA responder pagefault details */
		struct {
			u32	r_key;
			/*
			 * Received packet size, minimal size page fault
			 * resolution required for forward progress.
			 */
			u32	packet_size;
			u32	rdma_op_len;
			u64	rdma_va;
		} rdma;
	};

	struct mlx5_ib_pf_eq	*eq;
	struct work_struct	work;
};

#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault.
 */
#define MMU_NOTIFIER_TIMEOUT 1000

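/*
 * Implicit (indirect) MRs are built out of 2^30-byte (1GB) children: each
 * KSM entry in the parent MR points at a child MTT MR that covers
 * MLX5_IMR_MTT_ENTRIES pages of PAGE_SIZE bytes each.
 */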
*/ 84 #define MMU_NOTIFIER_TIMEOUT 1000 85 86 #define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT) 87 #define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT) 88 #define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS) 89 #define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT) 90 #define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1)) 91 92 #define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT 93 94 static u64 mlx5_imr_ksm_entries; 95 96 static int check_parent(struct ib_umem_odp *odp, 97 struct mlx5_ib_mr *parent) 98 { 99 struct mlx5_ib_mr *mr = odp->private; 100 101 return mr && mr->parent == parent && !odp->dying; 102 } 103 104 struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr) 105 { 106 if (WARN_ON(!mr || !mr->umem || !mr->umem->is_odp)) 107 return NULL; 108 109 return to_ib_umem_odp(mr->umem)->per_mm; 110 } 111 112 static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) 113 { 114 struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; 115 struct ib_ucontext_per_mm *per_mm = odp->per_mm; 116 struct rb_node *rb; 117 118 down_read(&per_mm->umem_rwsem); 119 while (1) { 120 rb = rb_next(&odp->interval_tree.rb); 121 if (!rb) 122 goto not_found; 123 odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); 124 if (check_parent(odp, parent)) 125 goto end; 126 } 127 not_found: 128 odp = NULL; 129 end: 130 up_read(&per_mm->umem_rwsem); 131 return odp; 132 } 133 134 static struct ib_umem_odp *odp_lookup(u64 start, u64 length, 135 struct mlx5_ib_mr *parent) 136 { 137 struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(parent); 138 struct ib_umem_odp *odp; 139 struct rb_node *rb; 140 141 down_read(&per_mm->umem_rwsem); 142 odp = rbt_ib_umem_lookup(&per_mm->umem_tree, start, length); 143 if (!odp) 144 goto end; 145 146 while (1) { 147 if (check_parent(odp, parent)) 148 goto end; 149 rb = rb_next(&odp->interval_tree.rb); 150 if (!rb) 151 goto not_found; 152 odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); 153 if (ib_umem_start(&odp->umem) > start + length) 154 goto not_found; 155 } 156 not_found: 157 odp = NULL; 158 end: 159 up_read(&per_mm->umem_rwsem); 160 return odp; 161 } 162 163 void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, 164 size_t nentries, struct mlx5_ib_mr *mr, int flags) 165 { 166 struct ib_pd *pd = mr->ibmr.pd; 167 struct mlx5_ib_dev *dev = to_mdev(pd->device); 168 struct ib_umem_odp *odp; 169 unsigned long va; 170 int i; 171 172 if (flags & MLX5_IB_UPD_XLT_ZAP) { 173 for (i = 0; i < nentries; i++, pklm++) { 174 pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); 175 pklm->key = cpu_to_be32(dev->null_mkey); 176 pklm->va = 0; 177 } 178 return; 179 } 180 181 odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE, 182 nentries * MLX5_IMR_MTT_SIZE, mr); 183 184 for (i = 0; i < nentries; i++, pklm++) { 185 pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); 186 va = (offset + i) * MLX5_IMR_MTT_SIZE; 187 if (odp && odp->umem.address == va) { 188 struct mlx5_ib_mr *mtt = odp->private; 189 190 pklm->key = cpu_to_be32(mtt->ibmr.lkey); 191 odp = odp_next(odp); 192 } else { 193 pklm->key = cpu_to_be32(dev->null_mkey); 194 } 195 mlx5_ib_dbg(dev, "[%d] va %lx key %x\n", 196 i, va, be32_to_cpu(pklm->key)); 197 } 198 } 199 200 static void mr_leaf_free_action(struct work_struct *work) 201 { 202 struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); 203 int idx = ib_umem_start(&odp->umem) >> MLX5_IMR_MTT_SHIFT; 204 struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; 205 206 mr->parent = NULL; 207 synchronize_srcu(&mr->dev->mr_srcu); 208 209 ib_umem_release(&odp->umem); 210 if 
	if (imr->live)
		mlx5_ib_update_xlt(imr, idx, 1, 0,
				   MLX5_IB_UPD_XLT_INDIRECT |
				   MLX5_IB_UPD_XLT_ATOMIC);
	mlx5_mr_cache_free(mr->dev, mr);

	if (atomic_dec_and_test(&imr->num_leaf_free))
		wake_up(&imr->q_leaf_free);
}

void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
			      unsigned long end)
{
	struct mlx5_ib_mr *mr;
	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
				    sizeof(struct mlx5_mtt)) - 1;
	u64 idx = 0, blk_start_idx = 0;
	struct ib_umem *umem;
	int in_block = 0;
	u64 addr;

	if (!umem_odp) {
		pr_err("invalidation called on NULL umem or non-ODP umem\n");
		return;
	}
	umem = &umem_odp->umem;

	mr = umem_odp->private;

	if (!mr || !mr->ibmr.pd)
		return;

	start = max_t(u64, ib_umem_start(umem), start);
	end = min_t(u64, ib_umem_end(umem), end);

	/*
	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
	 * while we are doing the invalidation, no page fault will attempt to
	 * overwrite the same MTTs. Concurrent invalidations might race us,
	 * but they will write 0s as well, so no difference in the end result.
	 */

	for (addr = start; addr < end; addr += BIT(umem->page_shift)) {
		idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
		/*
		 * Strive to write the MTTs in chunks, but avoid overwriting
		 * non-existing MTTs. The heuristic here can be improved to
		 * estimate the cost of another UMR vs. the cost of a bigger
		 * UMR.
		 */
		if (umem_odp->dma_list[idx] &
		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}
		} else {
			u64 umr_offset = idx & umr_block_mask;

			if (in_block && umr_offset == 0) {
				mlx5_ib_update_xlt(mr, blk_start_idx,
						   idx - blk_start_idx, 0,
						   MLX5_IB_UPD_XLT_ZAP |
						   MLX5_IB_UPD_XLT_ATOMIC);
				in_block = 0;
			}
		}
	}
	if (in_block)
		mlx5_ib_update_xlt(mr, blk_start_idx,
				   idx - blk_start_idx + 1, 0,
				   MLX5_IB_UPD_XLT_ZAP |
				   MLX5_IB_UPD_XLT_ATOMIC);
	/*
	 * We are now sure that the device will not access the
	 * memory. We can safely unmap it, and mark it as dirty if
	 * needed.
	 */

	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);

	if (unlikely(!umem->npages && mr->parent &&
		     !umem_odp->dying)) {
		WRITE_ONCE(umem_odp->dying, 1);
		atomic_inc(&mr->parent->num_leaf_free);
		schedule_work(&umem_odp->work);
	}
}

void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
	struct ib_odp_caps *caps = &dev->odp_caps;

	memset(caps, 0, sizeof(*caps));

	if (!MLX5_CAP_GEN(dev->mdev, pg))
		return;

	caps->general_caps = IB_ODP_SUPPORT;

	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		dev->odp_max_size = U64_MAX;
	else
		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;

	return;
}

static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
				      struct mlx5_pagefault *pfault,
				      int error)
{
	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
		     pfault->wqe.wq_num : pfault->token;
	u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = { };
	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = { };
	int err;

	MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
	MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
	MLX5_SET(page_fault_resume_in, in, token, pfault->token);
	MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
	MLX5_SET(page_fault_resume_in, in, error, !!error);

	err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
	if (err)
		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
			    wq_num, err);
}

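/*
 * Allocate an MR from the MR cache for implicit ODP use: either a KSM-based
 * parent whose entries span mlx5_imr_ksm_entries children, or an MTT-based
 * child covering a single MLX5_IMR_MTT_SIZE region. The translation entries
 * start out zapped and are populated later by the page fault handler.
 */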
static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
					    struct ib_umem *umem,
					    bool ksm, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr;
	int err;

	mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY :
					    MLX5_IMR_MTT_CACHE_ENTRY);

	if (IS_ERR(mr))
		return mr;

	mr->ibmr.pd = pd;

	mr->dev = dev;
	mr->access_flags = access_flags;
	mr->mmkey.iova = 0;
	mr->umem = umem;

	if (ksm) {
		err = mlx5_ib_update_xlt(mr, 0,
					 mlx5_imr_ksm_entries,
					 MLX5_KSM_PAGE_SHIFT,
					 MLX5_IB_UPD_XLT_INDIRECT |
					 MLX5_IB_UPD_XLT_ZAP |
					 MLX5_IB_UPD_XLT_ENABLE);

	} else {
		err = mlx5_ib_update_xlt(mr, 0,
					 MLX5_IMR_MTT_ENTRIES,
					 PAGE_SHIFT,
					 MLX5_IB_UPD_XLT_ZAP |
					 MLX5_IB_UPD_XLT_ENABLE |
					 MLX5_IB_UPD_XLT_ATOMIC);
	}

	if (err)
		goto fail;

	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	mr->live = 1;

	mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
		    mr->mmkey.key, dev->mdev, mr);

	return mr;

fail:
	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
	mlx5_mr_cache_free(dev, mr);

	return ERR_PTR(err);
}

static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
						u64 io_virt, size_t bcnt)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
	struct ib_umem_odp *odp, *result = NULL;
	struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
	u64 addr = io_virt & MLX5_IMR_MTT_MASK;
	int nentries = 0, start_idx = 0, ret;
	struct mlx5_ib_mr *mtt;

	mutex_lock(&odp_mr->umem_mutex);
	odp = odp_lookup(addr, 1, mr);

	mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
		    io_virt, bcnt, addr, odp);

next_mr:
	if (likely(odp)) {
		if (nentries)
			nentries++;
	} else {
		odp = ib_alloc_odp_umem(odp_mr->per_mm, addr,
					MLX5_IMR_MTT_SIZE);
		if (IS_ERR(odp)) {
			mutex_unlock(&odp_mr->umem_mutex);
			return ERR_CAST(odp);
		}

		mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0,
					mr->access_flags);
		if (IS_ERR(mtt)) {
			mutex_unlock(&odp_mr->umem_mutex);
			ib_umem_release(&odp->umem);
			return ERR_CAST(mtt);
		}

		odp->private = mtt;
		mtt->umem = &odp->umem;
		mtt->mmkey.iova = addr;
		mtt->parent = mr;
		INIT_WORK(&odp->work, mr_leaf_free_action);

		if (!nentries)
			start_idx = addr >> MLX5_IMR_MTT_SHIFT;
		nentries++;
	}

	/* Return first odp if region not covered by single one */
	if (likely(!result))
		result = odp;

	addr += MLX5_IMR_MTT_SIZE;
	if (unlikely(addr < io_virt + bcnt)) {
		odp = odp_next(odp);
		if (odp && odp->umem.address != addr)
			odp = NULL;
		goto next_mr;
	}

	if (unlikely(nentries)) {
		ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0,
					 MLX5_IB_UPD_XLT_INDIRECT |
					 MLX5_IB_UPD_XLT_ATOMIC);
		if (ret) {
			mlx5_ib_err(dev, "Failed to update PAS\n");
			result = ERR_PTR(ret);
		}
	}

	mutex_unlock(&odp_mr->umem_mutex);
	return result;
}

struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
					     int access_flags)
{
	struct ib_ucontext *ctx = pd->ibpd.uobject->context;
	struct mlx5_ib_mr *imr;
	struct ib_umem *umem;

	umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0);
	if (IS_ERR(umem))
		return ERR_CAST(umem);

	imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
	if (IS_ERR(imr)) {
		ib_umem_release(umem);
		return ERR_CAST(imr);
	}

	imr->umem = umem;
	init_waitqueue_head(&imr->q_leaf_free);
	atomic_set(&imr->num_leaf_free, 0);

	return imr;
}

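/*
 * Per-leaf callback used by mlx5_ib_free_implicit_mr() below: unmap the
 * child's DMA mappings and, unless it is already dying, mark it dying and
 * schedule mr_leaf_free_action() to release it.
 */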
static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end,
			void *cookie)
{
	struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie;
	struct ib_umem *umem = &umem_odp->umem;

	if (mr->parent != imr)
		return 0;

	ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem),
				    ib_umem_end(umem));

	if (umem_odp->dying)
		return 0;

	WRITE_ONCE(umem_odp->dying, 1);
	atomic_inc(&imr->num_leaf_free);
	schedule_work(&umem_odp->work);

	return 0;
}

void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
{
	struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);

	down_read(&per_mm->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX,
				      mr_leaf_free, true, imr);
	up_read(&per_mm->umem_rwsem);

	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
}

static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			u64 io_virt, size_t bcnt, u32 *bytes_mapped)
{
	struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
	u64 access_mask = ODP_READ_ALLOWED_BIT;
	int npages = 0, page_shift, np;
	u64 start_idx, page_mask;
	struct ib_umem_odp *odp;
	int current_seq;
	size_t size;
	int ret;

	if (!odp_mr->page_list) {
		odp = implicit_mr_get_data(mr, io_virt, bcnt);

		if (IS_ERR(odp))
			return PTR_ERR(odp);
		mr = odp->private;

	} else {
		odp = odp_mr;
	}

next_mr:
	size = min_t(size_t, bcnt, ib_umem_end(&odp->umem) - io_virt);

	page_shift = mr->umem->page_shift;
	page_mask = ~(BIT(page_shift) - 1);
	start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;

	if (mr->umem->writable)
		access_mask |= ODP_WRITE_ALLOWED_BIT;

	current_seq = READ_ONCE(odp->notifiers_seq);
	/*
	 * Ensure the sequence number is valid for some time before we call
	 * gup.
	 */
	smp_rmb();

	ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size,
					access_mask, current_seq);

	if (ret < 0)
		goto out;

	np = ret;

	mutex_lock(&odp->umem_mutex);
	if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem),
					current_seq)) {
		/*
		 * No need to check whether the MTTs really belong to
		 * this MR, since ib_umem_odp_map_dma_pages already
		 * checks this.
		 */
		ret = mlx5_ib_update_xlt(mr, start_idx, np,
					 page_shift, MLX5_IB_UPD_XLT_ATOMIC);
	} else {
		ret = -EAGAIN;
	}
	mutex_unlock(&odp->umem_mutex);

	if (ret < 0) {
		if (ret != -EAGAIN)
			mlx5_ib_err(dev, "Failed to update mkey page tables\n");
		goto out;
	}

	if (bytes_mapped) {
		u32 new_mappings = (np << page_shift) -
			(io_virt - round_down(io_virt, 1 << page_shift));
		*bytes_mapped += min_t(u32, new_mappings, size);
	}

	npages += np << (page_shift - PAGE_SHIFT);
	bcnt -= size;

	if (unlikely(bcnt)) {
		struct ib_umem_odp *next;

		io_virt += size;
		next = odp_next(odp);
		if (unlikely(!next || next->umem.address != io_virt)) {
			mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
				    io_virt, next);
			return -EAGAIN;
		}
		odp = next;
		mr = odp->private;
		goto next_mr;
	}

	return npages;

out:
	if (ret == -EAGAIN) {
		if (mr->parent || !odp->dying) {
			unsigned long timeout =
				msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);

			if (!wait_for_completion_timeout(
					&odp->notifier_completion,
					timeout)) {
				mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
					     current_seq, odp->notifiers_seq, odp->notifiers_count);
			}
		} else {
			/* The MR is being killed, kill the QP as well. */
			ret = -EFAULT;
		}
	}

	return ret;
}

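/*
 * Stack frame used by pagefault_single_data_segment() to walk indirect
 * mkeys (memory windows) iteratively instead of recursing.
 */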
struct pf_frame {
	struct pf_frame *next;
	u32 key;
	u64 io_virt;
	size_t bcnt;
	int depth;
};

/*
 * Handle a single data segment in a page-fault WQE or RDMA region.
 *
 * Returns number of OS pages retrieved on success. The caller may continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling.
 */
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
					 u32 key, u64 io_virt, size_t bcnt,
					 u32 *bytes_committed,
					 u32 *bytes_mapped)
{
	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
	struct pf_frame *head = NULL, *frame;
	struct mlx5_core_mkey *mmkey;
	struct mlx5_ib_mw *mw;
	struct mlx5_ib_mr *mr;
	struct mlx5_klm *pklm;
	u32 *out = NULL;
	size_t offset;

	srcu_key = srcu_read_lock(&dev->mr_srcu);

	io_virt += *bytes_committed;
	bcnt -= *bytes_committed;

next_mr:
	mmkey = __mlx5_mr_lookup(dev->mdev, mlx5_base_mkey(key));
	if (!mmkey || mmkey->key != key) {
		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
		ret = -EFAULT;
		goto srcu_unlock;
	}

	switch (mmkey->type) {
	case MLX5_MKEY_MR:
		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
		if (!mr->live || !mr->ibmr.pd) {
			mlx5_ib_dbg(dev, "got dead MR\n");
			ret = -EFAULT;
			goto srcu_unlock;
		}

		if (!mr->umem->is_odp) {
			mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
				    key);
			if (bytes_mapped)
				*bytes_mapped += bcnt;
			ret = 0;
			goto srcu_unlock;
		}

		ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped);
		if (ret < 0)
			goto srcu_unlock;

		npages += ret;
		ret = 0;
		break;

	case MLX5_MKEY_MW:
		mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);

		if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
			mlx5_ib_dbg(dev, "indirection level exceeded\n");
			ret = -EFAULT;
			goto srcu_unlock;
		}

		outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
			sizeof(*pklm) * (mw->ndescs - 2);

		if (outlen > cur_outlen) {
			kfree(out);
			out = kzalloc(outlen, GFP_KERNEL);
			if (!out) {
				ret = -ENOMEM;
				goto srcu_unlock;
			}
			cur_outlen = outlen;
		}

		pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
						       bsf0_klm0_pas_mtt0_1);

		ret = mlx5_core_query_mkey(dev->mdev, &mw->mmkey, out, outlen);
		if (ret)
			goto srcu_unlock;

		offset = io_virt - MLX5_GET64(query_mkey_out, out,
					      memory_key_mkey_entry.start_addr);

		for (i = 0; bcnt && i < mw->ndescs; i++, pklm++) {
			if (offset >= be32_to_cpu(pklm->bcount)) {
				offset -= be32_to_cpu(pklm->bcount);
				continue;
			}

			frame = kzalloc(sizeof(*frame), GFP_KERNEL);
			if (!frame) {
				ret = -ENOMEM;
				goto srcu_unlock;
			}

			frame->key = be32_to_cpu(pklm->key);
			frame->io_virt = be64_to_cpu(pklm->va) + offset;
			frame->bcnt = min_t(size_t, bcnt,
					    be32_to_cpu(pklm->bcount) - offset);
			frame->depth = depth + 1;
			frame->next = head;
			head = frame;

			bcnt -= frame->bcnt;
			offset = 0;
		}
		break;

	default:
		mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
		ret = -EFAULT;
		goto srcu_unlock;
	}

	if (head) {
		frame = head;
		head = frame->next;

		key = frame->key;
		io_virt = frame->io_virt;
		bcnt = frame->bcnt;
		depth = frame->depth;
		kfree(frame);

		goto next_mr;
	}

srcu_unlock:
	while (head) {
		frame = head;
		head = frame->next;
		kfree(frame);
	}
	kfree(out);

	srcu_read_unlock(&dev->mr_srcu, srcu_key);
	*bytes_committed = 0;
	return ret ? ret : npages;
}

/**
 * Parse a series of data segments for page fault handling.
 *
 * @qp: the QP on which the fault occurred.
 * @pfault: contains page fault information.
 * @wqe: points at the first data segment in the WQE.
 * @wqe_end: points after the end of the WQE.
 * @bytes_mapped: receives the number of bytes that the function was able to
 *                map. This allows the caller to decide intelligently whether
 *                enough memory was mapped to resolve the page fault
 *                successfully (e.g. enough for the next MTU, or the entire
 *                WQE).
 * @total_wqe_bytes: receives the total data size of this WQE in bytes (minus
 *                   the committed bytes).
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
				   struct mlx5_pagefault *pfault,
				   struct mlx5_ib_qp *qp, void *wqe,
				   void *wqe_end, u32 *bytes_mapped,
				   u32 *total_wqe_bytes, int receive_queue)
{
	int ret = 0, npages = 0;
	u64 io_virt;
	u32 key;
	u32 byte_count;
	size_t bcnt;
	int inline_segment;

	/* Skip SRQ next-WQE segment. */
	if (receive_queue && qp->ibqp.srq)
		wqe += sizeof(struct mlx5_wqe_srq_next_seg);

	if (bytes_mapped)
		*bytes_mapped = 0;
	if (total_wqe_bytes)
		*total_wqe_bytes = 0;

	while (wqe < wqe_end) {
		struct mlx5_wqe_data_seg *dseg = wqe;

		io_virt = be64_to_cpu(dseg->addr);
		key = be32_to_cpu(dseg->lkey);
		byte_count = be32_to_cpu(dseg->byte_count);
		inline_segment = !!(byte_count & MLX5_INLINE_SEG);
		bcnt = byte_count & ~MLX5_INLINE_SEG;

		if (inline_segment) {
			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
				     16);
		} else {
			wqe += sizeof(*dseg);
		}

		/* receive WQE end of sg list. */
		if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
		    io_virt == 0)
			break;

		if (!inline_segment && total_wqe_bytes) {
			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
					pfault->bytes_committed);
		}

		/* A zero length data segment designates a length of 2GB. */
		if (bcnt == 0)
			bcnt = 1U << 31;

		if (inline_segment || bcnt <= pfault->bytes_committed) {
			pfault->bytes_committed -=
				min_t(size_t, bcnt,
				      pfault->bytes_committed);
			continue;
		}

		ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
						    &pfault->bytes_committed,
						    bytes_mapped);
		if (ret < 0)
			break;
		npages += ret;
	}

	return ret < 0 ? ret : npages;
}

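/*
 * Map initiator WQE opcodes to the ODP capability bit required for that
 * operation; mlx5_ib_mr_initiator_pfault_handler() uses this to reject
 * faults on opcodes the transport does not support.
 */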
static const u32 mlx5_ib_odp_opcode_cap[] = {
	[MLX5_OPCODE_SEND]	       = IB_ODP_SUPPORT_SEND,
	[MLX5_OPCODE_SEND_IMM]	       = IB_ODP_SUPPORT_SEND,
	[MLX5_OPCODE_SEND_INVAL]       = IB_ODP_SUPPORT_SEND,
	[MLX5_OPCODE_RDMA_WRITE]       = IB_ODP_SUPPORT_WRITE,
	[MLX5_OPCODE_RDMA_WRITE_IMM]   = IB_ODP_SUPPORT_WRITE,
	[MLX5_OPCODE_RDMA_READ]	       = IB_ODP_SUPPORT_READ,
	[MLX5_OPCODE_ATOMIC_CS]	       = IB_ODP_SUPPORT_ATOMIC,
	[MLX5_OPCODE_ATOMIC_FA]	       = IB_ODP_SUPPORT_ATOMIC,
};

/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
	u16 wqe_index = pfault->wqe.wqe_index;
	u32 transport_caps;
	struct mlx5_base_av *av;
	unsigned ds, opcode;
#if defined(DEBUG)
	u32 ctrl_wqe_index, ctrl_qpn;
#endif
	u32 qpn = qp->trans_qp.base.mqp.qpn;

	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
			    ds, wqe_length);
		return -EFAULT;
	}

	if (ds == 0) {
		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
			    wqe_index, qpn);
		return -EFAULT;
	}

#if defined(DEBUG)
	ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
			  MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
			 MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
	if (wqe_index != ctrl_wqe_index) {
		mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
			    wqe_index, qpn,
			    ctrl_wqe_index);
		return -EFAULT;
	}

	ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
		   MLX5_WQE_CTRL_QPN_SHIFT;
	if (qpn != ctrl_qpn) {
		mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
			    wqe_index, qpn,
			    ctrl_qpn);
		return -EFAULT;
	}
#endif /* DEBUG */

	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
	*wqe += sizeof(*ctrl);

	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
		 MLX5_WQE_CTRL_OPCODE_MASK;

	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
		transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps;
		break;
	case IB_QPT_UD:
		transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps;
		break;
	default:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n",
			    qp->ibqp.qp_type);
		return -EFAULT;
	}

	if (unlikely(opcode >= ARRAY_SIZE(mlx5_ib_odp_opcode_cap) ||
		     !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) {
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n",
			    opcode);
		return -EFAULT;
	}

	if (qp->ibqp.qp_type != IB_QPT_RC) {
		av = *wqe;
		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
			*wqe += sizeof(struct mlx5_av);
		else
			*wqe += sizeof(struct mlx5_base_av);
	}

	switch (opcode) {
	case MLX5_OPCODE_RDMA_WRITE:
	case MLX5_OPCODE_RDMA_WRITE_IMM:
	case MLX5_OPCODE_RDMA_READ:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		break;
	case MLX5_OPCODE_ATOMIC_CS:
	case MLX5_OPCODE_ATOMIC_FA:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		*wqe += sizeof(struct mlx5_wqe_atomic_seg);
		break;
	}

	return 0;
}

/*
 * Parse responder WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler(
	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_ib_wq *wq = &qp->rq;
	int wqe_size = 1 << wq->wqe_shift;

	if (qp->ibqp.srq) {
		mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
		return -EFAULT;
	}

	if (qp->wq_sig) {
		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
		return -EFAULT;
	}

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
		if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
		      IB_ODP_SUPPORT_RECV))
			goto invalid_transport_or_opcode;
		break;
	default:
invalid_transport_or_opcode:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
			    qp->ibqp.qp_type);
		return -EFAULT;
	}

	*wqe_end = *wqe + wqe_size;

	return 0;
}

static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
						       u32 wq_num, int pf_type)
{
	enum mlx5_res_type res_type;

	switch (pf_type) {
	case MLX5_WQE_PF_TYPE_RMP:
		res_type = MLX5_RES_SRQ;
		break;
	case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
	case MLX5_WQE_PF_TYPE_RESP:
	case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
		res_type = MLX5_RES_QP;
		break;
	default:
		return NULL;
	}

	return mlx5_core_res_hold(dev->mdev, wq_num, res_type);
}

static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
{
	struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res;

	return to_mibqp(mqp);
}

static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
					  struct mlx5_pagefault *pfault)
{
	int ret;
	void *wqe, *wqe_end;
	u32 bytes_mapped, total_wqe_bytes;
	char *buffer = NULL;
	int resume_with_error = 1;
	u16 wqe_index = pfault->wqe.wqe_index;
	int requestor = pfault->type & MLX5_PFAULT_REQUESTOR;
	struct mlx5_core_rsc_common *res;
	struct mlx5_ib_qp *qp;

	res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
	if (!res) {
		mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
		return;
	}

	switch (res->res) {
	case MLX5_RES_QP:
		qp = res_to_qp(res);
		break;
	default:
		mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n", pfault->type);
		goto resolve_page_fault;
	}

	buffer = (char *)__get_free_page(GFP_KERNEL);
	if (!buffer) {
		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
		goto resolve_page_fault;
	}

	ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
				    PAGE_SIZE, &qp->trans_qp.base);
	if (ret < 0) {
		mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n",
			    ret, wqe_index, pfault->token);
		goto resolve_page_fault;
	}

	wqe = buffer;
	if (requestor)
		ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe,
							  &wqe_end, ret);
	else
		ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe,
							  &wqe_end, ret);
	if (ret < 0)
		goto resolve_page_fault;

	if (wqe >= wqe_end) {
		mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
		goto resolve_page_fault;
	}

	ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end,
				      &bytes_mapped, &total_wqe_bytes,
				      !requestor);
	if (ret == -EAGAIN) {
		resume_with_error = 0;
		goto resolve_page_fault;
	} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
		goto resolve_page_fault;
	}

	resume_with_error = 0;
resolve_page_fault:
	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
		    pfault->wqe.wq_num, resume_with_error,
		    pfault->type);
	mlx5_core_res_put(res);
	free_page((unsigned long)buffer);
}

static int pages_in_range(u64 address, u32 length)
{
	return (ALIGN(address + length, PAGE_SIZE) -
		(address & PAGE_MASK)) >> PAGE_SHIFT;
}

static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
					   struct mlx5_pagefault *pfault)
{
	u64 address;
	u32 length;
	u32 prefetch_len = pfault->bytes_committed;
	int prefetch_activated = 0;
	u32 rkey = pfault->rdma.r_key;
	int ret;

	/* The RDMA responder handler handles the page fault in two parts.
	 * First it brings the necessary pages for the current packet
	 * (and uses the pfault context), and then (after resuming the QP)
	 * prefetches more pages. The second operation cannot use the pfault
	 * context and therefore uses the dummy_pfault context allocated on
	 * the stack */
	pfault->rdma.rdma_va += pfault->bytes_committed;
	pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
					pfault->rdma.rdma_op_len);
	pfault->bytes_committed = 0;

	address = pfault->rdma.rdma_va;
	length = pfault->rdma.rdma_op_len;

	/* For some operations, the hardware cannot tell the exact message
	 * length, and in those cases it reports zero. Use prefetch
	 * logic. */
	if (length == 0) {
		prefetch_activated = 1;
		length = pfault->rdma.packet_size;
		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
	}

	ret = pagefault_single_data_segment(dev, rkey, address, length,
					    &pfault->bytes_committed, NULL);
	if (ret == -EAGAIN) {
		/* We're racing with an invalidation, don't prefetch */
		prefetch_activated = 0;
	} else if (ret < 0 || pages_in_range(address, length) > ret) {
		mlx5_ib_page_fault_resume(dev, pfault, 1);
		if (ret != -ENOENT)
			mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
				    ret, pfault->token, pfault->type);
		return;
	}

	mlx5_ib_page_fault_resume(dev, pfault, 0);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
		    pfault->token, pfault->type,
		    prefetch_activated);

	/* At this point, there might be a new pagefault already arriving in
	 * the eq, switch to the dummy pagefault for the rest of the
	 * processing. We're still OK with the objects being alive as the
	 * work-queue is being fenced. */

	if (prefetch_activated) {
		u32 bytes_committed = 0;

		ret = pagefault_single_data_segment(dev, rkey, address,
						    prefetch_len,
						    &bytes_committed, NULL);
		if (ret < 0 && ret != -EAGAIN) {
			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
				    ret, pfault->token, address, prefetch_len);
		}
	}
}

static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
{
	u8 event_subtype = pfault->event_subtype;

	switch (event_subtype) {
	case MLX5_PFAULT_SUBTYPE_WQE:
		mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
		break;
	case MLX5_PFAULT_SUBTYPE_RDMA:
		mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
		break;
	default:
		mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
			    event_subtype);
		mlx5_ib_page_fault_resume(dev, pfault, 1);
	}
}

static void mlx5_ib_eqe_pf_action(struct work_struct *work)
{
	struct mlx5_pagefault *pfault = container_of(work,
						     struct mlx5_pagefault,
						     work);
	struct mlx5_ib_pf_eq *eq = pfault->eq;

	mlx5_ib_pfault(eq->dev, pfault);
	mempool_free(pfault, eq->pool);
}

static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
{
	struct mlx5_eqe_page_fault *pf_eqe;
	struct mlx5_pagefault *pfault;
	struct mlx5_eqe *eqe;
	int cc = 0;

	while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
		pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
		if (!pfault) {
			schedule_work(&eq->work);
			break;
		}

		pf_eqe = &eqe->data.page_fault;
		pfault->event_subtype = eqe->sub_type;
		pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);

		mlx5_ib_dbg(eq->dev,
			    "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
			    eqe->sub_type, pfault->bytes_committed);

		switch (eqe->sub_type) {
		case MLX5_PFAULT_SUBTYPE_RDMA:
			/* RDMA based event */
			pfault->type =
				be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
			pfault->token =
				be32_to_cpu(pf_eqe->rdma.pftype_token) &
				MLX5_24BIT_MASK;
			pfault->rdma.r_key =
				be32_to_cpu(pf_eqe->rdma.r_key);
			pfault->rdma.packet_size =
				be16_to_cpu(pf_eqe->rdma.packet_length);
			pfault->rdma.rdma_op_len =
				be32_to_cpu(pf_eqe->rdma.rdma_op_len);
			pfault->rdma.rdma_va =
				be64_to_cpu(pf_eqe->rdma.rdma_va);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
				    pfault->type, pfault->token,
				    pfault->rdma.r_key);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
				    pfault->rdma.rdma_op_len,
				    pfault->rdma.rdma_va);
			break;

		case MLX5_PFAULT_SUBTYPE_WQE:
			/* WQE based event */
			pfault->type =
				(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
			pfault->token =
				be32_to_cpu(pf_eqe->wqe.token);
			pfault->wqe.wq_num =
				be32_to_cpu(pf_eqe->wqe.pftype_wq) &
				MLX5_24BIT_MASK;
			pfault->wqe.wqe_index =
				be16_to_cpu(pf_eqe->wqe.wqe_index);
			pfault->wqe.packet_size =
				be16_to_cpu(pf_eqe->wqe.packet_length);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
				    pfault->type, pfault->token,
				    pfault->wqe.wq_num,
				    pfault->wqe.wqe_index);
			break;

		default:
			mlx5_ib_warn(eq->dev,
				     "Unsupported page fault event sub-type: 0x%02hhx\n",
				     eqe->sub_type);
			/* Unsupported page faults should still be
			 * resolved by the page fault handler
			 */
		}

		pfault->eq = eq;
		INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
		queue_work(eq->wq, &pfault->work);

		cc = mlx5_eq_update_cc(eq->core, ++cc);
	}

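	/* Acknowledge the EQEs we consumed and re-arm the EQ. */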
	mlx5_eq_update_ci(eq->core, cc, 1);
}

static irqreturn_t mlx5_ib_eq_pf_int(int irq, void *eq_ptr)
{
	struct mlx5_ib_pf_eq *eq = eq_ptr;
	unsigned long flags;

	if (spin_trylock_irqsave(&eq->lock, flags)) {
		mlx5_ib_eq_pf_process(eq);
		spin_unlock_irqrestore(&eq->lock, flags);
	} else {
		schedule_work(&eq->work);
	}

	return IRQ_HANDLED;
}

/* mempool_refill() was proposed but unfortunately wasn't accepted
 * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
 * Cheap workaround.
 */
static void mempool_refill(mempool_t *pool)
{
	while (pool->curr_nr < pool->min_nr)
		mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
}

static void mlx5_ib_eq_pf_action(struct work_struct *work)
{
	struct mlx5_ib_pf_eq *eq =
		container_of(work, struct mlx5_ib_pf_eq, work);

	mempool_refill(eq->pool);

	spin_lock_irq(&eq->lock);
	mlx5_ib_eq_pf_process(eq);
	spin_unlock_irq(&eq->lock);
}

enum {
	MLX5_IB_NUM_PF_EQE	= 0x1000,
	MLX5_IB_NUM_PF_DRAIN	= 64,
};

static int
mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
	struct mlx5_eq_param param = {};
	int err;

	INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
	spin_lock_init(&eq->lock);
	eq->dev = dev;

	eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
					       sizeof(struct mlx5_pagefault));
	if (!eq->pool)
		return -ENOMEM;

	eq->wq = alloc_workqueue("mlx5_ib_page_fault",
				 WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
				 MLX5_NUM_CMD_EQE);
	if (!eq->wq) {
		err = -ENOMEM;
		goto err_mempool;
	}

	param = (struct mlx5_eq_param) {
		.index = MLX5_EQ_PFAULT_IDX,
		.mask = 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
		.nent = MLX5_IB_NUM_PF_EQE,
		.context = eq,
		.handler = mlx5_ib_eq_pf_int
	};
	eq->core = mlx5_eq_create_generic(dev->mdev, "mlx5_ib_page_fault_eq", &param);
	if (IS_ERR(eq->core)) {
		err = PTR_ERR(eq->core);
		goto err_wq;
	}

	return 0;
err_wq:
	destroy_workqueue(eq->wq);
err_mempool:
	mempool_destroy(eq->pool);
	return err;
}

static int
mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
	int err;

	err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
	cancel_work_sync(&eq->work);
	destroy_workqueue(eq->wq);
	mempool_destroy(eq->pool);

	return err;
}

void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
{
	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
		return;

	switch (ent->order - 2) {
	case MLX5_IMR_MTT_CACHE_ENTRY:
		ent->page = PAGE_SHIFT;
		ent->xlt = MLX5_IMR_MTT_ENTRIES *
			   sizeof(struct mlx5_mtt) /
			   MLX5_IB_UMR_OCTOWORD;
		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
		ent->limit = 0;
		break;

	case MLX5_IMR_KSM_CACHE_ENTRY:
		ent->page = MLX5_KSM_PAGE_SHIFT;
		ent->xlt = mlx5_imr_ksm_entries *
			   sizeof(struct mlx5_klm) /
			   MLX5_IB_UMR_OCTOWORD;
		ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
		ent->limit = 0;
		break;
	}
}

int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
	int ret = 0;

	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
		if (ret) {
			mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
			return ret;
		}
	}

	if (!MLX5_CAP_GEN(dev->mdev, pg))
		return ret;

	ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);

	return ret;
}

void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
{
	if (!MLX5_CAP_GEN(dev->mdev, pg))
		return;

	mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
}

int mlx5_ib_odp_init(void)
{
	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
				       MLX5_IMR_MTT_BITS);

	return 0;
}