/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_umem_odp.h>
#include <linux/kernel.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <linux/hmm.h>
#include <linux/hmm-dma.h>
#include <linux/pci-p2pdma.h>

#include "mlx5_ib.h"
#include "cmd.h"
#include "umr.h"
#include "qp.h"

#include <linux/mlx5/eq.h>

/* Contains the details of a pagefault. */
struct mlx5_pagefault {
	u32			bytes_committed;
	u64			token;
	u8			event_subtype;
	u8			type;
	union {
		/* Initiator or send message responder pagefault details.
		 */
		struct {
			/* Received packet size, only valid for responders. */
			u32	packet_size;
			/*
			 * Number of resource holding WQE, depends on type.
			 */
			u32	wq_num;
			/*
			 * WQE index. Refers to either the send queue or
			 * receive queue, according to event_subtype.
			 */
			u16	wqe_index;
		} wqe;
		/* RDMA responder pagefault details */
		struct {
			u32	r_key;
			/*
			 * Received packet size, minimal size page fault
			 * resolution required for forward progress.
			 */
			u32	packet_size;
			u32	rdma_op_len;
			u64	rdma_va;
		} rdma;
		/* Memory scheme pagefault details (mem_page_fault cap) */
		struct {
			u64	va;
			u32	mkey;
			u32	fault_byte_count;
			u32	prefetch_before_byte_count;
			u32	prefetch_after_byte_count;
			u8	flags;
		} memory;
	};

	/* EQ that delivered this fault; work runs the resolution */
	struct mlx5_ib_pf_eq	*eq;
	struct work_struct	work;
};

#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000

/* Geometry of an implicit (on-demand) MR: number of KSM entries in the
 * top-level table, entries/size/shift of each child MTT MR, and the page
 * shift used for the KSM level. Initialized elsewhere in this file. */
static u64 mlx5_imr_ksm_entries;
static u64 mlx5_imr_mtt_entries;
static u64 mlx5_imr_mtt_size;
static u8 mlx5_imr_mtt_shift;
static u8 mlx5_imr_ksm_page_shift;

/*
 * Fill @nentries KSM entries starting at table index @idx for the implicit
 * MR @imr. With MLX5_IB_UPD_XLT_ZAP every entry points at the null mkey
 * (per-MR null mkey and per-entry VA when the device has the mem_page_fault
 * capability, the device-wide null mkey otherwise). Otherwise entries are
 * filled from the implicit_children xarray under umem_mutex.
 */
static void populate_ksm(struct mlx5_ksm *pksm, size_t idx, size_t nentries,
			 struct mlx5_ib_mr *imr, int flags)
{
	struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev;
	struct mlx5_ksm *end = pksm + nentries;
	u64 step = MLX5_CAP_ODP(dev, mem_page_fault) ? mlx5_imr_mtt_size : 0;
	__be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ?
			     cpu_to_be32(imr->null_mmkey.key) :
			     mr_to_mdev(imr)->mkeys.null_mkey;
	u64 va =
		MLX5_CAP_ODP(dev, mem_page_fault) ? idx * mlx5_imr_mtt_size : 0;

	if (flags & MLX5_IB_UPD_XLT_ZAP) {
		for (; pksm != end; pksm++, idx++, va += step) {
			pksm->key = key;
			pksm->va = cpu_to_be64(va);
		}
		return;
	}

	/*
	 * The locking here is pretty subtle.
Ideally the implicit_children 128 * xarray would be protected by the umem_mutex, however that is not 129 * possible. Instead this uses a weaker update-then-lock pattern: 130 * 131 * xa_store() 132 * mutex_lock(umem_mutex) 133 * mlx5r_umr_update_xlt() 134 * mutex_unlock(umem_mutex) 135 * destroy lkey 136 * 137 * ie any change the xarray must be followed by the locked update_xlt 138 * before destroying. 139 * 140 * The umem_mutex provides the acquire/release semantic needed to make 141 * the xa_store() visible to a racing thread. 142 */ 143 lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex); 144 145 for (; pksm != end; pksm++, idx++, va += step) { 146 struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); 147 148 if (mtt) { 149 pksm->key = cpu_to_be32(mtt->ibmr.lkey); 150 pksm->va = cpu_to_be64(idx * mlx5_imr_mtt_size); 151 } else { 152 pksm->key = key; 153 pksm->va = cpu_to_be64(va); 154 } 155 } 156 } 157 158 static int populate_mtt(__be64 *pas, size_t start, size_t nentries, 159 struct mlx5_ib_mr *mr, int flags) 160 { 161 struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); 162 bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE; 163 struct pci_p2pdma_map_state p2pdma_state = {}; 164 struct ib_device *dev = odp->umem.ibdev; 165 size_t i; 166 167 if (flags & MLX5_IB_UPD_XLT_ZAP) 168 return 0; 169 170 for (i = 0; i < nentries; i++) { 171 unsigned long pfn = odp->map.pfn_list[start + i]; 172 dma_addr_t dma_addr; 173 174 pfn = odp->map.pfn_list[start + i]; 175 if (!(pfn & HMM_PFN_VALID)) 176 /* ODP initialization */ 177 continue; 178 179 dma_addr = hmm_dma_map_pfn(dev->dma_device, &odp->map, 180 start + i, &p2pdma_state); 181 if (ib_dma_mapping_error(dev, dma_addr)) 182 return -EFAULT; 183 184 dma_addr |= MLX5_IB_MTT_READ; 185 if ((pfn & HMM_PFN_WRITE) && !downgrade) 186 dma_addr |= MLX5_IB_MTT_WRITE; 187 188 pas[i] = cpu_to_be64(dma_addr); 189 odp->npages++; 190 } 191 return 0; 192 } 193 194 int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t 
nentries, 195 struct mlx5_ib_mr *mr, int flags) 196 { 197 if (flags & MLX5_IB_UPD_XLT_INDIRECT) { 198 populate_ksm(xlt, idx, nentries, mr, flags); 199 return 0; 200 } else { 201 return populate_mtt(xlt, idx, nentries, mr, flags); 202 } 203 } 204 205 /* 206 * This must be called after the mr has been removed from implicit_children. 207 * NOTE: The MR does not necessarily have to be 208 * empty here, parallel page faults could have raced with the free process and 209 * added pages to it. 210 */ 211 static void free_implicit_child_mr_work(struct work_struct *work) 212 { 213 struct mlx5_ib_mr *mr = 214 container_of(work, struct mlx5_ib_mr, odp_destroy.work); 215 struct mlx5_ib_mr *imr = mr->parent; 216 struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); 217 struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); 218 219 mlx5r_deref_wait_odp_mkey(&mr->mmkey); 220 221 mutex_lock(&odp_imr->umem_mutex); 222 mlx5r_umr_update_xlt(mr->parent, 223 ib_umem_start(odp) >> mlx5_imr_mtt_shift, 1, 0, 224 MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC); 225 mutex_unlock(&odp_imr->umem_mutex); 226 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 227 228 mlx5r_deref_odp_mkey(&imr->mmkey); 229 } 230 231 static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) 232 { 233 struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); 234 unsigned long idx = ib_umem_start(odp) >> mlx5_imr_mtt_shift; 235 struct mlx5_ib_mr *imr = mr->parent; 236 237 /* 238 * If userspace is racing freeing the parent implicit ODP MR then we can 239 * loose the race with parent destruction. In this case 240 * mlx5_ib_free_odp_mr() will free everything in the implicit_children 241 * xarray so NOP is fine. This child MR cannot be destroyed here because 242 * we are under its umem_mutex. 
*/
	if (!refcount_inc_not_zero(&imr->mmkey.usecount))
		return;

	/* Atomically unlink the child; another thread may already have */
	xa_lock(&imr->implicit_children);
	if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) !=
	    mr) {
		xa_unlock(&imr->implicit_children);
		mlx5r_deref_odp_mkey(&imr->mmkey);
		return;
	}

	if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault))
		xa_erase(&mr_to_mdev(mr)->odp_mkeys,
			 mlx5_base_mkey(mr->mmkey.key));
	xa_unlock(&imr->implicit_children);

	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
	queue_work(system_dfl_wq, &mr->odp_destroy.work);
}

/*
 * MMU interval notifier callback: zap the HW MTTs that cover the
 * intersection of this umem with [range->start, range->end), then unmap the
 * DMA mappings. Runs under umem_mutex; returns false only when the range is
 * non-blockable (we must be able to sleep in the UMR path).
 */
static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
				     const struct mmu_notifier_range *range,
				     unsigned long cur_seq)
{
	struct ib_umem_odp *umem_odp =
		container_of(mni, struct ib_umem_odp, notifier);
	struct mlx5_ib_mr *mr;
	const u64 umr_block_mask = MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT - 1;
	u64 idx = 0, blk_start_idx = 0;
	u64 invalidations = 0;
	unsigned long start;
	unsigned long end;
	int in_block = 0;
	u64 addr;

	if (!mmu_notifier_range_blockable(range))
		return false;

	mutex_lock(&umem_odp->umem_mutex);
	mmu_interval_set_seq(mni, cur_seq);
	/*
	 * If npages is zero then umem_odp->private may not be setup yet. This
	 * does not complete until after the first page is mapped for DMA.
	 */
	if (!umem_odp->npages)
		goto out;
	mr = umem_odp->private;
	if (!mr)
		goto out;

	/* Clamp the invalidated range to this umem's extent */
	start = max_t(u64, ib_umem_start(umem_odp), range->start);
	end = min_t(u64, ib_umem_end(umem_odp), range->end);

	/*
	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
	 * while we are doing the invalidation, no page fault will attempt to
	 * overwrite the same MTTs.  Concurrent invalidations might race us,
	 * but they will write 0s as well, so no difference in the end result.
	 */
	for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
		/*
		 * Strive to write the MTTs in chunks, but avoid overwriting
		 * non-existing MTTs. The heuristic here can be improved to
		 * estimate the cost of another UMR vs. the cost of bigger
		 * UMR.
		 */
		if (umem_odp->map.pfn_list[idx] & HMM_PFN_VALID) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}
		} else {
			u64 umr_offset = idx & umr_block_mask;

			/* Flush a chunk only on a UMR alignment boundary */
			if (in_block && umr_offset == 0) {
				mlx5r_umr_update_xlt(mr, blk_start_idx,
						     idx - blk_start_idx, 0,
						     MLX5_IB_UPD_XLT_ZAP |
						     MLX5_IB_UPD_XLT_ATOMIC);
				in_block = 0;
				/* Count page invalidations */
				invalidations += idx - blk_start_idx + 1;
			}
		}
	}
	/* Flush the trailing chunk, if any */
	if (in_block) {
		mlx5r_umr_update_xlt(mr, blk_start_idx,
				     idx - blk_start_idx + 1, 0,
				     MLX5_IB_UPD_XLT_ZAP |
				     MLX5_IB_UPD_XLT_ATOMIC);
		/* Count page invalidations */
		invalidations += idx - blk_start_idx + 1;
	}

	mlx5_update_odp_stats_with_handled(mr, invalidations, invalidations);

	/*
	 * We are now sure that the device will not access the
	 * memory. We can safely unmap it, and mark it as dirty if
	 * needed.
*/

	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);

	/* A fully-emptied implicit child can be torn down */
	if (unlikely(!umem_odp->npages && mr->parent))
		destroy_unused_implicit_child_mr(mr);
out:
	mutex_unlock(&umem_odp->umem_mutex);
	return true;
}

const struct mmu_interval_notifier_ops mlx5_mn_ops = {
	.invalidate = mlx5_ib_invalidate_range,
};

/*
 * Populate dev->odp_caps (and dev->odp_max_size) from the firmware ODP
 * capability bits. Leaves the caps zeroed when paging or UMR PAS loading is
 * unsupported.
 */
static void internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
	struct ib_odp_caps *caps = &dev->odp_caps;

	memset(caps, 0, sizeof(*caps));

	if (!MLX5_CAP_GEN(dev->mdev, pg) || !mlx5r_umr_can_load_pas(dev, 0))
		return;

	caps->general_caps = IB_ODP_SUPPORT;

	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		dev->odp_max_size = U64_MAX;
	else
		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.send))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.srq_receive))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.send))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.write))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.read))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.atomic))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.srq_receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.send))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.receive))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.write))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.read))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.atomic))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.srq_receive))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	/* Implicit ODP needs fixed buffers, a null mkey, extended translation
	 * offsets, indirect mkeys, and a KSM page shift the device accepts. */
	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled) &&
	    mlx5_imr_ksm_entries != 0 &&
	    !(mlx5_imr_ksm_page_shift >
	      get_max_log_entity_size_cap(dev, MLX5_MKC_ACCESS_MODE_KSM)))
		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
}

/*
 * Tell the device to resume the faulting context via the PAGE_FAULT_RESUME
 * command; @error non-zero marks the fault as failed so the HW errors the
 * resource instead of retrying.
 */
static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
				      struct mlx5_pagefault *pfault,
				      int error)
{
	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
pfault->wqe.wq_num : pfault->token;
	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {};
	void *info;
	int err;

	MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);

	if (pfault->event_subtype == MLX5_PFAULT_SUBTYPE_MEMORY) {
		/* Memory scheme faults carry a 48-bit token split in two */
		info = MLX5_ADDR_OF(page_fault_resume_in, in,
				    page_fault_info.mem_page_fault_info);
		MLX5_SET(mem_page_fault_info, info, fault_token_31_0,
			 pfault->token & 0xffffffff);
		MLX5_SET(mem_page_fault_info, info, fault_token_47_32,
			 (pfault->token >> 32) & 0xffff);
		MLX5_SET(mem_page_fault_info, info, error, !!error);
	} else {
		info = MLX5_ADDR_OF(page_fault_resume_in, in,
				    page_fault_info.trans_page_fault_info);
		MLX5_SET(trans_page_fault_info, info, page_fault_type,
			 pfault->type);
		MLX5_SET(trans_page_fault_info, info, fault_token,
			 pfault->token);
		MLX5_SET(trans_page_fault_info, info, wq_number, wq_num);
		MLX5_SET(trans_page_fault_info, info, error, !!error);
	}

	err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in);
	if (err)
		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
			    wq_num, err);
}

/*
 * Create (or find, on a race) the implicit child MTT MR covering table index
 * @idx of implicit parent @imr. Returns the child with an elevated mkey
 * usecount, or an ERR_PTR. The caller must later update the parent's KSM
 * under umem_mutex (see populate_ksm's locking comment).
 */
static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
						unsigned long idx)
{
	struct mlx5_ib_dev *dev = mr_to_mdev(imr);
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;
	struct mlx5_ib_mr *ret;
	int err;

	odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
				      idx * mlx5_imr_mtt_size,
				      mlx5_imr_mtt_size, &mlx5_mn_ops);
	if (IS_ERR(odp))
		return ERR_CAST(odp);

	mr = mlx5_mr_cache_alloc(dev, imr->access_flags,
				 MLX5_MKC_ACCESS_MODE_MTT,
				 mlx5_imr_mtt_entries);
	if (IS_ERR(mr)) {
		ib_umem_odp_release(odp);
		return mr;
	}

	mr->access_flags = imr->access_flags;
	mr->ibmr.pd = imr->ibmr.pd;
	mr->ibmr.device = &mr_to_mdev(imr)->ib_dev;
	mr->umem = &odp->umem;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->ibmr.iova = idx * mlx5_imr_mtt_size;
	mr->parent = imr;
	odp->private = mr;

	/*
	 * First refcount is owned by the xarray and second refcount
	 * is returned to the caller.
	 */
	refcount_set(&mr->mmkey.usecount, 2);

	err = mlx5r_umr_update_xlt(mr, 0,
				   mlx5_imr_mtt_entries,
				   PAGE_SHIFT,
				   MLX5_IB_UPD_XLT_ZAP |
				   MLX5_IB_UPD_XLT_ENABLE);
	if (err) {
		ret = ERR_PTR(err);
		goto out_mr;
	}

	xa_lock(&imr->implicit_children);
	ret = __xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
			   GFP_KERNEL);
	if (unlikely(ret)) {
		if (xa_is_err(ret)) {
			ret = ERR_PTR(xa_err(ret));
			goto out_lock;
		}
		/*
		 * Another thread beat us to creating the child mr, use
		 * theirs.
		 */
		refcount_inc(&ret->mmkey.usecount);
		goto out_lock;
	}

	if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
		/* NOTE(review): xa_store(..., GFP_KERNEL) is called while
		 * holding the implicit_children xa_lock (a spinlock) — verify
		 * this allocation context is intended. */
		ret = xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
			       &mr->mmkey, GFP_KERNEL);
		if (xa_is_err(ret)) {
			ret = ERR_PTR(xa_err(ret));
			__xa_erase(&imr->implicit_children, idx);
			goto out_lock;
		}
		mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD;
	}
	xa_unlock(&imr->implicit_children);
	mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
	return mr;

out_lock:
	xa_unlock(&imr->implicit_children);
out_mr:
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ret;
}

/*
 * When using memory scheme ODP, implicit MRs can't use the reserved null mkey
 * and each implicit MR needs to assign a private null mkey to get the page
 * faults on.
 * The null mkey is created with the properties to enable getting the page
 * fault for every time it is accessed and having all relevant access flags.
*/
static int alloc_implicit_mr_null_mkey(struct mlx5_ib_dev *dev,
				       struct mlx5_ib_mr *imr,
				       struct mlx5_ib_pd *pd)
{
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + 64;
	void *mkc;
	u32 *in;
	int err;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 4);
	MLX5_SET(create_mkey_in, in, pg_access, 1);

	/* All access flags set, not free, not UMR-updatable */
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, a, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, free, 0);
	MLX5_SET(mkc, mkc, umr_en, 0);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);

	MLX5_SET(mkc, mkc, translations_octword_size, 4);
	/* log_page_size 61 + length64 makes every access miss and fault */
	MLX5_SET(mkc, mkc, log_page_size, 61);
	MLX5_SET(mkc, mkc, length64, 1);
	MLX5_SET(mkc, mkc, pd, pd->pdn);
	MLX5_SET64(mkc, mkc, start_addr, 0);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	err = mlx5_core_create_mkey(dev->mdev, &imr->null_mmkey.key, in, inlen);
	if (err)
		goto free_in;

	imr->null_mmkey.type = MLX5_MKEY_NULL;

free_in:
	kfree(in);
	return err;
}

/*
 * Allocate an implicit (whole-address-space) ODP MR: an implicit umem, a
 * cached KSM mkey, optionally a private null mkey (memory scheme), then
 * enable the zapped KSM table and publish the mkey in dev->odp_mkeys.
 * Returns the MR or an ERR_PTR.
 */
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
					     int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
	struct ib_umem_odp *umem_odp;
	struct mlx5_ib_mr *imr;
	int err;

	if (!mlx5r_umr_can_load_pas(dev, mlx5_imr_mtt_entries * PAGE_SIZE))
		return ERR_PTR(-EOPNOTSUPP);

	umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
	if (IS_ERR(umem_odp))
		return ERR_CAST(umem_odp);

	imr = mlx5_mr_cache_alloc(dev, access_flags, MLX5_MKC_ACCESS_MODE_KSM,
				  mlx5_imr_ksm_entries);
	if (IS_ERR(imr)) {
		ib_umem_odp_release(umem_odp);
		return imr;
	}

	imr->access_flags = access_flags;
	imr->ibmr.pd = &pd->ibpd;
	imr->ibmr.iova = 0;
	imr->umem = &umem_odp->umem;
	imr->ibmr.lkey = imr->mmkey.key;
	imr->ibmr.rkey = imr->mmkey.key;
	imr->ibmr.device = &dev->ib_dev;
	imr->is_odp_implicit = true;
	xa_init(&imr->implicit_children);

	if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
		err = alloc_implicit_mr_null_mkey(dev, imr, pd);
		if (err)
			goto out_mr;

		err = mlx5r_store_odp_mkey(dev, &imr->null_mmkey);
		if (err)
			goto out_mr;
	}

	err = mlx5r_umr_update_xlt(imr, 0,
				   mlx5_imr_ksm_entries,
				   mlx5_imr_ksm_page_shift,
				   MLX5_IB_UPD_XLT_INDIRECT |
				   MLX5_IB_UPD_XLT_ZAP |
				   MLX5_IB_UPD_XLT_ENABLE);
	if (err)
		goto out_mr;

	err = mlx5r_store_odp_mkey(dev, &imr->mmkey);
	if (err)
		goto out_mr;

	mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr);
	return imr;
out_mr:
	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
	mlx5_ib_dereg_mr(&imr->ibmr, NULL);
	return ERR_PTR(err);
}

/*
 * Tear down an implicit ODP MR's private state: all children and, if one was
 * created, the private null mkey.
 */
void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_mr *mtt;
	unsigned long idx;

	/*
	 * If this is an implicit MR it is already invalidated so we can just
	 * delete the children mkeys.
*/
	xa_for_each(&mr->implicit_children, idx, mtt) {
		xa_erase(&mr->implicit_children, idx);
		mlx5_ib_dereg_mr(&mtt->ibmr, NULL);
	}

	if (mr->null_mmkey.key) {
		xa_erase(&mr_to_mdev(mr)->odp_mkeys,
			 mlx5_base_mkey(mr->null_mmkey.key));

		mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev,
				       mr->null_mmkey.key);
	}
}

#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
#define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
#define MLX5_PF_FLAGS_ENABLE BIT(3)
/*
 * Resolve a page fault on a regular (non-implicit) ODP MR: fault in the
 * pages via HMM, then push the new MTTs to the device with UMR. Returns the
 * number of PAGE_SIZE pages mapped (np scaled by the umem page shift), or a
 * negative errno (-EAGAIN from the UMR path is silent).
 */
static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
			     u64 user_va, size_t bcnt, u32 *bytes_mapped,
			     u32 flags)
{
	int page_shift, ret, np;
	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
	u64 access_mask = 0;
	u64 start_idx;
	/* SNAPSHOT means: map what is present, do not fault pages in */
	bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
	u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;

	if (flags & MLX5_PF_FLAGS_ENABLE)
		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;

	if (flags & MLX5_PF_FLAGS_DOWNGRADE)
		xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE;

	page_shift = odp->page_shift;
	start_idx = (user_va - ib_umem_start(odp)) >> page_shift;

	if (odp->umem.writable && !downgrade)
		access_mask |= HMM_PFN_WRITE;

	/* On success this returns with umem_mutex held */
	np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
	if (np < 0)
		return np;

	/*
	 * No need to check whether the MTTs really belong to this MR, since
	 * ib_umem_odp_map_dma_and_lock already checks this.
	 */
	ret = mlx5r_umr_update_xlt(mr, start_idx, np, page_shift, xlt_flags);
	mutex_unlock(&odp->umem_mutex);

	if (ret < 0) {
		if (ret != -EAGAIN)
			mlx5_ib_err(mr_to_mdev(mr),
				    "Failed to update mkey page tables\n");
		goto out;
	}

	if (bytes_mapped) {
		/* Exclude the part of the first page before user_va */
		u32 new_mappings = (np << page_shift) -
			(user_va - round_down(user_va, 1 << page_shift));

		*bytes_mapped += min_t(u32, new_mappings, bcnt);
	}

	return np << (page_shift - PAGE_SHIFT);

out:
	return ret;
}

/*
 * Resolve a page fault on an implicit MR by faulting each intersecting
 * child MR (creating children on demand), then syncing the parent's KSM if
 * any child was created. Returns total pages mapped or a negative errno.
 */
static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
				 struct ib_umem_odp *odp_imr, u64 user_va,
				 size_t bcnt, u32 *bytes_mapped, u32 flags)
{
	unsigned long end_idx = (user_va + bcnt - 1) >> mlx5_imr_mtt_shift;
	unsigned long upd_start_idx = end_idx + 1;
	unsigned long upd_len = 0;
	unsigned long npages = 0;
	int err;
	int ret;

	/* Reject ranges beyond the space the KSM table can address */
	if (unlikely(user_va >= mlx5_imr_ksm_entries * mlx5_imr_mtt_size ||
		     mlx5_imr_ksm_entries * mlx5_imr_mtt_size - user_va < bcnt))
		return -EFAULT;

	/* Fault each child mr that intersects with our interval.
*/
	while (bcnt) {
		unsigned long idx = user_va >> mlx5_imr_mtt_shift;
		struct ib_umem_odp *umem_odp;
		struct mlx5_ib_mr *mtt;
		u64 len;

		xa_lock(&imr->implicit_children);
		mtt = xa_load(&imr->implicit_children, idx);
		if (unlikely(!mtt)) {
			/* No child yet; create one (sleeps, so drop lock) */
			xa_unlock(&imr->implicit_children);
			mtt = implicit_get_child_mr(imr, idx);
			if (IS_ERR(mtt)) {
				ret = PTR_ERR(mtt);
				goto out;
			}
			/* Remember the KSM span that must be re-synced */
			upd_start_idx = min(upd_start_idx, idx);
			upd_len = idx - upd_start_idx + 1;
		} else {
			refcount_inc(&mtt->mmkey.usecount);
			xa_unlock(&imr->implicit_children);
		}

		umem_odp = to_ib_umem_odp(mtt->umem);
		/* Clip the request to this child's extent */
		len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
		      user_va;

		ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
					bytes_mapped, flags);

		mlx5r_deref_odp_mkey(&mtt->mmkey);

		if (ret < 0)
			goto out;
		user_va += len;
		bcnt -= len;
		npages += ret;
	}

	ret = npages;

	/*
	 * Any time the implicit_children are changed we must perform an
	 * update of the xlt before exiting to ensure the HW and the
	 * implicit_children remains synchronized.
	 */
out:
	if (likely(!upd_len))
		return ret;

	/*
	 * Notice this is not strictly ordered right, the KSM is updated after
	 * the implicit_children is updated, so a parallel page fault could
	 * see a MR that is not yet visible in the KSM. This is similar to a
	 * parallel page fault seeing a MR that is being concurrently removed
	 * from the KSM. Both of these improbable situations are resolved
	 * safely by resuming the HW and then taking another page fault. The
	 * next pagefault handler will see the new information.
	 */
	mutex_lock(&odp_imr->umem_mutex);
	err = mlx5r_umr_update_xlt(imr, upd_start_idx, upd_len, 0,
				   MLX5_IB_UPD_XLT_INDIRECT |
				   MLX5_IB_UPD_XLT_ATOMIC);
	mutex_unlock(&odp_imr->umem_mutex);
	if (err) {
		mlx5_ib_err(mr_to_mdev(imr), "Failed to update PAS\n");
		return err;
	}
	return ret;
}

/*
 * Resolve a fault on a dma-buf MR: map the dma-buf pages under its
 * reservation lock, pick the best page size, and update the device PAS
 * (KSM for data-direct MRs, MTT otherwise). Returns the number of umem
 * pages or a negative errno; restores the old page_shift on failure.
 */
static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
			       u32 *bytes_mapped, u32 flags)
{
	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
	int access_mode = mr->data_direct ? MLX5_MKC_ACCESS_MODE_KSM :
					    MLX5_MKC_ACCESS_MODE_MTT;
	unsigned int old_page_shift = mr->page_shift;
	unsigned int page_shift;
	unsigned long page_size;
	u32 xlt_flags = 0;
	int err;

	if (flags & MLX5_PF_FLAGS_ENABLE)
		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;

	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
	if (err) {
		dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
		return err;
	}

	page_size = mlx5_umem_dmabuf_find_best_pgsz(umem_dmabuf, access_mode);
	if (!page_size) {
		ib_umem_dmabuf_unmap_pages(umem_dmabuf);
		err = -EINVAL;
	} else {
		page_shift = order_base_2(page_size);
		if (page_shift != mr->page_shift && mr->dmabuf_faulted) {
			/* Page size changed since the last fault: the mkey
			 * must be updated to the new granularity */
			err = mlx5r_umr_dmabuf_update_pgsz(mr, xlt_flags,
							   page_shift);
		} else {
			mr->page_shift = page_shift;
			if (mr->data_direct)
				err = mlx5r_umr_update_data_direct_ksm_pas(
					mr, xlt_flags);
			else
				err = mlx5r_umr_update_mr_pas(mr,
							      xlt_flags);
		}
	}
	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);

	if (err) {
		mr->page_shift = old_page_shift;
		return err;
	}

	mr->dmabuf_faulted = 1;

	if (bytes_mapped)
		*bytes_mapped += bcnt;

	return ib_umem_num_pages(mr->umem);
}

/*
 * Returns:
 *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
 *           not
accessible, or the MR is no longer valid.
 *  -EAGAIN/-ENOMEM: The operation should be retried
 *
 *  -EINVAL/others: General internal malfunction
 *  >0: Number of pages mapped
 */
static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
			u32 *bytes_mapped, u32 flags, bool permissive_fault)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);

	if (unlikely(io_virt < mr->ibmr.iova) && !permissive_fault)
		return -EFAULT;

	if (mr->umem->is_dmabuf)
		return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags);

	if (!odp->is_implicit_odp) {
		u64 offset = io_virt < mr->ibmr.iova ? 0 : io_virt - mr->ibmr.iova;
		u64 user_va;

		if (check_add_overflow(offset, (u64)odp->umem.address,
				       &user_va))
			return -EFAULT;

		if (permissive_fault) {
			/* Clamp the request into the umem instead of failing */
			if (user_va < ib_umem_start(odp))
				user_va = ib_umem_start(odp);
			if ((user_va + bcnt) > ib_umem_end(odp))
				bcnt = ib_umem_end(odp) - user_va;
		} else if (unlikely(user_va >= ib_umem_end(odp) ||
				    ib_umem_end(odp) - user_va < bcnt))
			return -EFAULT;
		return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
					 flags);
	}
	return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
				     flags);
}

/*
 * Pre-fill an ODP MR's device translation with a snapshot of the currently
 * present pages (no faulting) and enable the mkey. Returns 0 or a negative
 * errno.
 */
int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr)
{
	int ret;

	ret = pagefault_real_mr(mr, to_ib_umem_odp(mr->umem), mr->umem->address,
				mr->umem->length, NULL,
				MLX5_PF_FLAGS_SNAPSHOT | MLX5_PF_FLAGS_ENABLE);
	return ret >= 0 ? 0 : ret;
}

/* Map a dma-buf MR's pages and enable its mkey. Returns 0 or negative. */
int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr)
{
	int ret;

	ret = pagefault_dmabuf_mr(mr, mr->umem->length, NULL,
				  MLX5_PF_FLAGS_ENABLE);

	return ret >= 0 ? 0 : ret;
}

/* Saved state for iteratively walking indirect mkeys (MW/DEVX) without
 * recursion; forms a singly-linked stack. */
struct pf_frame {
	struct pf_frame *next;
	u32 key;
	u64 io_virt;
	size_t bcnt;
	int depth;
};

/*
 * Check that @mmkey matches the wire @key. MWs and indirect DEVX mkeys are
 * compared on the base mkey only (ignoring the variant bits); everything
 * else must match exactly.
 */
static bool mkey_is_eq(struct mlx5_ib_mkey *mmkey, u32 key)
{
	if (!mmkey)
		return false;
	if (mmkey->type == MLX5_MKEY_MW ||
	    mmkey->type == MLX5_MKEY_INDIRECT_DEVX)
		return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key);
	return mmkey->key == key;
}

/*
 * Look up an ODP mkey by wire key and take a usecount reference. Returns
 * ERR_PTR(-ENOENT) when absent (non-ODP key) or ERR_PTR(-EFAULT) on a key
 * mismatch.
 */
static struct mlx5_ib_mkey *find_odp_mkey(struct mlx5_ib_dev *dev, u32 key)
{
	struct mlx5_ib_mkey *mmkey;

	xa_lock(&dev->odp_mkeys);
	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
	if (!mmkey) {
		mmkey = ERR_PTR(-ENOENT);
		goto out;
	}
	if (!mkey_is_eq(mmkey, key)) {
		mmkey = ERR_PTR(-EFAULT);
		goto out;
	}
	refcount_inc(&mmkey->usecount);
out:
	xa_unlock(&dev->odp_mkeys);

	return mmkey;
}

/*
 * Handle a single data segment in a page-fault WQE or RDMA region.
 *
 * Returns zero on success. The caller may continue to the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling.
*/
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
					 struct ib_pd *pd, u32 key,
					 u64 io_virt, size_t bcnt,
					 u32 *bytes_committed,
					 u32 *bytes_mapped)
{
	int ret, i, outlen, cur_outlen = 0, depth = 0, pages_in_range;
	struct pf_frame *head = NULL, *frame;
	struct mlx5_ib_mkey *mmkey;
	struct mlx5_ib_mr *mr;
	struct mlx5_klm *pklm;
	u32 *out = NULL;
	size_t offset;

	/* Skip the part the HW already committed before the fault */
	io_virt += *bytes_committed;
	bcnt -= *bytes_committed;
next_mr:
	mmkey = find_odp_mkey(dev, key);
	if (IS_ERR(mmkey)) {
		ret = PTR_ERR(mmkey);
		if (ret == -ENOENT) {
			mlx5_ib_dbg(
				dev,
				"skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
				key);
			if (bytes_mapped)
				*bytes_mapped += bcnt;
			/*
			 * The user could specify a SGL with multiple lkeys and
			 * only some of them are ODP. Treat the non-ODP ones as
			 * fully faulted.
			 */
			ret = 0;
		}
		goto end;
	}

	switch (mmkey->type) {
	case MLX5_MKEY_MR:
		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

		pages_in_range = (ALIGN(io_virt + bcnt, PAGE_SIZE) -
				  (io_virt & PAGE_MASK)) >>
				 PAGE_SHIFT;
		ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0, false);
		if (ret < 0)
			goto end;

		mlx5_update_odp_stats_with_handled(mr, faults, ret);

		/* Partial mapping cannot resolve the fault */
		if (ret < pages_in_range) {
			ret = -EFAULT;
			goto end;
		}

		ret = 0;
		break;

	case MLX5_MKEY_MW:
	case MLX5_MKEY_INDIRECT_DEVX:
		if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
			mlx5_ib_dbg(dev, "indirection level exceeded\n");
			ret = -EFAULT;
			goto end;
		}

		outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
			sizeof(*pklm) * (mmkey->ndescs - 2);

		/* Grow (and reuse) the query buffer as needed */
		if (outlen > cur_outlen) {
			kfree(out);
			out = kzalloc(outlen, GFP_KERNEL);
			if (!out) {
				ret = -ENOMEM;
				goto end;
			}
			cur_outlen = outlen;
		}

		pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
						       bsf0_klm0_pas_mtt0_1);

		ret = mlx5_core_query_mkey(dev->mdev, mmkey->key, out, outlen);
		if (ret)
			goto end;

		offset = io_virt - MLX5_GET64(query_mkey_out, out,
					      memory_key_mkey_entry.start_addr);

		/* Push a frame for every KLM entry the range touches */
		for (i = 0; bcnt && i < mmkey->ndescs; i++, pklm++) {
			if (offset >= be32_to_cpu(pklm->bcount)) {
				offset -= be32_to_cpu(pklm->bcount);
				continue;
			}

			frame = kzalloc_obj(*frame);
			if (!frame) {
				ret = -ENOMEM;
				goto end;
			}

			frame->key = be32_to_cpu(pklm->key);
			frame->io_virt = be64_to_cpu(pklm->va) + offset;
			frame->bcnt = min_t(size_t, bcnt,
					    be32_to_cpu(pklm->bcount) - offset);
			frame->depth = depth + 1;
			frame->next = head;
			head = frame;

			bcnt -= frame->bcnt;
			offset = 0;
		}
		break;

	default:
		mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
		ret = -EFAULT;
		goto end;
	}

	/* Pop the next pending frame and iterate */
	if (head) {
		frame = head;
		head = frame->next;

		key = frame->key;
		io_virt = frame->io_virt;
		bcnt = frame->bcnt;
		depth = frame->depth;
		kfree(frame);

		mlx5r_deref_odp_mkey(mmkey);
		goto next_mr;
	}

end:
	if (!IS_ERR(mmkey))
		mlx5r_deref_odp_mkey(mmkey);
	while (head) {
		frame = head;
		head = frame->next;
		kfree(frame);
	}
	kfree(out);

	*bytes_committed = 0;
	return ret;
}

/*
 * Parse a series of data segments for page fault handling.
 *
 * @dev:  Pointer to mlx5 IB device
 * @pfault: contains page fault information.
 * @wqe: points at the first data segment in the WQE.
 * @wqe_end: points after the end of the WQE.
 * @bytes_mapped: receives the number of bytes that the function was able to
 *                map.
This allows the caller to decide intelligently whether
 *                enough memory was mapped to resolve the page fault
 *                successfully (e.g. enough for the next MTU, or the entire
 *                WQE).
 * @total_wqe_bytes: receives the total data size of this WQE in bytes (minus
 *                   the committed bytes).
 * @receive_queue: receive WQE end of sg list
 *
 * Returns zero for success or a negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
				   struct mlx5_pagefault *pfault,
				   void *wqe,
				   void *wqe_end, u32 *bytes_mapped,
				   u32 *total_wqe_bytes, bool receive_queue)
{
	int ret = 0;
	u64 io_virt;
	__be32 key;
	u32 byte_count;
	size_t bcnt;
	int inline_segment;

	if (bytes_mapped)
		*bytes_mapped = 0;
	if (total_wqe_bytes)
		*total_wqe_bytes = 0;

	/* Walk each data segment until the end of the WQE. */
	while (wqe < wqe_end) {
		struct mlx5_wqe_data_seg *dseg = wqe;

		io_virt = be64_to_cpu(dseg->addr);
		key = dseg->lkey;
		byte_count = be32_to_cpu(dseg->byte_count);
		inline_segment = !!(byte_count & MLX5_INLINE_SEG);
		bcnt = byte_count & ~MLX5_INLINE_SEG;

		if (inline_segment) {
			/* Inline segments carry their payload in the WQE
			 * itself and are padded to 16 bytes. */
			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
				     16);
		} else {
			wqe += sizeof(*dseg);
		}

		/* receive WQE end of sg list. */
		if (receive_queue && bcnt == 0 &&
		    key == dev->mkeys.terminate_scatter_list_mkey &&
		    io_virt == 0)
			break;

		if (!inline_segment && total_wqe_bytes) {
			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
					pfault->bytes_committed);
		}

		/* A zero length data segment designates a length of 2GB. */
		if (bcnt == 0)
			bcnt = 1U << 31;

		/* Segments already committed by HW need no faulting; just
		 * consume the committed-byte budget. */
		if (inline_segment || bcnt <= pfault->bytes_committed) {
			pfault->bytes_committed -=
				min_t(size_t, bcnt,
				      pfault->bytes_committed);
			continue;
		}

		ret = pagefault_single_data_segment(dev, NULL, be32_to_cpu(key),
						    io_virt, bcnt,
						    &pfault->bytes_committed,
						    bytes_mapped);
		if (ret < 0)
			break;
	}

	return ret;
}

/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and set wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
	u16 wqe_index = pfault->wqe.wqe_index;
	struct mlx5_base_av *av;
	unsigned ds, opcode;
	u32 qpn = qp->trans_qp.base.mqp.qpn;

	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
			    ds, wqe_length);
		return -EFAULT;
	}

	if (ds == 0) {
		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
			    wqe_index, qpn);
		return -EFAULT;
	}

	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
	*wqe += sizeof(*ctrl);

	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
		 MLX5_WQE_CTRL_OPCODE_MASK;

	/* Skip transport-specific segments preceding the sg list. */
	if (qp->type == IB_QPT_XRC_INI)
		*wqe += sizeof(struct mlx5_wqe_xrc_seg);

	if (qp->type == IB_QPT_UD || qp->type == MLX5_IB_QPT_DCI) {
		av = *wqe;
		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
			*wqe += sizeof(struct mlx5_av);
		else
			*wqe += sizeof(struct mlx5_base_av);
	}

	switch (opcode) {
	case MLX5_OPCODE_RDMA_WRITE:
	case MLX5_OPCODE_RDMA_WRITE_IMM:
	case MLX5_OPCODE_RDMA_READ:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		break;
	case MLX5_OPCODE_ATOMIC_CS:
	case MLX5_OPCODE_ATOMIC_FA:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		*wqe += sizeof(struct mlx5_wqe_atomic_seg);
		break;
	}

	return 0;
}

/*
 * Parse responder WQE and set wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev,
						   struct mlx5_ib_srq *srq,
						   void **wqe, void **wqe_end,
						   int wqe_length)
{
	int wqe_size = 1 << srq->msrq.wqe_shift;

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	*wqe_end = *wqe + wqe_size;
	/* Skip the SRQ next-segment header to reach the sg list. */
	*wqe += sizeof(struct mlx5_wqe_srq_next_seg);

	return 0;
}

/* Parse an ordinary receive-queue WQE and set wqe_end past its end. */
static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
						  struct mlx5_ib_qp *qp,
						  void *wqe, void **wqe_end,
						  int wqe_length)
{
	struct mlx5_ib_wq *wq = &qp->rq;
	int wqe_size = 1 << wq->wqe_shift;

	if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) {
		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
		return -EFAULT;
	}

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	*wqe_end = wqe + wqe_size;

	return 0;
}

/*
 * Map a page-fault type to the resource (QP or SRQ) it refers to, taking a
 * reference on it. Returns NULL when the resource cannot be found.
 */
static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
						       u32 wq_num, int pf_type)
{
	struct mlx5_core_rsc_common *common = NULL;
	struct mlx5_core_srq *srq;

	switch (pf_type) {
	case MLX5_WQE_PF_TYPE_RMP:
		srq = mlx5_cmd_get_srq(dev, wq_num);
		if (srq)
			common = &srq->common;
		break;
	case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
	case MLX5_WQE_PF_TYPE_RESP:
	case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
		common = mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP);
		break;
	default:
		break;
	}

	return common;
}

/* Convert a held core resource back to the driver's QP wrapper. */
static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
{
	struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res;

	return to_mibqp(mqp);
}

/* Convert a held core resource back to the driver's SRQ wrapper. */
static inline struct mlx5_ib_srq *res_to_srq(struct
					     mlx5_core_rsc_common *res)
{
	struct mlx5_core_srq *msrq =
		container_of(res, struct mlx5_core_srq, common);

	return to_mibsrq(msrq);
}

/*
 * Handle a WQE-based page fault: read the faulting WQE from user memory,
 * parse its data segments and fault in the referenced pages, then resume
 * the QP (with error when resolution failed).
 */
static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
					  struct mlx5_pagefault *pfault)
{
	bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
	u16 wqe_index = pfault->wqe.wqe_index;
	void *wqe, *wqe_start = NULL, *wqe_end = NULL;
	u32 bytes_mapped, total_wqe_bytes;
	struct mlx5_core_rsc_common *res;
	int resume_with_error = 1;
	struct mlx5_ib_qp *qp;
	size_t bytes_copied;
	int ret = 0;

	res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
	if (!res) {
		mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
		return;
	}

	if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ &&
	    res->res != MLX5_RES_XSRQ) {
		mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n",
			    pfault->type);
		goto resolve_page_fault;
	}

	/* Scratch page used to copy the WQE out of user memory. */
	wqe_start = (void *)__get_free_page(GFP_KERNEL);
	if (!wqe_start) {
		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
		goto resolve_page_fault;
	}

	wqe = wqe_start;
	qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
	if (qp && sq) {
		ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
					  &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_initiator_pfault_handler(
			dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
	} else if (qp && !sq) {
		ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
					  &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_responder_pfault_handler_rq(
			dev, qp, wqe, &wqe_end, bytes_copied);
	} else if (!qp) {
		struct mlx5_ib_srq *srq = res_to_srq(res);

		ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
					   &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_responder_pfault_handler_srq(
			dev, srq, &wqe, &wqe_end, bytes_copied);
	}

	if (ret < 0 || wqe >= wqe_end)
		goto resolve_page_fault;

	ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped,
				      &total_wqe_bytes, !sq);
	if (ret == -EAGAIN)
		goto out;

	if (ret < 0 || total_wqe_bytes > bytes_mapped)
		goto resolve_page_fault;

out:
	ret = 0;
	resume_with_error = 0;

read_user:
	if (ret)
		mlx5_ib_err(
			dev,
			"Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %llx\n",
			ret, wqe_index, pfault->token);

resolve_page_fault:
	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
		    pfault->wqe.wq_num, resume_with_error,
		    pfault->type);
	mlx5_core_res_put(res);
	free_page((unsigned long)wqe_start);
}

/*
 * Handle an RDMA-responder page fault: fault in the pages for the current
 * packet first, resume the QP, then optionally prefetch further pages.
 */
static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
					   struct mlx5_pagefault *pfault)
{
	u64 address;
	u32 length;
	u32 prefetch_len = pfault->bytes_committed;
	int prefetch_activated = 0;
	u32 rkey = pfault->rdma.r_key;
	int ret;

	/* The RDMA responder handler handles the page fault in two parts.
	 * First it brings the necessary pages for the current packet
	 * (and uses the pfault context), and then (after resuming the QP)
	 * prefetches more pages. The second operation cannot use the pfault
	 * context and therefore uses the dummy_pfault context allocated on
	 * the stack */
	pfault->rdma.rdma_va += pfault->bytes_committed;
	pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
					pfault->rdma.rdma_op_len);
	pfault->bytes_committed = 0;

	address = pfault->rdma.rdma_va;
	length = pfault->rdma.rdma_op_len;

	/* For some operations, the hardware cannot tell the exact message
	 * length, and in those cases it reports zero. Use prefetch
	 * logic. */
	if (length == 0) {
		prefetch_activated = 1;
		length = pfault->rdma.packet_size;
		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
	}

	ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
					    &pfault->bytes_committed, NULL);
	if (ret == -EAGAIN) {
		/* We're racing with an invalidation, don't prefetch */
		prefetch_activated = 0;
	} else if (ret < 0) {
		mlx5_ib_page_fault_resume(dev, pfault, 1);
		if (ret != -ENOENT)
			mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%llx, type: 0x%x\n",
				    ret, pfault->token, pfault->type);
		return;
	}

	mlx5_ib_page_fault_resume(dev, pfault, 0);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%llx, type: 0x%x, prefetch_activated: %d\n",
		    pfault->token, pfault->type,
		    prefetch_activated);

	/* At this point, there might be a new pagefault already arriving in
	 * the eq, switch to the dummy pagefault for the rest of the
	 * processing. We're still OK with the objects being alive as the
	 * work-queue is being fenced. */

	if (prefetch_activated) {
		u32 bytes_committed = 0;

		ret = pagefault_single_data_segment(dev, NULL, rkey, address,
						    prefetch_len,
						    &bytes_committed, NULL);
		if (ret < 0 && ret != -EAGAIN) {
			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%llx, address: 0x%.16llx, length = 0x%.16x\n",
				    ret, pfault->token, address, prefetch_len);
		}
	}
}

#define MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST BIT(7)
/*
 * Handle a memory-scheme page fault: fault in the demanded range plus the
 * HW-suggested prefetch window; fall back to the demanded range alone if
 * the wider prefetch fails.
 */
static void mlx5_ib_mr_memory_pfault_handler(struct mlx5_ib_dev *dev,
					     struct mlx5_pagefault *pfault)
{
	u64 prefetch_va =
		pfault->memory.va - pfault->memory.prefetch_before_byte_count;
	size_t prefetch_size = pfault->memory.prefetch_before_byte_count +
			       pfault->memory.fault_byte_count +
			       pfault->memory.prefetch_after_byte_count;
	struct mlx5_ib_mkey *mmkey;
	struct mlx5_ib_mr *mr, *child_mr;
	int ret = 0;

	mmkey = find_odp_mkey(dev, pfault->memory.mkey);
	if (IS_ERR(mmkey))
		goto err;

	switch (mmkey->type) {
	case MLX5_MKEY_IMPLICIT_CHILD:
		/* Fault is resolved on the implicit parent MR. */
		child_mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
		mr = child_mr->parent;
		break;
	case MLX5_MKEY_NULL:
		mr = container_of(mmkey, struct mlx5_ib_mr, null_mmkey);
		break;
	default:
		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
		break;
	}

	/* If prefetch fails, handle only demanded page fault */
	ret = pagefault_mr(mr, prefetch_va, prefetch_size, NULL, 0, true);
	if (ret < 0) {
		ret = pagefault_mr(mr, pfault->memory.va,
				   pfault->memory.fault_byte_count, NULL, 0,
				   true);
		if (ret < 0)
			goto err;
	}

	mlx5_update_odp_stats_with_handled(mr, faults, ret);
	mlx5r_deref_odp_mkey(mmkey);

	if (pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST)
		mlx5_ib_page_fault_resume(dev, pfault, 0);

	mlx5_ib_dbg(
		dev,
		"PAGE FAULT completed %s. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x\n",
		pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST ?
			"" :
			"without resume cmd",
		pfault->token, pfault->memory.mkey, pfault->memory.va,
		pfault->memory.fault_byte_count);

	return;

err:
	if (!IS_ERR(mmkey))
		mlx5r_deref_odp_mkey(mmkey);
	mlx5_ib_page_fault_resume(dev, pfault, 1);
	mlx5_ib_dbg(
		dev,
		"PAGE FAULT error. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x, err: %d\n",
		pfault->token, pfault->memory.mkey, pfault->memory.va,
		pfault->memory.fault_byte_count, ret);
}

/* Dispatch a page fault to the handler matching its event subtype. */
static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
{
	u8 event_subtype = pfault->event_subtype;

	switch (event_subtype) {
	case MLX5_PFAULT_SUBTYPE_WQE:
		mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
		break;
	case MLX5_PFAULT_SUBTYPE_RDMA:
		mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
		break;
	case MLX5_PFAULT_SUBTYPE_MEMORY:
		mlx5_ib_mr_memory_pfault_handler(dev, pfault);
		break;
	default:
		mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
			    event_subtype);
		mlx5_ib_page_fault_resume(dev, pfault, 1);
	}
}

/* Work item: handle one queued page fault, then return it to the mempool. */
static void mlx5_ib_eqe_pf_action(struct work_struct *work)
{
	struct mlx5_pagefault *pfault = container_of(work,
						     struct mlx5_pagefault,
						     work);
	struct mlx5_ib_pf_eq *eq = pfault->eq;

	mlx5_ib_pfault(eq->dev, pfault);
	mempool_free(pfault, eq->pool);
}

#define MEMORY_SCHEME_PAGE_FAULT_GRANULARITY 4096
static void
mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
{
	struct mlx5_eqe_page_fault *pf_eqe;
	struct mlx5_pagefault *pfault;
	struct mlx5_eqe *eqe;
	int cc = 0;

	/* Drain the EQ, decoding each EQE into an mlx5_pagefault and
	 * queueing it for process-context handling. */
	while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
		pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
		if (!pfault) {
			/* Pool exhausted; retry from the refill work item. */
			schedule_work(&eq->work);
			break;
		}

		pf_eqe = &eqe->data.page_fault;
		pfault->event_subtype = eqe->sub_type;

		switch (eqe->sub_type) {
		case MLX5_PFAULT_SUBTYPE_RDMA:
			/* RDMA based event */
			pfault->bytes_committed =
				be32_to_cpu(pf_eqe->rdma.bytes_committed);
			pfault->type =
				be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
			pfault->token =
				be32_to_cpu(pf_eqe->rdma.pftype_token) &
				MLX5_24BIT_MASK;
			pfault->rdma.r_key =
				be32_to_cpu(pf_eqe->rdma.r_key);
			pfault->rdma.packet_size =
				be16_to_cpu(pf_eqe->rdma.packet_length);
			pfault->rdma.rdma_op_len =
				be32_to_cpu(pf_eqe->rdma.rdma_op_len);
			pfault->rdma.rdma_va =
				be64_to_cpu(pf_eqe->rdma.rdma_va);
			mlx5_ib_dbg(
				eq->dev,
				"PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, r_key: 0x%08x\n",
				eqe->sub_type, pfault->bytes_committed,
				pfault->type, pfault->token,
				pfault->rdma.r_key);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
				    pfault->rdma.rdma_op_len,
				    pfault->rdma.rdma_va);
			break;

		case MLX5_PFAULT_SUBTYPE_WQE:
			/* WQE based event */
			pfault->bytes_committed =
				be32_to_cpu(pf_eqe->wqe.bytes_committed);
			pfault->type =
				(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
			pfault->token =
				be32_to_cpu(pf_eqe->wqe.token);
			pfault->wqe.wq_num =
				be32_to_cpu(pf_eqe->wqe.pftype_wq) &
				MLX5_24BIT_MASK;
			pfault->wqe.wqe_index =
				be16_to_cpu(pf_eqe->wqe.wqe_index);
			pfault->wqe.packet_size =
				be16_to_cpu(pf_eqe->wqe.packet_length);
			mlx5_ib_dbg(
				eq->dev,
				"PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, wq_num: 0x%06x, wqe_index: 0x%04x\n",
				eqe->sub_type, pfault->bytes_committed,
				pfault->type, pfault->token, pfault->wqe.wq_num,
				pfault->wqe.wqe_index);
			break;

		case MLX5_PFAULT_SUBTYPE_MEMORY:
			/* Memory based event */
			pfault->bytes_committed = 0;
			pfault->token =
				be32_to_cpu(pf_eqe->memory.token31_0) |
				((u64)be16_to_cpu(pf_eqe->memory.token47_32)
				 << 32);
			pfault->memory.va = be64_to_cpu(pf_eqe->memory.va);
			pfault->memory.mkey = be32_to_cpu(pf_eqe->memory.mkey);
			pfault->memory.fault_byte_count = (be32_to_cpu(
				pf_eqe->memory.demand_fault_pages) >> 12) *
				MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
			pfault->memory.prefetch_before_byte_count =
				be16_to_cpu(
					pf_eqe->memory.pre_demand_fault_pages) *
				MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
			pfault->memory.prefetch_after_byte_count =
				be16_to_cpu(
					pf_eqe->memory.post_demand_fault_pages) *
				MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
			pfault->memory.flags = pf_eqe->memory.flags;
			mlx5_ib_dbg(
				eq->dev,
				"PAGE_FAULT: subtype: 0x%02x, token: 0x%06llx, mkey: 0x%06x, fault_byte_count: 0x%06x, va: 0x%016llx, flags: 0x%02x\n",
				eqe->sub_type, pfault->token,
				pfault->memory.mkey,
				pfault->memory.fault_byte_count,
				pfault->memory.va, pfault->memory.flags);
			mlx5_ib_dbg(
				eq->dev,
				"PAGE_FAULT: prefetch size: before: 0x%06x, after 0x%06x\n",
				pfault->memory.prefetch_before_byte_count,
				pfault->memory.prefetch_after_byte_count);
			break;

		default:
			mlx5_ib_warn(eq->dev,
				     "Unsupported page fault event sub-type: 0x%02hhx\n",
				     eqe->sub_type);
			/* Unsupported page faults should still be
			 * resolved by the page fault handler
			 */
		}

		pfault->eq = eq;
		INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
		queue_work(eq->wq, &pfault->work);

		cc = mlx5_eq_update_cc(eq->core, ++cc);
	}

	mlx5_eq_update_ci(eq->core, cc, 1);
}

/* IRQ notifier: process the EQ inline, or defer to the work item when the
 * lock is contended. */
static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
			     void *data)
{
	struct mlx5_ib_pf_eq *eq =
		container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
	unsigned long flags;

	if (spin_trylock_irqsave(&eq->lock, flags)) {
		mlx5_ib_eq_pf_process(eq);
		spin_unlock_irqrestore(&eq->lock, flags);
	} else {
		schedule_work(&eq->work);
	}

	return IRQ_HANDLED;
}

/* mempool_refill() was proposed but unfortunately wasn't accepted
 * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
 * Cheap workaround.
 */
static void mempool_refill(mempool_t *pool)
{
	while (pool->curr_nr < pool->min_nr)
		mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
}

/* Work item: top the mempool back up, then process any pending EQEs. */
static void mlx5_ib_eq_pf_action(struct work_struct *work)
{
	struct mlx5_ib_pf_eq *eq =
		container_of(work, struct mlx5_ib_pf_eq, work);

	mempool_refill(eq->pool);

	spin_lock_irq(&eq->lock);
	mlx5_ib_eq_pf_process(eq);
	spin_unlock_irq(&eq->lock);
}

enum {
	MLX5_IB_NUM_PF_EQE	= 0x1000,
	MLX5_IB_NUM_PF_DRAIN	= 64,
};

/* Lazily create and enable the page-fault EQ for this device. */
int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
	struct mlx5_eq_param param = {};
	int err = 0;

	mutex_lock(&dev->odp_eq_mutex);
	if (eq->core)
		goto unlock;
	INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
	spin_lock_init(&eq->lock);
	eq->dev = dev;

	eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
					       sizeof(struct mlx5_pagefault));
	if (!eq->pool) {
		err = -ENOMEM;
		goto unlock;
	}

	eq->wq = alloc_workqueue("mlx5_ib_page_fault",
				 WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
				 MLX5_NUM_CMD_EQE);
	if (!eq->wq) {
		err = -ENOMEM;
		goto
err_mempool; 1830 } 1831 1832 eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int; 1833 param = (struct mlx5_eq_param) { 1834 .nent = MLX5_IB_NUM_PF_EQE, 1835 }; 1836 param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT; 1837 eq->core = mlx5_eq_create_generic(dev->mdev, ¶m); 1838 if (IS_ERR(eq->core)) { 1839 err = PTR_ERR(eq->core); 1840 goto err_wq; 1841 } 1842 err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb); 1843 if (err) { 1844 mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err); 1845 goto err_eq; 1846 } 1847 1848 mutex_unlock(&dev->odp_eq_mutex); 1849 return 0; 1850 err_eq: 1851 mlx5_eq_destroy_generic(dev->mdev, eq->core); 1852 err_wq: 1853 eq->core = NULL; 1854 destroy_workqueue(eq->wq); 1855 err_mempool: 1856 mempool_destroy(eq->pool); 1857 unlock: 1858 mutex_unlock(&dev->odp_eq_mutex); 1859 return err; 1860 } 1861 1862 static int 1863 mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) 1864 { 1865 int err; 1866 1867 if (!eq->core) 1868 return 0; 1869 mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb); 1870 err = mlx5_eq_destroy_generic(dev->mdev, eq->core); 1871 cancel_work_sync(&eq->work); 1872 destroy_workqueue(eq->wq); 1873 mempool_destroy(eq->pool); 1874 1875 return err; 1876 } 1877 1878 int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev) 1879 { 1880 struct mlx5r_cache_rb_key rb_key = { 1881 .access_mode = MLX5_MKC_ACCESS_MODE_KSM, 1882 .ndescs = mlx5_imr_ksm_entries, 1883 .ph = MLX5_IB_NO_PH, 1884 }; 1885 struct mlx5_cache_ent *ent; 1886 1887 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) 1888 return 0; 1889 1890 ent = mlx5r_cache_create_ent_locked(dev, rb_key, true); 1891 if (IS_ERR(ent)) 1892 return PTR_ERR(ent); 1893 1894 return 0; 1895 } 1896 1897 static const struct ib_device_ops mlx5_ib_dev_odp_ops = { 1898 .advise_mr = mlx5_ib_advise_mr, 1899 }; 1900 1901 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) 1902 { 1903 internal_fill_odp_caps(dev); 1904 1905 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT)) 
1906 return 0; 1907 1908 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops); 1909 1910 mutex_init(&dev->odp_eq_mutex); 1911 return 0; 1912 } 1913 1914 void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev) 1915 { 1916 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT)) 1917 return; 1918 1919 mlx5_ib_odp_destroy_eq(dev, &dev->odp_pf_eq); 1920 } 1921 1922 int mlx5_ib_odp_init(void) 1923 { 1924 u32 log_va_pages = ilog2(TASK_SIZE) - PAGE_SHIFT; 1925 u8 mlx5_imr_mtt_bits; 1926 1927 /* 48 is default ARM64 VA space and covers X86 4-level paging which is 47 */ 1928 if (log_va_pages <= 48 - PAGE_SHIFT) 1929 mlx5_imr_mtt_shift = 30; 1930 /* 56 is x86-64, 5-level paging */ 1931 else if (log_va_pages <= 56 - PAGE_SHIFT) 1932 mlx5_imr_mtt_shift = 34; 1933 else 1934 return 0; 1935 1936 mlx5_imr_mtt_size = BIT_ULL(mlx5_imr_mtt_shift); 1937 mlx5_imr_mtt_bits = mlx5_imr_mtt_shift - PAGE_SHIFT; 1938 mlx5_imr_mtt_entries = BIT_ULL(mlx5_imr_mtt_bits); 1939 mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) - 1940 mlx5_imr_mtt_bits); 1941 1942 mlx5_imr_ksm_page_shift = mlx5_imr_mtt_shift; 1943 return 0; 1944 } 1945 1946 struct prefetch_mr_work { 1947 struct work_struct work; 1948 u32 pf_flags; 1949 u32 num_sge; 1950 struct { 1951 u64 io_virt; 1952 struct mlx5_ib_mr *mr; 1953 size_t length; 1954 } frags[]; 1955 }; 1956 1957 static void destroy_prefetch_work(struct prefetch_mr_work *work) 1958 { 1959 u32 i; 1960 1961 for (i = 0; i < work->num_sge; ++i) 1962 mlx5r_deref_odp_mkey(&work->frags[i].mr->mmkey); 1963 1964 kvfree(work); 1965 } 1966 1967 static struct mlx5_ib_mr * 1968 get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, 1969 u32 lkey) 1970 { 1971 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1972 struct mlx5_ib_mr *mr = NULL; 1973 struct mlx5_ib_mkey *mmkey; 1974 1975 xa_lock(&dev->odp_mkeys); 1976 mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey)); 1977 if (!mmkey || mmkey->key != lkey) { 1978 mr = ERR_PTR(-ENOENT); 1979 goto end; 1980 } 1981 if 
(mmkey->type != MLX5_MKEY_MR) { 1982 mr = ERR_PTR(-EINVAL); 1983 goto end; 1984 } 1985 1986 mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); 1987 1988 if (mr->ibmr.pd != pd) { 1989 mr = ERR_PTR(-EPERM); 1990 goto end; 1991 } 1992 1993 /* prefetch with write-access must be supported by the MR */ 1994 if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && 1995 !mr->umem->writable) { 1996 mr = ERR_PTR(-EPERM); 1997 goto end; 1998 } 1999 2000 refcount_inc(&mmkey->usecount); 2001 end: 2002 xa_unlock(&dev->odp_mkeys); 2003 return mr; 2004 } 2005 2006 static void mlx5_ib_prefetch_mr_work(struct work_struct *w) 2007 { 2008 struct prefetch_mr_work *work = 2009 container_of(w, struct prefetch_mr_work, work); 2010 u32 bytes_mapped = 0; 2011 int ret; 2012 u32 i; 2013 2014 /* We rely on IB/core that work is executed if we have num_sge != 0 only. */ 2015 WARN_ON(!work->num_sge); 2016 for (i = 0; i < work->num_sge; ++i) { 2017 ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt, 2018 work->frags[i].length, &bytes_mapped, 2019 work->pf_flags, false); 2020 if (ret <= 0) 2021 continue; 2022 mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret); 2023 } 2024 2025 destroy_prefetch_work(work); 2026 } 2027 2028 static int init_prefetch_work(struct ib_pd *pd, 2029 enum ib_uverbs_advise_mr_advice advice, 2030 u32 pf_flags, struct prefetch_mr_work *work, 2031 struct ib_sge *sg_list, u32 num_sge) 2032 { 2033 u32 i; 2034 2035 INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work); 2036 work->pf_flags = pf_flags; 2037 2038 for (i = 0; i < num_sge; ++i) { 2039 struct mlx5_ib_mr *mr; 2040 2041 mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey); 2042 if (IS_ERR(mr)) { 2043 work->num_sge = i; 2044 return PTR_ERR(mr); 2045 } 2046 work->frags[i].io_virt = sg_list[i].addr; 2047 work->frags[i].length = sg_list[i].length; 2048 work->frags[i].mr = mr; 2049 } 2050 work->num_sge = num_sge; 2051 return 0; 2052 } 2053 2054 static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, 2055 enum 
ib_uverbs_advise_mr_advice advice, 2056 u32 pf_flags, struct ib_sge *sg_list, 2057 u32 num_sge) 2058 { 2059 u32 bytes_mapped = 0; 2060 int ret = 0; 2061 u32 i; 2062 2063 for (i = 0; i < num_sge; ++i) { 2064 struct mlx5_ib_mr *mr; 2065 2066 mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey); 2067 if (IS_ERR(mr)) 2068 return PTR_ERR(mr); 2069 ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length, 2070 &bytes_mapped, pf_flags, false); 2071 if (ret < 0) { 2072 mlx5r_deref_odp_mkey(&mr->mmkey); 2073 return ret; 2074 } 2075 mlx5_update_odp_stats(mr, prefetch, ret); 2076 mlx5r_deref_odp_mkey(&mr->mmkey); 2077 } 2078 2079 return 0; 2080 } 2081 2082 int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, 2083 enum ib_uverbs_advise_mr_advice advice, 2084 u32 flags, struct ib_sge *sg_list, u32 num_sge) 2085 { 2086 u32 pf_flags = 0; 2087 struct prefetch_mr_work *work; 2088 int rc; 2089 2090 if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH) 2091 pf_flags |= MLX5_PF_FLAGS_DOWNGRADE; 2092 2093 if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT) 2094 pf_flags |= MLX5_PF_FLAGS_SNAPSHOT; 2095 2096 if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH) 2097 return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list, 2098 num_sge); 2099 2100 work = kvzalloc_flex(*work, frags, num_sge); 2101 if (!work) 2102 return -ENOMEM; 2103 2104 rc = init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge); 2105 if (rc) { 2106 destroy_prefetch_work(work); 2107 return rc; 2108 } 2109 queue_work(system_dfl_wq, &work->work); 2110 return 0; 2111 } 2112