/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

#include "mlx5_ib.h"

#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000

struct workqueue_struct *mlx5_ib_page_fault_wq;

void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
			      unsigned long end)
{
	struct mlx5_ib_mr *mr;
	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
	u64 idx = 0, blk_start_idx = 0;
	int in_block = 0;
	u64 addr;

	if (!umem || !umem->odp_data) {
		pr_err("invalidation called on NULL umem or non-ODP umem\n");
		return;
	}

	mr = umem->odp_data->private;

	if (!mr || !mr->ibmr.pd)
		return;

	start = max_t(u64, ib_umem_start(umem), start);
	end = min_t(u64, ib_umem_end(umem), end);

	/*
	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
	 * while we are doing the invalidation, no page fault will attempt to
	 * overwrite the same MTTs. Concurrent invalidations might race us,
	 * but they will write 0s as well, so no difference in the end result.
	 */

	for (addr = start; addr < end; addr += (u64)umem->page_size) {
		idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
		/*
		 * Strive to write the MTTs in chunks, but avoid overwriting
		 * non-existing MTTs. The heuristic here can be improved to
		 * estimate the cost of another UMR vs. the cost of a bigger
		 * UMR.
		 */
		if (umem->odp_data->dma_list[idx] &
		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}
		} else {
			u64 umr_offset = idx & umr_block_mask;

			if (in_block && umr_offset == 0) {
				mlx5_ib_update_mtt(mr, blk_start_idx,
						   idx - blk_start_idx, 1);
				in_block = 0;
			}
		}
	}
	if (in_block)
		mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1,
				   1);

	/*
	 * We are now sure that the device will not access the memory. We can
	 * safely unmap it, and mark it as dirty if needed.
	 */

	ib_umem_odp_unmap_dma_pages(umem, start, end);
}
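
/*
 * Read the device's ODP capability bits and cache them in dev->odp_caps.
 * The capabilities are left zeroed when the device does not support paging
 * (the "pg" capability).
 */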
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
	struct ib_odp_caps *caps = &dev->odp_caps;

	memset(caps, 0, sizeof(*caps));

	if (!MLX5_CAP_GEN(dev->mdev, pg))
		return;

	caps->general_caps = IB_ODP_SUPPORT;

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

	return;
}

static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
						   u32 key)
{
	u32 base_key = mlx5_base_mkey(key);
	struct mlx5_core_mkey *mmkey = __mlx5_mr_lookup(dev->mdev, base_key);
	struct mlx5_ib_mr *mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

	if (!mmkey || mmkey->key != key || !mr->live)
		return NULL;

	return container_of(mmkey, struct mlx5_ib_mr, mmkey);
}
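
/*
 * Tell the device to resume processing of the WQE that triggered the page
 * fault on the given QP, reporting whether the fault was resolved
 * successfully or with an error. A failure of the resume command itself is
 * only logged; there is nothing more we can do here.
 */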
static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
				      struct mlx5_ib_pfault *pfault,
				      int error)
{
	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
	u32 qpn = qp->trans_qp.base.mqp.qpn;
	int ret = mlx5_core_page_fault_resume(dev->mdev,
					      qpn,
					      pfault->mpfault.flags,
					      error);
	if (ret)
		pr_err("Failed to resolve the page fault on QP 0x%x\n", qpn);
}

/*
 * Handle a single data segment in a page-fault WQE.
 *
 * Returns number of pages retrieved on success. The caller will continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling and possibly move the QP to an error state.
 * On other errors the QP should also be closed with an error.
 */
static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
					 struct mlx5_ib_pfault *pfault,
					 u32 key, u64 io_virt, size_t bcnt,
					 u32 *bytes_mapped)
{
	struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device);
	int srcu_key;
	unsigned int current_seq;
	u64 start_idx;
	int npages = 0, ret = 0;
	struct mlx5_ib_mr *mr;
	u64 access_mask = ODP_READ_ALLOWED_BIT;

	srcu_key = srcu_read_lock(&mib_dev->mr_srcu);
	mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key);
	/*
	 * If we didn't find the MR, it means the MR was closed while we were
	 * handling the ODP event. In this case we return -EFAULT so that the
	 * QP will be closed.
	 */
	if (!mr || !mr->ibmr.pd) {
		pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
		       key);
		ret = -EFAULT;
		goto srcu_unlock;
	}
	if (!mr->umem->odp_data) {
		pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
			 key);
		if (bytes_mapped)
			*bytes_mapped +=
				(bcnt - pfault->mpfault.bytes_committed);
		goto srcu_unlock;
	}
	if (mr->ibmr.pd != qp->ibqp.pd) {
		pr_err("Page-fault with different PDs for QP and MR.\n");
		ret = -EFAULT;
		goto srcu_unlock;
	}

	current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
	/*
	 * Ensure the sequence number is valid for some time before we call
	 * gup.
	 */
	smp_rmb();

	/*
	 * Avoid branches - this code will perform correctly
	 * in all iterations (in iteration 2 and above,
	 * bytes_committed == 0).
	 */
	io_virt += pfault->mpfault.bytes_committed;
	bcnt -= pfault->mpfault.bytes_committed;

	start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT;

	if (mr->umem->writable)
		access_mask |= ODP_WRITE_ALLOWED_BIT;
	npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt,
					   access_mask, current_seq);
	if (npages < 0) {
		ret = npages;
		goto srcu_unlock;
	}

	if (npages > 0) {
		mutex_lock(&mr->umem->odp_data->umem_mutex);
		if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
			/*
			 * No need to check whether the MTTs really belong to
			 * this MR, since ib_umem_odp_map_dma_pages already
			 * checks this.
			 */
			ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
		} else {
			ret = -EAGAIN;
		}
		mutex_unlock(&mr->umem->odp_data->umem_mutex);
		if (ret < 0) {
			if (ret != -EAGAIN)
				pr_err("Failed to update mkey page tables\n");
			goto srcu_unlock;
		}

		if (bytes_mapped) {
			u32 new_mappings = npages * PAGE_SIZE -
				(io_virt - round_down(io_virt, PAGE_SIZE));
			*bytes_mapped += min_t(u32, new_mappings, bcnt);
		}
	}

srcu_unlock:
	if (ret == -EAGAIN) {
		if (!mr->umem->odp_data->dying) {
			struct ib_umem_odp *odp_data = mr->umem->odp_data;
			unsigned long timeout =
				msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);

			if (!wait_for_completion_timeout(
					&odp_data->notifier_completion,
					timeout)) {
				pr_warn("timeout waiting for mmu notifier completion\n");
			}
		} else {
			/* The MR is being killed, kill the QP as well. */
			ret = -EFAULT;
		}
	}
	srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
	pfault->mpfault.bytes_committed = 0;
	return ret ? ret : npages;
}

/**
 * Parse a series of data segments for page fault handling.
 *
 * @qp the QP on which the fault occurred.
 * @pfault contains page fault information.
 * @wqe points at the first data segment in the WQE.
 * @wqe_end points after the end of the WQE.
 * @bytes_mapped receives the number of bytes that the function was able to
 *               map. This allows the caller to decide intelligently whether
 *               enough memory was mapped to resolve the page fault
 *               successfully (e.g. enough for the next MTU, or the entire
 *               WQE).
 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
 *                  the committed bytes).
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_qp *qp,
				   struct mlx5_ib_pfault *pfault, void *wqe,
				   void *wqe_end, u32 *bytes_mapped,
				   u32 *total_wqe_bytes, int receive_queue)
{
	int ret = 0, npages = 0;
	u64 io_virt;
	u32 key;
	u32 byte_count;
	size_t bcnt;
	int inline_segment;

	/* Skip SRQ next-WQE segment. */
	if (receive_queue && qp->ibqp.srq)
		wqe += sizeof(struct mlx5_wqe_srq_next_seg);

	if (bytes_mapped)
		*bytes_mapped = 0;
	if (total_wqe_bytes)
		*total_wqe_bytes = 0;

	while (wqe < wqe_end) {
		struct mlx5_wqe_data_seg *dseg = wqe;

		io_virt = be64_to_cpu(dseg->addr);
		key = be32_to_cpu(dseg->lkey);
		byte_count = be32_to_cpu(dseg->byte_count);
		inline_segment = !!(byte_count & MLX5_INLINE_SEG);
		bcnt = byte_count & ~MLX5_INLINE_SEG;

		if (inline_segment) {
			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
				     16);
		} else {
			wqe += sizeof(*dseg);
		}

		/* receive WQE end of sg list. */
		if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
		    io_virt == 0)
			break;

		if (!inline_segment && total_wqe_bytes) {
			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
					pfault->mpfault.bytes_committed);
		}

		/* A zero length data segment designates a length of 2GB. */
		if (bcnt == 0)
			bcnt = 1U << 31;

		if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) {
			pfault->mpfault.bytes_committed -=
				min_t(size_t, bcnt,
				      pfault->mpfault.bytes_committed);
			continue;
		}

		ret = pagefault_single_data_segment(qp, pfault, key, io_virt,
						    bcnt, bytes_mapped);
		if (ret < 0)
			break;
		npages += ret;
	}

	return ret < 0 ? ret : npages;
}

/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
	struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
	void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
	u16 wqe_index = pfault->mpfault.wqe.wqe_index;
	unsigned ds, opcode;
#if defined(DEBUG)
	u32 ctrl_wqe_index, ctrl_qpn;
#endif
	u32 qpn = qp->trans_qp.base.mqp.qpn;

	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
			    ds, wqe_length);
		return -EFAULT;
	}

	if (ds == 0) {
		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
			    wqe_index, qpn);
		return -EFAULT;
	}

#if defined(DEBUG)
	ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
			MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
			MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
	if (wqe_index != ctrl_wqe_index) {
		mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
			    wqe_index, qpn,
			    ctrl_wqe_index);
		return -EFAULT;
	}

	ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
		MLX5_WQE_CTRL_QPN_SHIFT;
	if (qpn != ctrl_qpn) {
		mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
			    wqe_index, qpn,
			    ctrl_qpn);
		return -EFAULT;
	}
#endif /* DEBUG */

	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
	*wqe += sizeof(*ctrl);

	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
		 MLX5_WQE_CTRL_OPCODE_MASK;
	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
		switch (opcode) {
		case MLX5_OPCODE_SEND:
		case MLX5_OPCODE_SEND_IMM:
		case MLX5_OPCODE_SEND_INVAL:
			if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
			      IB_ODP_SUPPORT_SEND))
				goto invalid_transport_or_opcode;
			break;
		case MLX5_OPCODE_RDMA_WRITE:
		case MLX5_OPCODE_RDMA_WRITE_IMM:
			if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
			      IB_ODP_SUPPORT_WRITE))
				goto invalid_transport_or_opcode;
			*wqe += sizeof(struct mlx5_wqe_raddr_seg);
			break;
		case MLX5_OPCODE_RDMA_READ:
			if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
			      IB_ODP_SUPPORT_READ))
				goto invalid_transport_or_opcode;
			*wqe += sizeof(struct mlx5_wqe_raddr_seg);
			break;
		default:
			goto invalid_transport_or_opcode;
		}
		break;
	case IB_QPT_UD:
		switch (opcode) {
		case MLX5_OPCODE_SEND:
		case MLX5_OPCODE_SEND_IMM:
			if (!(dev->odp_caps.per_transport_caps.ud_odp_caps &
			      IB_ODP_SUPPORT_SEND))
				goto invalid_transport_or_opcode;
			*wqe += sizeof(struct mlx5_wqe_datagram_seg);
			break;
		default:
			goto invalid_transport_or_opcode;
		}
		break;
	default:
invalid_transport_or_opcode:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n",
			    qp->ibqp.qp_type, opcode);
		return -EFAULT;
	}

	return 0;
}

/*
 * Parse responder WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler(
	struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
	void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
	struct mlx5_ib_wq *wq = &qp->rq;
	int wqe_size = 1 << wq->wqe_shift;

	if (qp->ibqp.srq) {
		mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
		return -EFAULT;
	}

	if (qp->wq_sig) {
		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
		return -EFAULT;
	}

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
		if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
		      IB_ODP_SUPPORT_RECV))
			goto invalid_transport_or_opcode;
		break;
	default:
invalid_transport_or_opcode:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
			    qp->ibqp.qp_type);
		return -EFAULT;
	}

	*wqe_end = *wqe + wqe_size;

	return 0;
}
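
/*
 * Handle a WQE page fault: read the faulting WQE from the user's send or
 * receive queue, parse its data segments and fault in the pages they
 * reference, then resume the QP (with an error if the fault could not be
 * resolved).
 */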
static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
					  struct mlx5_ib_pfault *pfault)
{
	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
	int ret;
	void *wqe, *wqe_end;
	u32 bytes_mapped, total_wqe_bytes;
	char *buffer = NULL;
	int resume_with_error = 0;
	u16 wqe_index = pfault->mpfault.wqe.wqe_index;
	int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR;
	u32 qpn = qp->trans_qp.base.mqp.qpn;

	buffer = (char *)__get_free_page(GFP_KERNEL);
	if (!buffer) {
		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
		resume_with_error = 1;
		goto resolve_page_fault;
	}

	ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
				    PAGE_SIZE, &qp->trans_qp.base);
	if (ret < 0) {
		mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n",
			    -ret, wqe_index, qpn);
		resume_with_error = 1;
		goto resolve_page_fault;
	}

	wqe = buffer;
	if (requestor)
		ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe,
							  &wqe_end, ret);
	else
		ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe,
							  &wqe_end, ret);
	if (ret < 0) {
		resume_with_error = 1;
		goto resolve_page_fault;
	}

	if (wqe >= wqe_end) {
		mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
		resume_with_error = 1;
		goto resolve_page_fault;
	}

	ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped,
				      &total_wqe_bytes, !requestor);
	if (ret == -EAGAIN) {
		goto resolve_page_fault;
	} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
		mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n",
			    -ret);
		resume_with_error = 1;
		goto resolve_page_fault;
	}

resolve_page_fault:
	mlx5_ib_page_fault_resume(qp, pfault, resume_with_error);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n",
		    qpn, resume_with_error,
		    pfault->mpfault.flags);

	free_page((unsigned long)buffer);
}
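
/* Number of pages spanned by the range [address, address + length). */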
static int pages_in_range(u64 address, u32 length)
{
	return (ALIGN(address + length, PAGE_SIZE) -
		(address & PAGE_MASK)) >> PAGE_SHIFT;
}

static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp,
					   struct mlx5_ib_pfault *pfault)
{
	struct mlx5_pagefault *mpfault = &pfault->mpfault;
	u64 address;
	u32 length;
	u32 prefetch_len = mpfault->bytes_committed;
	int prefetch_activated = 0;
	u32 rkey = mpfault->rdma.r_key;
	int ret;

	/* The RDMA responder handler handles the page fault in two parts.
	 * First it brings the necessary pages for the current packet
	 * (and uses the pfault context), and then (after resuming the QP)
	 * prefetches more pages. The second operation cannot use the pfault
	 * context and therefore uses the dummy_pfault context allocated on
	 * the stack. */
	struct mlx5_ib_pfault dummy_pfault = {};

	dummy_pfault.mpfault.bytes_committed = 0;

	mpfault->rdma.rdma_va += mpfault->bytes_committed;
	mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed,
					 mpfault->rdma.rdma_op_len);
	mpfault->bytes_committed = 0;

	address = mpfault->rdma.rdma_va;
	length = mpfault->rdma.rdma_op_len;

	/* For some operations, the hardware cannot tell the exact message
	 * length, and in those cases it reports zero. Use prefetch
	 * logic. */
	if (length == 0) {
		prefetch_activated = 1;
		length = mpfault->rdma.packet_size;
		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
	}

	ret = pagefault_single_data_segment(qp, pfault, rkey, address, length,
					    NULL);
	if (ret == -EAGAIN) {
		/* We're racing with an invalidation, don't prefetch */
		prefetch_activated = 0;
	} else if (ret < 0 || pages_in_range(address, length) > ret) {
		mlx5_ib_page_fault_resume(qp, pfault, 1);
		return;
	}

	mlx5_ib_page_fault_resume(qp, pfault, 0);

	/* At this point, there might be a new pagefault already arriving in
	 * the eq, switch to the dummy pagefault for the rest of the
	 * processing. We're still OK with the objects being alive as the
	 * work-queue is being fenced. */

	if (prefetch_activated) {
		ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey,
						    address,
						    prefetch_len,
						    NULL);
		if (ret < 0) {
			pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n",
				ret, prefetch_activated,
				qp->ibqp.qp_num, address, prefetch_len);
		}
	}
}

void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
			       struct mlx5_ib_pfault *pfault)
{
	u8 event_subtype = pfault->mpfault.event_subtype;

	switch (event_subtype) {
	case MLX5_PFAULT_SUBTYPE_WQE:
		mlx5_ib_mr_wqe_pfault_handler(qp, pfault);
		break;
	case MLX5_PFAULT_SUBTYPE_RDMA:
		mlx5_ib_mr_rdma_pfault_handler(qp, pfault);
		break;
	default:
		pr_warn("Invalid page fault event subtype: 0x%x\n",
			event_subtype);
		mlx5_ib_page_fault_resume(qp, pfault, 1);
		break;
	}
}

static void mlx5_ib_qp_pfault_action(struct work_struct *work)
{
	struct mlx5_ib_pfault *pfault = container_of(work,
						     struct mlx5_ib_pfault,
						     work);
	enum mlx5_ib_pagefault_context context =
		mlx5_ib_get_pagefault_context(&pfault->mpfault);
	struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp,
					     pagefaults[context]);
	mlx5_ib_mr_pfault_handler(qp, pfault);
}
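
/*
 * Prevent new page-fault work items from being queued for this QP, then
 * flush the page-fault workqueue so that any handler that is already
 * queued or running has completed before we return.
 */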
void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp)
{
	unsigned long flags;

	spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
	qp->disable_page_faults = 1;
	spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);

	/*
	 * Note that at this point, we are guaranteed that no more
	 * work queue elements will be posted to the work queue with
	 * the QP we are closing.
	 */
	flush_workqueue(mlx5_ib_page_fault_wq);
}

void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)
{
	unsigned long flags;

	spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
	qp->disable_page_faults = 0;
	spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
}

static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp,
				   struct mlx5_pagefault *pfault)
{
	/*
	 * Note that we will only get one fault event per QP per context
	 * (responder/initiator, read/write), until we resolve the page fault
	 * with the mlx5_ib_page_fault_resume command. Since this function is
	 * called from within the work element, there is no risk of missing
	 * events.
	 */
	struct mlx5_ib_qp *mibqp = to_mibqp(qp);
	enum mlx5_ib_pagefault_context context =
		mlx5_ib_get_pagefault_context(pfault);
	struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context];

	qp_pfault->mpfault = *pfault;

	/* No need to disable interrupts here since we are in an interrupt */
	spin_lock(&mibqp->disable_page_faults_lock);
	if (!mibqp->disable_page_faults)
		queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work);
	spin_unlock(&mibqp->disable_page_faults_lock);
}

void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
{
	int i;

	qp->disable_page_faults = 1;
	spin_lock_init(&qp->disable_page_faults_lock);

	qp->trans_qp.base.mqp.pfault_handler = mlx5_ib_pfault_handler;

	for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
		INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
}

int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
{
	int ret;

	ret = init_srcu_struct(&ibdev->mr_srcu);
	if (ret)
		return ret;

	return 0;
}

void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
{
	cleanup_srcu_struct(&ibdev->mr_srcu);
}

int __init mlx5_ib_odp_init(void)
{
	mlx5_ib_page_fault_wq =
		create_singlethread_workqueue("mlx5_ib_page_faults");
	if (!mlx5_ib_page_fault_wq)
		return -ENOMEM;

	return 0;
}

void mlx5_ib_odp_cleanup(void)
{
	destroy_workqueue(mlx5_ib_page_fault_wq);
}