1 /* 2 * Copyright (c) 2014 Mellanox Technologies. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: 9 * 10 * Redistribution and use in source and binary forms, with or 11 * without modification, are permitted provided that the following 12 * conditions are met: 13 * 14 * - Redistributions of source code must retain the above 15 * copyright notice, this list of conditions and the following 16 * disclaimer. 17 * 18 * - Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer in the documentation and/or other materials 21 * provided with the distribution. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 */ 32 33 #include <rdma/ib_umem.h> 34 #include <rdma/ib_umem_odp.h> 35 36 #include "mlx5_ib.h" 37 38 #define MAX_PREFETCH_LEN (4*1024*1024U) 39 40 /* Timeout in ms to wait for an active mmu notifier to complete when handling 41 * a pagefault. */ 42 #define MMU_NOTIFIER_TIMEOUT 1000 43 44 struct workqueue_struct *mlx5_ib_page_fault_wq; 45 46 void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, 47 unsigned long end) 48 { 49 struct mlx5_ib_mr *mr; 50 const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1; 51 u64 idx = 0, blk_start_idx = 0; 52 int in_block = 0; 53 u64 addr; 54 55 if (!umem || !umem->odp_data) { 56 pr_err("invalidation called on NULL umem or non-ODP umem\n"); 57 return; 58 } 59 60 mr = umem->odp_data->private; 61 62 if (!mr || !mr->ibmr.pd) 63 return; 64 65 start = max_t(u64, ib_umem_start(umem), start); 66 end = min_t(u64, ib_umem_end(umem), end); 67 68 /* 69 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that 70 * while we are doing the invalidation, no page fault will attempt to 71 * overwrite the same MTTs. Concurent invalidations might race us, 72 * but they will write 0s as well, so no difference in the end result. 73 */ 74 75 for (addr = start; addr < end; addr += (u64)umem->page_size) { 76 idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; 77 /* 78 * Strive to write the MTTs in chunks, but avoid overwriting 79 * non-existing MTTs. The huristic here can be improved to 80 * estimate the cost of another UMR vs. the cost of bigger 81 * UMR. 82 */ 83 if (umem->odp_data->dma_list[idx] & 84 (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { 85 if (!in_block) { 86 blk_start_idx = idx; 87 in_block = 1; 88 } 89 } else { 90 u64 umr_offset = idx & umr_block_mask; 91 92 if (in_block && umr_offset == 0) { 93 mlx5_ib_update_mtt(mr, blk_start_idx, 94 idx - blk_start_idx, 1); 95 in_block = 0; 96 } 97 } 98 } 99 if (in_block) 100 mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1, 101 1); 102 103 /* 104 * We are now sure that the device will not access the 105 * memory. We can safely unmap it, and mark it as dirty if 106 * needed. 107 */ 108 109 ib_umem_odp_unmap_dma_pages(umem, start, end); 110 } 111 112 #define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do { \ 113 if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name) \ 114 ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name; \ 115 } while (0) 116 117 int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev) 118 { 119 int err; 120 struct mlx5_odp_caps hw_caps; 121 struct ib_odp_caps *caps = &dev->odp_caps; 122 123 memset(caps, 0, sizeof(*caps)); 124 125 if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)) 126 return 0; 127 128 err = mlx5_query_odp_caps(dev->mdev, &hw_caps); 129 if (err) 130 goto out; 131 132 caps->general_caps = IB_ODP_SUPPORT; 133 COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps, 134 SEND); 135 COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, 136 SEND); 137 COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, 138 RECV); 139 COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, 140 WRITE); 141 COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, 142 READ); 143 144 out: 145 return err; 146 } 147 148 static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, 149 u32 key) 150 { 151 u32 base_key = mlx5_base_mkey(key); 152 struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key); 153 struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr); 154 155 if (!mmr || mmr->key != key || !mr->live) 156 return NULL; 157 158 return container_of(mmr, struct mlx5_ib_mr, mmr); 159 } 160 161 static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, 162 struct mlx5_ib_pfault *pfault, 163 int error) { 164 struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); 165 int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn, 166 pfault->mpfault.flags, 167 error); 168 if (ret) 169 pr_err("Failed to resolve the page fault on QP 0x%x\n", 170 qp->mqp.qpn); 171 } 172 173 /* 174 * Handle a single data segment in a page-fault WQE. 175 * 176 * Returns number of pages retrieved on success. The caller will continue to 177 * the next data segment. 178 * Can return the following error codes: 179 * -EAGAIN to designate a temporary error. The caller will abort handling the 180 * page fault and resolve it. 181 * -EFAULT when there's an error mapping the requested pages. The caller will 182 * abort the page fault handling and possibly move the QP to an error state. 183 * On other errors the QP should also be closed with an error. 184 */ 185 static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, 186 struct mlx5_ib_pfault *pfault, 187 u32 key, u64 io_virt, size_t bcnt, 188 u32 *bytes_mapped) 189 { 190 struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device); 191 int srcu_key; 192 unsigned int current_seq; 193 u64 start_idx; 194 int npages = 0, ret = 0; 195 struct mlx5_ib_mr *mr; 196 u64 access_mask = ODP_READ_ALLOWED_BIT; 197 198 srcu_key = srcu_read_lock(&mib_dev->mr_srcu); 199 mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key); 200 /* 201 * If we didn't find the MR, it means the MR was closed while we were 202 * handling the ODP event. In this case we return -EFAULT so that the 203 * QP will be closed. 204 */ 205 if (!mr || !mr->ibmr.pd) { 206 pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n", 207 key); 208 ret = -EFAULT; 209 goto srcu_unlock; 210 } 211 if (!mr->umem->odp_data) { 212 pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", 213 key); 214 if (bytes_mapped) 215 *bytes_mapped += 216 (bcnt - pfault->mpfault.bytes_committed); 217 goto srcu_unlock; 218 } 219 if (mr->ibmr.pd != qp->ibqp.pd) { 220 pr_err("Page-fault with different PDs for QP and MR.\n"); 221 ret = -EFAULT; 222 goto srcu_unlock; 223 } 224 225 current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq); 226 /* 227 * Ensure the sequence number is valid for some time before we call 228 * gup. 229 */ 230 smp_rmb(); 231 232 /* 233 * Avoid branches - this code will perform correctly 234 * in all iterations (in iteration 2 and above, 235 * bytes_committed == 0). 236 */ 237 io_virt += pfault->mpfault.bytes_committed; 238 bcnt -= pfault->mpfault.bytes_committed; 239 240 start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT; 241 242 if (mr->umem->writable) 243 access_mask |= ODP_WRITE_ALLOWED_BIT; 244 npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt, 245 access_mask, current_seq); 246 if (npages < 0) { 247 ret = npages; 248 goto srcu_unlock; 249 } 250 251 if (npages > 0) { 252 mutex_lock(&mr->umem->odp_data->umem_mutex); 253 if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { 254 /* 255 * No need to check whether the MTTs really belong to 256 * this MR, since ib_umem_odp_map_dma_pages already 257 * checks this. 258 */ 259 ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0); 260 } else { 261 ret = -EAGAIN; 262 } 263 mutex_unlock(&mr->umem->odp_data->umem_mutex); 264 if (ret < 0) { 265 if (ret != -EAGAIN) 266 pr_err("Failed to update mkey page tables\n"); 267 goto srcu_unlock; 268 } 269 270 if (bytes_mapped) { 271 u32 new_mappings = npages * PAGE_SIZE - 272 (io_virt - round_down(io_virt, PAGE_SIZE)); 273 *bytes_mapped += min_t(u32, new_mappings, bcnt); 274 } 275 } 276 277 srcu_unlock: 278 if (ret == -EAGAIN) { 279 if (!mr->umem->odp_data->dying) { 280 struct ib_umem_odp *odp_data = mr->umem->odp_data; 281 unsigned long timeout = 282 msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); 283 284 if (!wait_for_completion_timeout( 285 &odp_data->notifier_completion, 286 timeout)) { 287 pr_warn("timeout waiting for mmu notifier completion\n"); 288 } 289 } else { 290 /* The MR is being killed, kill the QP as well. */ 291 ret = -EFAULT; 292 } 293 } 294 srcu_read_unlock(&mib_dev->mr_srcu, srcu_key); 295 pfault->mpfault.bytes_committed = 0; 296 return ret ? ret : npages; 297 } 298 299 /** 300 * Parse a series of data segments for page fault handling. 301 * 302 * @qp the QP on which the fault occurred. 303 * @pfault contains page fault information. 304 * @wqe points at the first data segment in the WQE. 305 * @wqe_end points after the end of the WQE. 306 * @bytes_mapped receives the number of bytes that the function was able to 307 * map. This allows the caller to decide intelligently whether 308 * enough memory was mapped to resolve the page fault 309 * successfully (e.g. enough for the next MTU, or the entire 310 * WQE). 311 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus 312 * the committed bytes). 313 * 314 * Returns the number of pages loaded if positive, zero for an empty WQE, or a 315 * negative error code. 316 */ 317 static int pagefault_data_segments(struct mlx5_ib_qp *qp, 318 struct mlx5_ib_pfault *pfault, void *wqe, 319 void *wqe_end, u32 *bytes_mapped, 320 u32 *total_wqe_bytes, int receive_queue) 321 { 322 int ret = 0, npages = 0; 323 u64 io_virt; 324 u32 key; 325 u32 byte_count; 326 size_t bcnt; 327 int inline_segment; 328 329 /* Skip SRQ next-WQE segment. */ 330 if (receive_queue && qp->ibqp.srq) 331 wqe += sizeof(struct mlx5_wqe_srq_next_seg); 332 333 if (bytes_mapped) 334 *bytes_mapped = 0; 335 if (total_wqe_bytes) 336 *total_wqe_bytes = 0; 337 338 while (wqe < wqe_end) { 339 struct mlx5_wqe_data_seg *dseg = wqe; 340 341 io_virt = be64_to_cpu(dseg->addr); 342 key = be32_to_cpu(dseg->lkey); 343 byte_count = be32_to_cpu(dseg->byte_count); 344 inline_segment = !!(byte_count & MLX5_INLINE_SEG); 345 bcnt = byte_count & ~MLX5_INLINE_SEG; 346 347 if (inline_segment) { 348 bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK; 349 wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt, 350 16); 351 } else { 352 wqe += sizeof(*dseg); 353 } 354 355 /* receive WQE end of sg list. */ 356 if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY && 357 io_virt == 0) 358 break; 359 360 if (!inline_segment && total_wqe_bytes) { 361 *total_wqe_bytes += bcnt - min_t(size_t, bcnt, 362 pfault->mpfault.bytes_committed); 363 } 364 365 /* A zero length data segment designates a length of 2GB. */ 366 if (bcnt == 0) 367 bcnt = 1U << 31; 368 369 if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) { 370 pfault->mpfault.bytes_committed -= 371 min_t(size_t, bcnt, 372 pfault->mpfault.bytes_committed); 373 continue; 374 } 375 376 ret = pagefault_single_data_segment(qp, pfault, key, io_virt, 377 bcnt, bytes_mapped); 378 if (ret < 0) 379 break; 380 npages += ret; 381 } 382 383 return ret < 0 ? ret : npages; 384 } 385 386 /* 387 * Parse initiator WQE. Advances the wqe pointer to point at the 388 * scatter-gather list, and set wqe_end to the end of the WQE. 389 */ 390 static int mlx5_ib_mr_initiator_pfault_handler( 391 struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, 392 void **wqe, void **wqe_end, int wqe_length) 393 { 394 struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); 395 struct mlx5_wqe_ctrl_seg *ctrl = *wqe; 396 u16 wqe_index = pfault->mpfault.wqe.wqe_index; 397 unsigned ds, opcode; 398 #if defined(DEBUG) 399 u32 ctrl_wqe_index, ctrl_qpn; 400 #endif 401 402 ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; 403 if (ds * MLX5_WQE_DS_UNITS > wqe_length) { 404 mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n", 405 ds, wqe_length); 406 return -EFAULT; 407 } 408 409 if (ds == 0) { 410 mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n", 411 wqe_index, qp->mqp.qpn); 412 return -EFAULT; 413 } 414 415 #if defined(DEBUG) 416 ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) & 417 MLX5_WQE_CTRL_WQE_INDEX_MASK) >> 418 MLX5_WQE_CTRL_WQE_INDEX_SHIFT; 419 if (wqe_index != ctrl_wqe_index) { 420 mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n", 421 wqe_index, qp->mqp.qpn, 422 ctrl_wqe_index); 423 return -EFAULT; 424 } 425 426 ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >> 427 MLX5_WQE_CTRL_QPN_SHIFT; 428 if (qp->mqp.qpn != ctrl_qpn) { 429 mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n", 430 wqe_index, qp->mqp.qpn, 431 ctrl_qpn); 432 return -EFAULT; 433 } 434 #endif /* DEBUG */ 435 436 *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS; 437 *wqe += sizeof(*ctrl); 438 439 opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & 440 MLX5_WQE_CTRL_OPCODE_MASK; 441 switch (qp->ibqp.qp_type) { 442 case IB_QPT_RC: 443 switch (opcode) { 444 case MLX5_OPCODE_SEND: 445 case MLX5_OPCODE_SEND_IMM: 446 case MLX5_OPCODE_SEND_INVAL: 447 if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & 448 IB_ODP_SUPPORT_SEND)) 449 goto invalid_transport_or_opcode; 450 break; 451 case MLX5_OPCODE_RDMA_WRITE: 452 case MLX5_OPCODE_RDMA_WRITE_IMM: 453 if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & 454 IB_ODP_SUPPORT_WRITE)) 455 goto invalid_transport_or_opcode; 456 *wqe += sizeof(struct mlx5_wqe_raddr_seg); 457 break; 458 case MLX5_OPCODE_RDMA_READ: 459 if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & 460 IB_ODP_SUPPORT_READ)) 461 goto invalid_transport_or_opcode; 462 *wqe += sizeof(struct mlx5_wqe_raddr_seg); 463 break; 464 default: 465 goto invalid_transport_or_opcode; 466 } 467 break; 468 case IB_QPT_UD: 469 switch (opcode) { 470 case MLX5_OPCODE_SEND: 471 case MLX5_OPCODE_SEND_IMM: 472 if (!(dev->odp_caps.per_transport_caps.ud_odp_caps & 473 IB_ODP_SUPPORT_SEND)) 474 goto invalid_transport_or_opcode; 475 *wqe += sizeof(struct mlx5_wqe_datagram_seg); 476 break; 477 default: 478 goto invalid_transport_or_opcode; 479 } 480 break; 481 default: 482 invalid_transport_or_opcode: 483 mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n", 484 qp->ibqp.qp_type, opcode); 485 return -EFAULT; 486 } 487 488 return 0; 489 } 490 491 /* 492 * Parse responder WQE. Advances the wqe pointer to point at the 493 * scatter-gather list, and set wqe_end to the end of the WQE. 494 */ 495 static int mlx5_ib_mr_responder_pfault_handler( 496 struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, 497 void **wqe, void **wqe_end, int wqe_length) 498 { 499 struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); 500 struct mlx5_ib_wq *wq = &qp->rq; 501 int wqe_size = 1 << wq->wqe_shift; 502 503 if (qp->ibqp.srq) { 504 mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n"); 505 return -EFAULT; 506 } 507 508 if (qp->wq_sig) { 509 mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n"); 510 return -EFAULT; 511 } 512 513 if (wqe_size > wqe_length) { 514 mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n"); 515 return -EFAULT; 516 } 517 518 switch (qp->ibqp.qp_type) { 519 case IB_QPT_RC: 520 if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & 521 IB_ODP_SUPPORT_RECV)) 522 goto invalid_transport_or_opcode; 523 break; 524 default: 525 invalid_transport_or_opcode: 526 mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n", 527 qp->ibqp.qp_type); 528 return -EFAULT; 529 } 530 531 *wqe_end = *wqe + wqe_size; 532 533 return 0; 534 } 535 536 static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, 537 struct mlx5_ib_pfault *pfault) 538 { 539 struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); 540 int ret; 541 void *wqe, *wqe_end; 542 u32 bytes_mapped, total_wqe_bytes; 543 char *buffer = NULL; 544 int resume_with_error = 0; 545 u16 wqe_index = pfault->mpfault.wqe.wqe_index; 546 int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR; 547 548 buffer = (char *)__get_free_page(GFP_KERNEL); 549 if (!buffer) { 550 mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); 551 resume_with_error = 1; 552 goto resolve_page_fault; 553 } 554 555 ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, 556 PAGE_SIZE); 557 if (ret < 0) { 558 mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n", 559 -ret, wqe_index, qp->mqp.qpn); 560 resume_with_error = 1; 561 goto resolve_page_fault; 562 } 563 564 wqe = buffer; 565 if (requestor) 566 ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe, 567 &wqe_end, ret); 568 else 569 ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe, 570 &wqe_end, ret); 571 if (ret < 0) { 572 resume_with_error = 1; 573 goto resolve_page_fault; 574 } 575 576 if (wqe >= wqe_end) { 577 mlx5_ib_err(dev, "ODP fault on invalid WQE.\n"); 578 resume_with_error = 1; 579 goto resolve_page_fault; 580 } 581 582 ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped, 583 &total_wqe_bytes, !requestor); 584 if (ret == -EAGAIN) { 585 goto resolve_page_fault; 586 } else if (ret < 0 || total_wqe_bytes > bytes_mapped) { 587 mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n", 588 -ret); 589 resume_with_error = 1; 590 goto resolve_page_fault; 591 } 592 593 resolve_page_fault: 594 mlx5_ib_page_fault_resume(qp, pfault, resume_with_error); 595 mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n", 596 qp->mqp.qpn, resume_with_error, pfault->mpfault.flags); 597 598 free_page((unsigned long)buffer); 599 } 600 601 static int pages_in_range(u64 address, u32 length) 602 { 603 return (ALIGN(address + length, PAGE_SIZE) - 604 (address & PAGE_MASK)) >> PAGE_SHIFT; 605 } 606 607 static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp, 608 struct mlx5_ib_pfault *pfault) 609 { 610 struct mlx5_pagefault *mpfault = &pfault->mpfault; 611 u64 address; 612 u32 length; 613 u32 prefetch_len = mpfault->bytes_committed; 614 int prefetch_activated = 0; 615 u32 rkey = mpfault->rdma.r_key; 616 int ret; 617 618 /* The RDMA responder handler handles the page fault in two parts. 619 * First it brings the necessary pages for the current packet 620 * (and uses the pfault context), and then (after resuming the QP) 621 * prefetches more pages. The second operation cannot use the pfault 622 * context and therefore uses the dummy_pfault context allocated on 623 * the stack */ 624 struct mlx5_ib_pfault dummy_pfault = {}; 625 626 dummy_pfault.mpfault.bytes_committed = 0; 627 628 mpfault->rdma.rdma_va += mpfault->bytes_committed; 629 mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed, 630 mpfault->rdma.rdma_op_len); 631 mpfault->bytes_committed = 0; 632 633 address = mpfault->rdma.rdma_va; 634 length = mpfault->rdma.rdma_op_len; 635 636 /* For some operations, the hardware cannot tell the exact message 637 * length, and in those cases it reports zero. Use prefetch 638 * logic. */ 639 if (length == 0) { 640 prefetch_activated = 1; 641 length = mpfault->rdma.packet_size; 642 prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); 643 } 644 645 ret = pagefault_single_data_segment(qp, pfault, rkey, address, length, 646 NULL); 647 if (ret == -EAGAIN) { 648 /* We're racing with an invalidation, don't prefetch */ 649 prefetch_activated = 0; 650 } else if (ret < 0 || pages_in_range(address, length) > ret) { 651 mlx5_ib_page_fault_resume(qp, pfault, 1); 652 return; 653 } 654 655 mlx5_ib_page_fault_resume(qp, pfault, 0); 656 657 /* At this point, there might be a new pagefault already arriving in 658 * the eq, switch to the dummy pagefault for the rest of the 659 * processing. We're still OK with the objects being alive as the 660 * work-queue is being fenced. */ 661 662 if (prefetch_activated) { 663 ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey, 664 address, 665 prefetch_len, 666 NULL); 667 if (ret < 0) { 668 pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n", 669 ret, prefetch_activated, 670 qp->ibqp.qp_num, address, prefetch_len); 671 } 672 } 673 } 674 675 void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, 676 struct mlx5_ib_pfault *pfault) 677 { 678 u8 event_subtype = pfault->mpfault.event_subtype; 679 680 switch (event_subtype) { 681 case MLX5_PFAULT_SUBTYPE_WQE: 682 mlx5_ib_mr_wqe_pfault_handler(qp, pfault); 683 break; 684 case MLX5_PFAULT_SUBTYPE_RDMA: 685 mlx5_ib_mr_rdma_pfault_handler(qp, pfault); 686 break; 687 default: 688 pr_warn("Invalid page fault event subtype: 0x%x\n", 689 event_subtype); 690 mlx5_ib_page_fault_resume(qp, pfault, 1); 691 break; 692 } 693 } 694 695 static void mlx5_ib_qp_pfault_action(struct work_struct *work) 696 { 697 struct mlx5_ib_pfault *pfault = container_of(work, 698 struct mlx5_ib_pfault, 699 work); 700 enum mlx5_ib_pagefault_context context = 701 mlx5_ib_get_pagefault_context(&pfault->mpfault); 702 struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp, 703 pagefaults[context]); 704 mlx5_ib_mr_pfault_handler(qp, pfault); 705 } 706 707 void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) 708 { 709 unsigned long flags; 710 711 spin_lock_irqsave(&qp->disable_page_faults_lock, flags); 712 qp->disable_page_faults = 1; 713 spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); 714 715 /* 716 * Note that at this point, we are guarenteed that no more 717 * work queue elements will be posted to the work queue with 718 * the QP we are closing. 719 */ 720 flush_workqueue(mlx5_ib_page_fault_wq); 721 } 722 723 void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) 724 { 725 unsigned long flags; 726 727 spin_lock_irqsave(&qp->disable_page_faults_lock, flags); 728 qp->disable_page_faults = 0; 729 spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); 730 } 731 732 static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp, 733 struct mlx5_pagefault *pfault) 734 { 735 /* 736 * Note that we will only get one fault event per QP per context 737 * (responder/initiator, read/write), until we resolve the page fault 738 * with the mlx5_ib_page_fault_resume command. Since this function is 739 * called from within the work element, there is no risk of missing 740 * events. 741 */ 742 struct mlx5_ib_qp *mibqp = to_mibqp(qp); 743 enum mlx5_ib_pagefault_context context = 744 mlx5_ib_get_pagefault_context(pfault); 745 struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context]; 746 747 qp_pfault->mpfault = *pfault; 748 749 /* No need to stop interrupts here since we are in an interrupt */ 750 spin_lock(&mibqp->disable_page_faults_lock); 751 if (!mibqp->disable_page_faults) 752 queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work); 753 spin_unlock(&mibqp->disable_page_faults_lock); 754 } 755 756 void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) 757 { 758 int i; 759 760 qp->disable_page_faults = 1; 761 spin_lock_init(&qp->disable_page_faults_lock); 762 763 qp->mqp.pfault_handler = mlx5_ib_pfault_handler; 764 765 for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i) 766 INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action); 767 } 768 769 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) 770 { 771 int ret; 772 773 ret = init_srcu_struct(&ibdev->mr_srcu); 774 if (ret) 775 return ret; 776 777 return 0; 778 } 779 780 void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) 781 { 782 cleanup_srcu_struct(&ibdev->mr_srcu); 783 } 784 785 int __init mlx5_ib_odp_init(void) 786 { 787 mlx5_ib_page_fault_wq = 788 create_singlethread_workqueue("mlx5_ib_page_faults"); 789 if (!mlx5_ib_page_fault_wq) 790 return -ENOMEM; 791 792 return 0; 793 } 794 795 void mlx5_ib_odp_cleanup(void) 796 { 797 destroy_workqueue(mlx5_ib_page_fault_wq); 798 } 799