// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
 * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
 */

#include <linux/libnvdimm.h>

#include "rxe.h"
#include "rxe_loc.h"

/* Return a random 8 bit key value that is
 * different than the last_key. Set last_key to -1
 * if this is the first key for an MR or MW
 */
u8 rxe_get_next_key(u32 last_key)
{
	u8 key;

	do {
		get_random_bytes(&key, 1);
	} while (key == last_key);

	return key;
}

int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length)
{
	switch (mr->ibmr.type) {
	case IB_MR_TYPE_DMA:
		return 0;

	case IB_MR_TYPE_USER:
	case IB_MR_TYPE_MEM_REG:
		if (iova < mr->ibmr.iova ||
		    iova + length > mr->ibmr.iova + mr->ibmr.length) {
			rxe_dbg_mr(mr, "iova/length out of range\n");
			return -EINVAL;
		}
		return 0;

	default:
		rxe_dbg_mr(mr, "mr type not supported\n");
		return -EINVAL;
	}
}

void rxe_mr_init(int access, struct rxe_mr *mr)
{
	u32 key = mr->elem.index << 8 | rxe_get_next_key(-1);

	/* set ibmr->l/rkey and also copy into private l/rkey
	 * for user MRs these will always be the same
	 * for cases where caller 'owns' the key portion
	 * they may be different until REG_MR WQE is executed.
	 */
	mr->lkey = mr->ibmr.lkey = key;
	mr->rkey = mr->ibmr.rkey = key;

	mr->access = access;
	mr->ibmr.page_size = PAGE_SIZE;
	mr->page_mask = PAGE_MASK;
	mr->page_shift = PAGE_SHIFT;
	mr->state = RXE_MR_STATE_INVALID;
}

void rxe_mr_init_dma(int access, struct rxe_mr *mr)
{
	rxe_mr_init(access, mr);

	mr->state = RXE_MR_STATE_VALID;
	mr->ibmr.type = IB_MR_TYPE_DMA;
}

static unsigned long rxe_mr_iova_to_index(struct rxe_mr *mr, u64 iova)
{
	return (iova >> mr->page_shift) - (mr->ibmr.iova >> mr->page_shift);
}

static unsigned long rxe_mr_iova_to_page_offset(struct rxe_mr *mr, u64 iova)
{
	return iova & (mr_page_size(mr) - 1);
}

static bool is_pmem_page(struct page *pg)
{
	unsigned long paddr = page_to_phys(pg);

	return REGION_INTERSECTS ==
	       region_intersects(paddr, PAGE_SIZE, IORESOURCE_MEM,
				 IORES_DESC_PERSISTENT_MEMORY);
}

static int rxe_mr_fill_pages_from_sgt(struct rxe_mr *mr, struct sg_table *sgt)
{
	XA_STATE(xas, &mr->page_list, 0);
	struct sg_page_iter sg_iter;
	struct page *page;
	bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT);

	__sg_page_iter_start(&sg_iter, sgt->sgl, sgt->orig_nents, 0);
	if (!__sg_page_iter_next(&sg_iter))
		return 0;

	do {
		xas_lock(&xas);
		while (true) {
			page = sg_page_iter_page(&sg_iter);

			if (persistent && !is_pmem_page(page)) {
				rxe_dbg_mr(mr, "Page can't be persistent\n");
				xas_set_err(&xas, -EINVAL);
				break;
			}

			xas_store(&xas, page);
			if (xas_error(&xas))
				break;
			xas_next(&xas);
			if (!__sg_page_iter_next(&sg_iter))
				break;
		}
		xas_unlock(&xas);
	} while (xas_nomem(&xas, GFP_KERNEL));

	return xas_error(&xas);
}

int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
		     int access, struct rxe_mr *mr)
{
	struct ib_umem *umem;
	int err;

	rxe_mr_init(access, mr);

	xa_init(&mr->page_list);

	umem = ib_umem_get(&rxe->ib_dev, start, length, access);
	if (IS_ERR(umem)) {
		rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n",
			   (int)PTR_ERR(umem));
		return PTR_ERR(umem);
	}

	err = rxe_mr_fill_pages_from_sgt(mr, &umem->sgt_append.sgt);
	if (err) {
		ib_umem_release(umem);
		return err;
	}

	mr->umem = umem;
	mr->ibmr.type = IB_MR_TYPE_USER;
	mr->state = RXE_MR_STATE_VALID;

	return 0;
}

static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf)
{
	XA_STATE(xas, &mr->page_list, 0);
	int i = 0;
	int err;

	xa_init(&mr->page_list);

	do {
		xas_lock(&xas);
		while (i != num_buf) {
			xas_store(&xas, XA_ZERO_ENTRY);
			if (xas_error(&xas))
				break;
			xas_next(&xas);
			i++;
		}
		xas_unlock(&xas);
	} while (xas_nomem(&xas, GFP_KERNEL));

	err = xas_error(&xas);
	if (err)
		return err;

	mr->num_buf = num_buf;

	return 0;
}

int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr)
{
	int err;

	/* always allow remote access for FMRs */
	rxe_mr_init(RXE_ACCESS_REMOTE, mr);

	err = rxe_mr_alloc(mr, max_pages);
	if (err)
		goto err1;

	mr->state = RXE_MR_STATE_FREE;
	mr->ibmr.type = IB_MR_TYPE_MEM_REG;

	return 0;

err1:
	return err;
}

static int rxe_set_page(struct ib_mr *ibmr, u64 dma_addr)
{
	struct rxe_mr *mr = to_rmr(ibmr);
	struct page *page = ib_virt_dma_to_page(dma_addr);
	bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT);
	int err;

	if (persistent && !is_pmem_page(page)) {
		rxe_dbg_mr(mr, "Page cannot be persistent\n");
		return -EINVAL;
	}

	if (unlikely(mr->nbuf == mr->num_buf))
		return -ENOMEM;

	err = xa_err(xa_store(&mr->page_list, mr->nbuf, page, GFP_KERNEL));
	if (err)
		return err;

	mr->nbuf++;
	return 0;
}

int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl,
		  int sg_nents, unsigned int *sg_offset)
{
	struct rxe_mr *mr = to_rmr(ibmr);
	unsigned int page_size = mr_page_size(mr);

	mr->nbuf = 0;
	mr->page_shift = ilog2(page_size);
	mr->page_mask = ~((u64)page_size - 1);
	mr->page_offset = mr->ibmr.iova & (page_size - 1);

	return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page);
}

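/* Copy a chunk of data between a kernel buffer and an MR backed by the
 * page_list xarray. The iova is translated to an xarray index plus a page
 * offset, and each page is mapped with kmap_local_page() only for the
 * duration of its memcpy.
 */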
static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr,
			      unsigned int length, enum rxe_mr_copy_dir dir)
{
	unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova);
	unsigned long index = rxe_mr_iova_to_index(mr, iova);
	unsigned int bytes;
	struct page *page;
	void *va;

	while (length) {
		page = xa_load(&mr->page_list, index);
		if (!page)
			return -EFAULT;

		bytes = min_t(unsigned int, length,
			      mr_page_size(mr) - page_offset);
		va = kmap_local_page(page);
		if (dir == RXE_FROM_MR_OBJ)
			memcpy(addr, va + page_offset, bytes);
		else
			memcpy(va + page_offset, addr, bytes);
		kunmap_local(va);

		page_offset = 0;
		addr += bytes;
		length -= bytes;
		index++;
	}

	return 0;
}

static void rxe_mr_copy_dma(struct rxe_mr *mr, u64 dma_addr, void *addr,
			    unsigned int length, enum rxe_mr_copy_dir dir)
{
	unsigned int page_offset = dma_addr & (PAGE_SIZE - 1);
	unsigned int bytes;
	struct page *page;
	u8 *va;

	while (length) {
		page = ib_virt_dma_to_page(dma_addr);
		bytes = min_t(unsigned int, length,
			      PAGE_SIZE - page_offset);
		va = kmap_local_page(page);

		if (dir == RXE_TO_MR_OBJ)
			memcpy(va + page_offset, addr, bytes);
		else
			memcpy(addr, va + page_offset, bytes);

		kunmap_local(va);
		page_offset = 0;
		dma_addr += bytes;
		addr += bytes;
		length -= bytes;
	}
}

int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
		unsigned int length, enum rxe_mr_copy_dir dir)
{
	int err;

	if (length == 0)
		return 0;

	if (WARN_ON(!mr))
		return -EINVAL;

	if (mr->ibmr.type == IB_MR_TYPE_DMA) {
		rxe_mr_copy_dma(mr, iova, addr, length, dir);
		return 0;
	}

	err = mr_check_range(mr, iova, length);
	if (unlikely(err)) {
		rxe_dbg_mr(mr, "iova out of range\n");
		return err;
	}

	if (is_odp_mr(mr))
		return rxe_odp_mr_copy(mr, iova, addr, length, dir);
	else
		return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
}

/* copy data in or out of a wqe, i.e. sg list
 * under the control of a dma descriptor
 */
int copy_data(
	struct rxe_pd *pd,
	int access,
	struct rxe_dma_info *dma,
	void *addr,
	int length,
	enum rxe_mr_copy_dir dir)
{
	int bytes;
	struct rxe_sge *sge = &dma->sge[dma->cur_sge];
	int offset = dma->sge_offset;
	int resid = dma->resid;
	struct rxe_mr *mr = NULL;
	u64 iova;
	int err;

	if (length == 0)
		return 0;

	if (length > resid) {
		err = -EINVAL;
		goto err2;
	}

	if (sge->length && (offset < sge->length)) {
		mr = lookup_mr(pd, access, sge->lkey, RXE_LOOKUP_LOCAL);
		if (!mr) {
			err = -EINVAL;
			goto err1;
		}
	}

	while (length > 0) {
		bytes = length;

		if (offset >= sge->length) {
			if (mr) {
				rxe_put(mr);
				mr = NULL;
			}
			sge++;
			dma->cur_sge++;
			offset = 0;

			if (dma->cur_sge >= dma->num_sge) {
				err = -ENOSPC;
				goto err2;
			}

			if (sge->length) {
				mr = lookup_mr(pd, access, sge->lkey,
					       RXE_LOOKUP_LOCAL);
				if (!mr) {
					err = -EINVAL;
					goto err1;
				}
			} else {
				continue;
			}
		}

		if (bytes > sge->length - offset)
			bytes = sge->length - offset;

		if (bytes > 0) {
			iova = sge->addr + offset;
			err = rxe_mr_copy(mr, iova, addr, bytes, dir);
			if (err)
				goto err2;

			offset += bytes;
			resid -= bytes;
			length -= bytes;
			addr += bytes;
		}
	}

	dma->sge_offset = offset;
	dma->resid = resid;

	if (mr)
		rxe_put(mr);

	return 0;

err2:
	if (mr)
		rxe_put(mr);
err1:
	return err;
}

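/* Write the data at [iova, iova + length) of a pmem-backed MR back to
 * persistent media, one page at a time, using arch_wb_cache_pmem() on the
 * temporarily mapped pages.
 */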
int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
{
	unsigned int page_offset;
	unsigned long index;
	struct page *page;
	unsigned int bytes;
	int err;
	u8 *va;

	/* mr must be valid even if length is zero */
	if (WARN_ON(!mr))
		return -EINVAL;

	if (length == 0)
		return 0;

	if (mr->ibmr.type == IB_MR_TYPE_DMA)
		return -EFAULT;

	err = mr_check_range(mr, iova, length);
	if (err)
		return err;

	while (length > 0) {
		index = rxe_mr_iova_to_index(mr, iova);
		page = xa_load(&mr->page_list, index);
		page_offset = rxe_mr_iova_to_page_offset(mr, iova);
		if (!page)
			return -EFAULT;
		bytes = min_t(unsigned int, length,
			      mr_page_size(mr) - page_offset);

		va = kmap_local_page(page);
		arch_wb_cache_pmem(va + page_offset, bytes);
		kunmap_local(va);

		length -= bytes;
		iova += bytes;
		page_offset = 0;
	}

	return 0;
}

/* Guarantee atomicity of atomic operations at the machine level. */
DEFINE_SPINLOCK(atomic_ops_lock);

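/* Perform an 8-byte compare & swap or fetch & add at iova on behalf of the
 * responder. The read-modify-write is done under atomic_ops_lock so that
 * concurrent atomics through rxe see a consistent value. Returns 0 on
 * success or a RESPST_* error code for the responder state machine.
 */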
int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
			u64 compare, u64 swap_add, u64 *orig_val)
{
	unsigned int page_offset;
	struct page *page;
	u64 value;
	u64 *va;

	if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
		rxe_dbg_mr(mr, "mr not in valid state\n");
		return RESPST_ERR_RKEY_VIOLATION;
	}

	if (mr->ibmr.type == IB_MR_TYPE_DMA) {
		page_offset = iova & (PAGE_SIZE - 1);
		page = ib_virt_dma_to_page(iova);
	} else {
		unsigned long index;
		int err;

		err = mr_check_range(mr, iova, sizeof(value));
		if (err) {
			rxe_dbg_mr(mr, "iova out of range\n");
			return RESPST_ERR_RKEY_VIOLATION;
		}
		page_offset = rxe_mr_iova_to_page_offset(mr, iova);
		index = rxe_mr_iova_to_index(mr, iova);
		page = xa_load(&mr->page_list, index);
		if (!page)
			return RESPST_ERR_RKEY_VIOLATION;
	}

	if (unlikely(page_offset & 0x7)) {
		rxe_dbg_mr(mr, "iova not aligned\n");
		return RESPST_ERR_MISALIGNED_ATOMIC;
	}

	va = kmap_local_page(page);

	spin_lock_bh(&atomic_ops_lock);
	value = *orig_val = va[page_offset >> 3];

	if (opcode == IB_OPCODE_RC_COMPARE_SWAP) {
		if (value == compare)
			va[page_offset >> 3] = swap_add;
	} else {
		value += swap_add;
		va[page_offset >> 3] = value;
	}
	spin_unlock_bh(&atomic_ops_lock);

	kunmap_local(va);

	return 0;
}

#if defined CONFIG_64BIT
/* only implemented or called for 64 bit architectures */
int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
{
	unsigned int page_offset;
	struct page *page;
	u64 *va;

	/* ODP is not supported right now. WIP. */
	if (is_odp_mr(mr))
		return RESPST_ERR_UNSUPPORTED_OPCODE;

	/* See IBA oA19-28 */
	if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
		rxe_dbg_mr(mr, "mr not in valid state\n");
		return RESPST_ERR_RKEY_VIOLATION;
	}

	if (mr->ibmr.type == IB_MR_TYPE_DMA) {
		page_offset = iova & (PAGE_SIZE - 1);
		page = ib_virt_dma_to_page(iova);
	} else {
		unsigned long index;
		int err;

		/* See IBA oA19-28 */
		err = mr_check_range(mr, iova, sizeof(value));
		if (unlikely(err)) {
			rxe_dbg_mr(mr, "iova out of range\n");
			return RESPST_ERR_RKEY_VIOLATION;
		}
		page_offset = rxe_mr_iova_to_page_offset(mr, iova);
		index = rxe_mr_iova_to_index(mr, iova);
		page = xa_load(&mr->page_list, index);
		if (!page)
			return RESPST_ERR_RKEY_VIOLATION;
	}

	/* See IBA A19.4.2 */
	if (unlikely(page_offset & 0x7)) {
		rxe_dbg_mr(mr, "misaligned address\n");
		return RESPST_ERR_MISALIGNED_ATOMIC;
	}

	va = kmap_local_page(page);

	/* Do atomic write after all prior operations have completed */
	smp_store_release(&va[page_offset >> 3], value);

	kunmap_local(va);

	return 0;
}
#else
int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
{
	return RESPST_ERR_UNSUPPORTED_OPCODE;
}
#endif

int advance_dma_data(struct rxe_dma_info *dma, unsigned int length)
{
	struct rxe_sge *sge = &dma->sge[dma->cur_sge];
	int offset = dma->sge_offset;
	int resid = dma->resid;

	while (length) {
		unsigned int bytes;

		if (offset >= sge->length) {
			sge++;
			dma->cur_sge++;
			offset = 0;
			if (dma->cur_sge >= dma->num_sge)
				return -ENOSPC;
		}

		bytes = length;

		if (bytes > sge->length - offset)
			bytes = sge->length - offset;

		offset += bytes;
		resid -= bytes;
		length -= bytes;
	}

	dma->sge_offset = offset;
	dma->resid = resid;

	return 0;
}

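/* Look up the MR matching an lkey (RXE_LOOKUP_LOCAL) or rkey
 * (RXE_LOOKUP_REMOTE) and take a reference on it. Returns NULL unless the
 * key, protection domain, requested access and MR state all check out;
 * the caller must drop the reference with rxe_put() when done with the MR.
 */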
struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
			 enum rxe_mr_lookup_type type)
{
	struct rxe_mr *mr;
	struct rxe_dev *rxe = to_rdev(pd->ibpd.device);
	int index = key >> 8;

	mr = rxe_pool_get_index(&rxe->mr_pool, index);
	if (!mr)
		return NULL;

	if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) ||
		     (type == RXE_LOOKUP_REMOTE && mr->rkey != key) ||
		     mr_pd(mr) != pd || ((access & mr->access) != access) ||
		     mr->state != RXE_MR_STATE_VALID)) {
		rxe_put(mr);
		mr = NULL;
	}

	return mr;
}

int rxe_invalidate_mr(struct rxe_qp *qp, u32 key)
{
	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
	struct rxe_mr *mr;
	int remote;
	int ret;

	mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8);
	if (!mr) {
		rxe_dbg_qp(qp, "No MR for key %#x\n", key);
		ret = -EINVAL;
		goto err;
	}

	remote = mr->access & RXE_ACCESS_REMOTE;
	if (remote ? (key != mr->rkey) : (key != mr->lkey)) {
		rxe_dbg_mr(mr, "wr key (%#x) doesn't match mr key (%#x)\n",
			   key, (remote ? mr->rkey : mr->lkey));
		ret = -EINVAL;
		goto err_drop_ref;
	}

	if (atomic_read(&mr->num_mw) > 0) {
		rxe_dbg_mr(mr, "Attempt to invalidate an MR while bound to MWs\n");
		ret = -EINVAL;
		goto err_drop_ref;
	}

	if (unlikely(mr->ibmr.type != IB_MR_TYPE_MEM_REG)) {
		rxe_dbg_mr(mr, "Type (%d) is wrong\n", mr->ibmr.type);
		ret = -EINVAL;
		goto err_drop_ref;
	}

	mr->state = RXE_MR_STATE_FREE;
	ret = 0;

err_drop_ref:
	rxe_put(mr);
err:
	return ret;
}

/* user can (re)register fast MR by executing a REG_MR WQE.
 * user is expected to hold a reference on the ib mr until the
 * WQE completes.
 * Once a fast MR is created this is the only way to change the
 * private keys. It is the responsibility of the user to maintain
 * the ib mr keys in sync with rxe mr keys.
 */
int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
{
	struct rxe_mr *mr = to_rmr(wqe->wr.wr.reg.mr);
	u32 key = wqe->wr.wr.reg.key;
	u32 access = wqe->wr.wr.reg.access;

	/* user can only register MR in free state */
	if (unlikely(mr->state != RXE_MR_STATE_FREE)) {
		rxe_dbg_mr(mr, "mr->lkey = 0x%x not free\n", mr->lkey);
		return -EINVAL;
	}

	/* user can only register mr with qp in same protection domain */
	if (unlikely(qp->ibqp.pd != mr->ibmr.pd)) {
		rxe_dbg_mr(mr, "qp->pd and mr->pd don't match\n");
		return -EINVAL;
	}

	/* user is only allowed to change key portion of l/rkey */
	if (unlikely((mr->lkey & ~0xff) != (key & ~0xff))) {
		rxe_dbg_mr(mr, "key = 0x%x has wrong index mr->lkey = 0x%x\n",
			   key, mr->lkey);
		return -EINVAL;
	}

	mr->access = access;
	mr->lkey = key;
	mr->rkey = key;
	mr->ibmr.iova = wqe->wr.wr.reg.mr->iova;
	mr->state = RXE_MR_STATE_VALID;

	return 0;
}

void rxe_mr_cleanup(struct rxe_pool_elem *elem)
{
	struct rxe_mr *mr = container_of(elem, typeof(*mr), elem);

	rxe_put(mr_pd(mr));
	ib_umem_release(mr->umem);

	if (mr->ibmr.type != IB_MR_TYPE_DMA)
		xa_destroy(&mr->page_list);
}