1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB 2 /* 3 * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. 4 * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. 5 */ 6 7 #include <linux/libnvdimm.h> 8 9 #include "rxe.h" 10 #include "rxe_loc.h" 11 12 /* Return a random 8 bit key value that is 13 * different than the last_key. Set last_key to -1 14 * if this is the first key for an MR or MW 15 */ 16 u8 rxe_get_next_key(u32 last_key) 17 { 18 u8 key; 19 20 do { 21 get_random_bytes(&key, 1); 22 } while (key == last_key); 23 24 return key; 25 } 26 27 int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) 28 { 29 switch (mr->ibmr.type) { 30 case IB_MR_TYPE_DMA: 31 return 0; 32 33 case IB_MR_TYPE_USER: 34 case IB_MR_TYPE_MEM_REG: 35 if (iova < mr->ibmr.iova || 36 iova + length > mr->ibmr.iova + mr->ibmr.length) { 37 rxe_dbg_mr(mr, "iova/length out of range\n"); 38 return -EINVAL; 39 } 40 return 0; 41 42 default: 43 rxe_dbg_mr(mr, "mr type not supported\n"); 44 return -EINVAL; 45 } 46 } 47 48 void rxe_mr_init(int access, struct rxe_mr *mr) 49 { 50 u32 key = mr->elem.index << 8 | rxe_get_next_key(-1); 51 52 /* set ibmr->l/rkey and also copy into private l/rkey 53 * for user MRs these will always be the same 54 * for cases where caller 'owns' the key portion 55 * they may be different until REG_MR WQE is executed. 56 */ 57 mr->lkey = mr->ibmr.lkey = key; 58 mr->rkey = mr->ibmr.rkey = key; 59 60 mr->access = access; 61 mr->ibmr.page_size = PAGE_SIZE; 62 mr->page_mask = PAGE_MASK; 63 mr->page_shift = PAGE_SHIFT; 64 mr->state = RXE_MR_STATE_INVALID; 65 } 66 67 void rxe_mr_init_dma(int access, struct rxe_mr *mr) 68 { 69 rxe_mr_init(access, mr); 70 71 mr->state = RXE_MR_STATE_VALID; 72 mr->ibmr.type = IB_MR_TYPE_DMA; 73 } 74 75 static unsigned long rxe_mr_iova_to_index(struct rxe_mr *mr, u64 iova) 76 { 77 return (iova >> mr->page_shift) - (mr->ibmr.iova >> mr->page_shift); 78 } 79 80 static unsigned long rxe_mr_iova_to_page_offset(struct rxe_mr *mr, u64 iova) 81 { 82 return iova & (mr_page_size(mr) - 1); 83 } 84 85 static bool is_pmem_page(struct page *pg) 86 { 87 unsigned long paddr = page_to_phys(pg); 88 89 return REGION_INTERSECTS == 90 region_intersects(paddr, PAGE_SIZE, IORESOURCE_MEM, 91 IORES_DESC_PERSISTENT_MEMORY); 92 } 93 94 static int rxe_mr_fill_pages_from_sgt(struct rxe_mr *mr, struct sg_table *sgt) 95 { 96 XA_STATE(xas, &mr->page_list, 0); 97 struct sg_page_iter sg_iter; 98 struct page *page; 99 bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); 100 101 __sg_page_iter_start(&sg_iter, sgt->sgl, sgt->orig_nents, 0); 102 if (!__sg_page_iter_next(&sg_iter)) 103 return 0; 104 105 do { 106 xas_lock(&xas); 107 while (true) { 108 page = sg_page_iter_page(&sg_iter); 109 110 if (persistent && !is_pmem_page(page)) { 111 rxe_dbg_mr(mr, "Page can't be persistent\n"); 112 xas_set_err(&xas, -EINVAL); 113 break; 114 } 115 116 xas_store(&xas, page); 117 if (xas_error(&xas)) 118 break; 119 xas_next(&xas); 120 if (!__sg_page_iter_next(&sg_iter)) 121 break; 122 } 123 xas_unlock(&xas); 124 } while (xas_nomem(&xas, GFP_KERNEL)); 125 126 return xas_error(&xas); 127 } 128 129 int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, 130 int access, struct rxe_mr *mr) 131 { 132 struct ib_umem *umem; 133 int err; 134 135 rxe_mr_init(access, mr); 136 137 xa_init(&mr->page_list); 138 139 umem = ib_umem_get(&rxe->ib_dev, start, length, access); 140 if (IS_ERR(umem)) { 141 rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n", 142 (int)PTR_ERR(umem)); 143 return PTR_ERR(umem); 144 } 145 146 err = rxe_mr_fill_pages_from_sgt(mr, &umem->sgt_append.sgt); 147 if (err) { 148 ib_umem_release(umem); 149 return err; 150 } 151 152 mr->umem = umem; 153 mr->ibmr.type = IB_MR_TYPE_USER; 154 mr->state = RXE_MR_STATE_VALID; 155 156 return 0; 157 } 158 159 static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) 160 { 161 XA_STATE(xas, &mr->page_list, 0); 162 int i = 0; 163 int err; 164 165 xa_init(&mr->page_list); 166 167 do { 168 xas_lock(&xas); 169 while (i != num_buf) { 170 xas_store(&xas, XA_ZERO_ENTRY); 171 if (xas_error(&xas)) 172 break; 173 xas_next(&xas); 174 i++; 175 } 176 xas_unlock(&xas); 177 } while (xas_nomem(&xas, GFP_KERNEL)); 178 179 err = xas_error(&xas); 180 if (err) 181 return err; 182 183 mr->num_buf = num_buf; 184 185 return 0; 186 } 187 188 int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) 189 { 190 int err; 191 192 /* always allow remote access for FMRs */ 193 rxe_mr_init(RXE_ACCESS_REMOTE, mr); 194 195 err = rxe_mr_alloc(mr, max_pages); 196 if (err) 197 goto err1; 198 199 mr->state = RXE_MR_STATE_FREE; 200 mr->ibmr.type = IB_MR_TYPE_MEM_REG; 201 202 return 0; 203 204 err1: 205 return err; 206 } 207 208 static int rxe_set_page(struct ib_mr *ibmr, u64 dma_addr) 209 { 210 struct rxe_mr *mr = to_rmr(ibmr); 211 struct page *page = ib_virt_dma_to_page(dma_addr); 212 bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); 213 int err; 214 215 if (persistent && !is_pmem_page(page)) { 216 rxe_dbg_mr(mr, "Page cannot be persistent\n"); 217 return -EINVAL; 218 } 219 220 if (unlikely(mr->nbuf == mr->num_buf)) 221 return -ENOMEM; 222 223 err = xa_err(xa_store(&mr->page_list, mr->nbuf, page, GFP_KERNEL)); 224 if (err) 225 return err; 226 227 mr->nbuf++; 228 return 0; 229 } 230 231 int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl, 232 int sg_nents, unsigned int *sg_offset) 233 { 234 struct rxe_mr *mr = to_rmr(ibmr); 235 unsigned int page_size = mr_page_size(mr); 236 237 mr->nbuf = 0; 238 mr->page_shift = ilog2(page_size); 239 mr->page_mask = ~((u64)page_size - 1); 240 mr->page_offset = mr->ibmr.iova & (page_size - 1); 241 242 return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page); 243 } 244 245 static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr, 246 unsigned int length, enum rxe_mr_copy_dir dir) 247 { 248 unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova); 249 unsigned long index = rxe_mr_iova_to_index(mr, iova); 250 unsigned int bytes; 251 struct page *page; 252 void *va; 253 254 while (length) { 255 page = xa_load(&mr->page_list, index); 256 if (!page) 257 return -EFAULT; 258 259 bytes = min_t(unsigned int, length, 260 mr_page_size(mr) - page_offset); 261 va = kmap_local_page(page); 262 if (dir == RXE_FROM_MR_OBJ) 263 memcpy(addr, va + page_offset, bytes); 264 else 265 memcpy(va + page_offset, addr, bytes); 266 kunmap_local(va); 267 268 page_offset = 0; 269 addr += bytes; 270 length -= bytes; 271 index++; 272 } 273 274 return 0; 275 } 276 277 static void rxe_mr_copy_dma(struct rxe_mr *mr, u64 dma_addr, void *addr, 278 unsigned int length, enum rxe_mr_copy_dir dir) 279 { 280 unsigned int page_offset = dma_addr & (PAGE_SIZE - 1); 281 unsigned int bytes; 282 struct page *page; 283 u8 *va; 284 285 while (length) { 286 page = ib_virt_dma_to_page(dma_addr); 287 bytes = min_t(unsigned int, length, 288 PAGE_SIZE - page_offset); 289 va = kmap_local_page(page); 290 291 if (dir == RXE_TO_MR_OBJ) 292 memcpy(va + page_offset, addr, bytes); 293 else 294 memcpy(addr, va + page_offset, bytes); 295 296 kunmap_local(va); 297 page_offset = 0; 298 dma_addr += bytes; 299 addr += bytes; 300 length -= bytes; 301 } 302 } 303 304 int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, 305 unsigned int length, enum rxe_mr_copy_dir dir) 306 { 307 int err; 308 309 if (length == 0) 310 return 0; 311 312 if (WARN_ON(!mr)) 313 return -EINVAL; 314 315 if (mr->ibmr.type == IB_MR_TYPE_DMA) { 316 rxe_mr_copy_dma(mr, iova, addr, length, dir); 317 return 0; 318 } 319 320 err = mr_check_range(mr, iova, length); 321 if (unlikely(err)) { 322 rxe_dbg_mr(mr, "iova out of range\n"); 323 return err; 324 } 325 326 if (is_odp_mr(mr)) 327 return rxe_odp_mr_copy(mr, iova, addr, length, dir); 328 else 329 return rxe_mr_copy_xarray(mr, iova, addr, length, dir); 330 } 331 332 /* copy data in or out of a wqe, i.e. sg list 333 * under the control of a dma descriptor 334 */ 335 int copy_data( 336 struct rxe_pd *pd, 337 int access, 338 struct rxe_dma_info *dma, 339 void *addr, 340 int length, 341 enum rxe_mr_copy_dir dir) 342 { 343 int bytes; 344 struct rxe_sge *sge = &dma->sge[dma->cur_sge]; 345 int offset = dma->sge_offset; 346 int resid = dma->resid; 347 struct rxe_mr *mr = NULL; 348 u64 iova; 349 int err; 350 351 if (length == 0) 352 return 0; 353 354 if (length > resid) { 355 err = -EINVAL; 356 goto err2; 357 } 358 359 if (sge->length && (offset < sge->length)) { 360 mr = lookup_mr(pd, access, sge->lkey, RXE_LOOKUP_LOCAL); 361 if (!mr) { 362 err = -EINVAL; 363 goto err1; 364 } 365 } 366 367 while (length > 0) { 368 bytes = length; 369 370 if (offset >= sge->length) { 371 if (mr) { 372 rxe_put(mr); 373 mr = NULL; 374 } 375 sge++; 376 dma->cur_sge++; 377 offset = 0; 378 379 if (dma->cur_sge >= dma->num_sge) { 380 err = -ENOSPC; 381 goto err2; 382 } 383 384 if (sge->length) { 385 mr = lookup_mr(pd, access, sge->lkey, 386 RXE_LOOKUP_LOCAL); 387 if (!mr) { 388 err = -EINVAL; 389 goto err1; 390 } 391 } else { 392 continue; 393 } 394 } 395 396 if (bytes > sge->length - offset) 397 bytes = sge->length - offset; 398 399 if (bytes > 0) { 400 iova = sge->addr + offset; 401 err = rxe_mr_copy(mr, iova, addr, bytes, dir); 402 if (err) 403 goto err2; 404 405 offset += bytes; 406 resid -= bytes; 407 length -= bytes; 408 addr += bytes; 409 } 410 } 411 412 dma->sge_offset = offset; 413 dma->resid = resid; 414 415 if (mr) 416 rxe_put(mr); 417 418 return 0; 419 420 err2: 421 if (mr) 422 rxe_put(mr); 423 err1: 424 return err; 425 } 426 427 static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) 428 { 429 unsigned int page_offset; 430 unsigned long index; 431 struct page *page; 432 unsigned int bytes; 433 int err; 434 u8 *va; 435 436 err = mr_check_range(mr, iova, length); 437 if (err) 438 return err; 439 440 while (length > 0) { 441 index = rxe_mr_iova_to_index(mr, iova); 442 page = xa_load(&mr->page_list, index); 443 page_offset = rxe_mr_iova_to_page_offset(mr, iova); 444 if (!page) 445 return -EFAULT; 446 bytes = min_t(unsigned int, length, 447 mr_page_size(mr) - page_offset); 448 449 va = kmap_local_page(page); 450 arch_wb_cache_pmem(va + page_offset, bytes); 451 kunmap_local(va); 452 453 length -= bytes; 454 iova += bytes; 455 page_offset = 0; 456 } 457 458 return 0; 459 } 460 461 int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 start, unsigned int length) 462 { 463 int err; 464 465 /* mr must be valid even if length is zero */ 466 if (WARN_ON(!mr)) 467 return -EINVAL; 468 469 if (length == 0) 470 return 0; 471 472 if (mr->ibmr.type == IB_MR_TYPE_DMA) 473 return -EFAULT; 474 475 if (is_odp_mr(mr)) 476 err = rxe_odp_flush_pmem_iova(mr, start, length); 477 else 478 err = rxe_mr_flush_pmem_iova(mr, start, length); 479 480 return err; 481 } 482 483 /* Guarantee atomicity of atomic operations at the machine level. */ 484 DEFINE_SPINLOCK(atomic_ops_lock); 485 486 enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, 487 u64 compare, u64 swap_add, u64 *orig_val) 488 { 489 unsigned int page_offset; 490 struct page *page; 491 u64 value; 492 u64 *va; 493 494 if (unlikely(mr->state != RXE_MR_STATE_VALID)) { 495 rxe_dbg_mr(mr, "mr not in valid state\n"); 496 return RESPST_ERR_RKEY_VIOLATION; 497 } 498 499 if (mr->ibmr.type == IB_MR_TYPE_DMA) { 500 page_offset = iova & (PAGE_SIZE - 1); 501 page = ib_virt_dma_to_page(iova); 502 } else { 503 unsigned long index; 504 int err; 505 506 err = mr_check_range(mr, iova, sizeof(value)); 507 if (err) { 508 rxe_dbg_mr(mr, "iova out of range\n"); 509 return RESPST_ERR_RKEY_VIOLATION; 510 } 511 page_offset = rxe_mr_iova_to_page_offset(mr, iova); 512 index = rxe_mr_iova_to_index(mr, iova); 513 page = xa_load(&mr->page_list, index); 514 if (!page) 515 return RESPST_ERR_RKEY_VIOLATION; 516 } 517 518 if (unlikely(page_offset & 0x7)) { 519 rxe_dbg_mr(mr, "iova not aligned\n"); 520 return RESPST_ERR_MISALIGNED_ATOMIC; 521 } 522 523 va = kmap_local_page(page); 524 525 spin_lock_bh(&atomic_ops_lock); 526 value = *orig_val = va[page_offset >> 3]; 527 528 if (opcode == IB_OPCODE_RC_COMPARE_SWAP) { 529 if (value == compare) 530 va[page_offset >> 3] = swap_add; 531 } else { 532 value += swap_add; 533 va[page_offset >> 3] = value; 534 } 535 spin_unlock_bh(&atomic_ops_lock); 536 537 kunmap_local(va); 538 539 return RESPST_NONE; 540 } 541 542 enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) 543 { 544 unsigned int page_offset; 545 struct page *page; 546 u64 *va; 547 548 if (mr->ibmr.type == IB_MR_TYPE_DMA) { 549 page_offset = iova & (PAGE_SIZE - 1); 550 page = ib_virt_dma_to_page(iova); 551 } else { 552 unsigned long index; 553 int err; 554 555 /* See IBA oA19-28 */ 556 err = mr_check_range(mr, iova, sizeof(value)); 557 if (unlikely(err)) { 558 rxe_dbg_mr(mr, "iova out of range\n"); 559 return RESPST_ERR_RKEY_VIOLATION; 560 } 561 page_offset = rxe_mr_iova_to_page_offset(mr, iova); 562 index = rxe_mr_iova_to_index(mr, iova); 563 page = xa_load(&mr->page_list, index); 564 if (!page) 565 return RESPST_ERR_RKEY_VIOLATION; 566 } 567 568 /* See IBA A19.4.2 */ 569 if (unlikely(page_offset & 0x7)) { 570 rxe_dbg_mr(mr, "misaligned address\n"); 571 return RESPST_ERR_MISALIGNED_ATOMIC; 572 } 573 574 va = kmap_local_page(page); 575 /* Do atomic write after all prior operations have completed */ 576 smp_store_release(&va[page_offset >> 3], value); 577 kunmap_local(va); 578 579 return RESPST_NONE; 580 } 581 582 int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) 583 { 584 struct rxe_sge *sge = &dma->sge[dma->cur_sge]; 585 int offset = dma->sge_offset; 586 int resid = dma->resid; 587 588 while (length) { 589 unsigned int bytes; 590 591 if (offset >= sge->length) { 592 sge++; 593 dma->cur_sge++; 594 offset = 0; 595 if (dma->cur_sge >= dma->num_sge) 596 return -ENOSPC; 597 } 598 599 bytes = length; 600 601 if (bytes > sge->length - offset) 602 bytes = sge->length - offset; 603 604 offset += bytes; 605 resid -= bytes; 606 length -= bytes; 607 } 608 609 dma->sge_offset = offset; 610 dma->resid = resid; 611 612 return 0; 613 } 614 615 struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, 616 enum rxe_mr_lookup_type type) 617 { 618 struct rxe_mr *mr; 619 struct rxe_dev *rxe = to_rdev(pd->ibpd.device); 620 int index = key >> 8; 621 622 mr = rxe_pool_get_index(&rxe->mr_pool, index); 623 if (!mr) 624 return NULL; 625 626 if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) || 627 (type == RXE_LOOKUP_REMOTE && mr->rkey != key) || 628 mr_pd(mr) != pd || ((access & mr->access) != access) || 629 mr->state != RXE_MR_STATE_VALID)) { 630 rxe_put(mr); 631 mr = NULL; 632 } 633 634 return mr; 635 } 636 637 int rxe_invalidate_mr(struct rxe_qp *qp, u32 key) 638 { 639 struct rxe_dev *rxe = to_rdev(qp->ibqp.device); 640 struct rxe_mr *mr; 641 int remote; 642 int ret; 643 644 mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8); 645 if (!mr) { 646 rxe_dbg_qp(qp, "No MR for key %#x\n", key); 647 ret = -EINVAL; 648 goto err; 649 } 650 651 remote = mr->access & RXE_ACCESS_REMOTE; 652 if (remote ? (key != mr->rkey) : (key != mr->lkey)) { 653 rxe_dbg_mr(mr, "wr key (%#x) doesn't match mr key (%#x)\n", 654 key, (remote ? mr->rkey : mr->lkey)); 655 ret = -EINVAL; 656 goto err_drop_ref; 657 } 658 659 if (atomic_read(&mr->num_mw) > 0) { 660 rxe_dbg_mr(mr, "Attempt to invalidate an MR while bound to MWs\n"); 661 ret = -EINVAL; 662 goto err_drop_ref; 663 } 664 665 if (unlikely(mr->ibmr.type != IB_MR_TYPE_MEM_REG)) { 666 rxe_dbg_mr(mr, "Type (%d) is wrong\n", mr->ibmr.type); 667 ret = -EINVAL; 668 goto err_drop_ref; 669 } 670 671 mr->state = RXE_MR_STATE_FREE; 672 ret = 0; 673 674 err_drop_ref: 675 rxe_put(mr); 676 err: 677 return ret; 678 } 679 680 /* user can (re)register fast MR by executing a REG_MR WQE. 681 * user is expected to hold a reference on the ib mr until the 682 * WQE completes. 683 * Once a fast MR is created this is the only way to change the 684 * private keys. It is the responsibility of the user to maintain 685 * the ib mr keys in sync with rxe mr keys. 686 */ 687 int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe) 688 { 689 struct rxe_mr *mr = to_rmr(wqe->wr.wr.reg.mr); 690 u32 key = wqe->wr.wr.reg.key; 691 u32 access = wqe->wr.wr.reg.access; 692 693 /* user can only register MR in free state */ 694 if (unlikely(mr->state != RXE_MR_STATE_FREE)) { 695 rxe_dbg_mr(mr, "mr->lkey = 0x%x not free\n", mr->lkey); 696 return -EINVAL; 697 } 698 699 /* user can only register mr with qp in same protection domain */ 700 if (unlikely(qp->ibqp.pd != mr->ibmr.pd)) { 701 rxe_dbg_mr(mr, "qp->pd and mr->pd don't match\n"); 702 return -EINVAL; 703 } 704 705 /* user is only allowed to change key portion of l/rkey */ 706 if (unlikely((mr->lkey & ~0xff) != (key & ~0xff))) { 707 rxe_dbg_mr(mr, "key = 0x%x has wrong index mr->lkey = 0x%x\n", 708 key, mr->lkey); 709 return -EINVAL; 710 } 711 712 mr->access = access; 713 mr->lkey = key; 714 mr->rkey = key; 715 mr->ibmr.iova = wqe->wr.wr.reg.mr->iova; 716 mr->state = RXE_MR_STATE_VALID; 717 718 return 0; 719 } 720 721 void rxe_mr_cleanup(struct rxe_pool_elem *elem) 722 { 723 struct rxe_mr *mr = container_of(elem, typeof(*mr), elem); 724 725 rxe_put(mr_pd(mr)); 726 ib_umem_release(mr->umem); 727 728 if (mr->ibmr.type != IB_MR_TYPE_DMA) 729 xa_destroy(&mr->page_list); 730 } 731