1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB 2 /* 3 * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. 4 * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. 5 */ 6 7 #include <linux/libnvdimm.h> 8 9 #include "rxe.h" 10 #include "rxe_loc.h" 11 12 /* Return a random 8 bit key value that is 13 * different than the last_key. Set last_key to -1 14 * if this is the first key for an MR or MW 15 */ 16 u8 rxe_get_next_key(u32 last_key) 17 { 18 u8 key; 19 20 do { 21 get_random_bytes(&key, 1); 22 } while (key == last_key); 23 24 return key; 25 } 26 27 int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) 28 { 29 switch (mr->ibmr.type) { 30 case IB_MR_TYPE_DMA: 31 return 0; 32 33 case IB_MR_TYPE_USER: 34 case IB_MR_TYPE_MEM_REG: 35 if (iova < mr->ibmr.iova || 36 iova + length > mr->ibmr.iova + mr->ibmr.length) { 37 rxe_dbg_mr(mr, "iova/length out of range\n"); 38 return -EINVAL; 39 } 40 return 0; 41 42 default: 43 rxe_dbg_mr(mr, "mr type not supported\n"); 44 return -EINVAL; 45 } 46 } 47 48 static void rxe_mr_init(int access, struct rxe_mr *mr) 49 { 50 u32 key = mr->elem.index << 8 | rxe_get_next_key(-1); 51 52 /* set ibmr->l/rkey and also copy into private l/rkey 53 * for user MRs these will always be the same 54 * for cases where caller 'owns' the key portion 55 * they may be different until REG_MR WQE is executed. 56 */ 57 mr->lkey = mr->ibmr.lkey = key; 58 mr->rkey = mr->ibmr.rkey = key; 59 60 mr->access = access; 61 mr->ibmr.page_size = PAGE_SIZE; 62 mr->page_mask = PAGE_MASK; 63 mr->page_shift = PAGE_SHIFT; 64 mr->state = RXE_MR_STATE_INVALID; 65 } 66 67 void rxe_mr_init_dma(int access, struct rxe_mr *mr) 68 { 69 rxe_mr_init(access, mr); 70 71 mr->state = RXE_MR_STATE_VALID; 72 mr->ibmr.type = IB_MR_TYPE_DMA; 73 } 74 75 static unsigned long rxe_mr_iova_to_index(struct rxe_mr *mr, u64 iova) 76 { 77 return (iova >> mr->page_shift) - (mr->ibmr.iova >> mr->page_shift); 78 } 79 80 static unsigned long rxe_mr_iova_to_page_offset(struct rxe_mr *mr, u64 iova) 81 { 82 return iova & (mr_page_size(mr) - 1); 83 } 84 85 static bool is_pmem_page(struct page *pg) 86 { 87 unsigned long paddr = page_to_phys(pg); 88 89 return REGION_INTERSECTS == 90 region_intersects(paddr, PAGE_SIZE, IORESOURCE_MEM, 91 IORES_DESC_PERSISTENT_MEMORY); 92 } 93 94 static int rxe_mr_fill_pages_from_sgt(struct rxe_mr *mr, struct sg_table *sgt) 95 { 96 XA_STATE(xas, &mr->page_list, 0); 97 struct sg_page_iter sg_iter; 98 struct page *page; 99 bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); 100 101 __sg_page_iter_start(&sg_iter, sgt->sgl, sgt->orig_nents, 0); 102 if (!__sg_page_iter_next(&sg_iter)) 103 return 0; 104 105 do { 106 xas_lock(&xas); 107 while (true) { 108 page = sg_page_iter_page(&sg_iter); 109 110 if (persistent && !is_pmem_page(page)) { 111 rxe_dbg_mr(mr, "Page can't be persistent\n"); 112 xas_set_err(&xas, -EINVAL); 113 break; 114 } 115 116 xas_store(&xas, page); 117 if (xas_error(&xas)) 118 break; 119 xas_next(&xas); 120 if (!__sg_page_iter_next(&sg_iter)) 121 break; 122 } 123 xas_unlock(&xas); 124 } while (xas_nomem(&xas, GFP_KERNEL)); 125 126 return xas_error(&xas); 127 } 128 129 int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, 130 int access, struct rxe_mr *mr) 131 { 132 struct ib_umem *umem; 133 int err; 134 135 rxe_mr_init(access, mr); 136 137 xa_init(&mr->page_list); 138 139 umem = ib_umem_get(&rxe->ib_dev, start, length, access); 140 if (IS_ERR(umem)) { 141 rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n", 142 (int)PTR_ERR(umem)); 143 return PTR_ERR(umem); 144 } 145 146 err = rxe_mr_fill_pages_from_sgt(mr, &umem->sgt_append.sgt); 147 if (err) { 148 ib_umem_release(umem); 149 return err; 150 } 151 152 mr->umem = umem; 153 mr->ibmr.type = IB_MR_TYPE_USER; 154 mr->state = RXE_MR_STATE_VALID; 155 156 return 0; 157 } 158 159 static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) 160 { 161 XA_STATE(xas, &mr->page_list, 0); 162 int i = 0; 163 int err; 164 165 xa_init(&mr->page_list); 166 167 do { 168 xas_lock(&xas); 169 while (i != num_buf) { 170 xas_store(&xas, XA_ZERO_ENTRY); 171 if (xas_error(&xas)) 172 break; 173 xas_next(&xas); 174 i++; 175 } 176 xas_unlock(&xas); 177 } while (xas_nomem(&xas, GFP_KERNEL)); 178 179 err = xas_error(&xas); 180 if (err) 181 return err; 182 183 mr->num_buf = num_buf; 184 185 return 0; 186 } 187 188 int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) 189 { 190 int err; 191 192 /* always allow remote access for FMRs */ 193 rxe_mr_init(RXE_ACCESS_REMOTE, mr); 194 195 err = rxe_mr_alloc(mr, max_pages); 196 if (err) 197 goto err1; 198 199 mr->state = RXE_MR_STATE_FREE; 200 mr->ibmr.type = IB_MR_TYPE_MEM_REG; 201 202 return 0; 203 204 err1: 205 return err; 206 } 207 208 static int rxe_set_page(struct ib_mr *ibmr, u64 dma_addr) 209 { 210 struct rxe_mr *mr = to_rmr(ibmr); 211 struct page *page = ib_virt_dma_to_page(dma_addr); 212 bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); 213 int err; 214 215 if (persistent && !is_pmem_page(page)) { 216 rxe_dbg_mr(mr, "Page cannot be persistent\n"); 217 return -EINVAL; 218 } 219 220 if (unlikely(mr->nbuf == mr->num_buf)) 221 return -ENOMEM; 222 223 err = xa_err(xa_store(&mr->page_list, mr->nbuf, page, GFP_KERNEL)); 224 if (err) 225 return err; 226 227 mr->nbuf++; 228 return 0; 229 } 230 231 int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl, 232 int sg_nents, unsigned int *sg_offset) 233 { 234 struct rxe_mr *mr = to_rmr(ibmr); 235 unsigned int page_size = mr_page_size(mr); 236 237 mr->nbuf = 0; 238 mr->page_shift = ilog2(page_size); 239 mr->page_mask = ~((u64)page_size - 1); 240 mr->page_offset = mr->ibmr.iova & (page_size - 1); 241 242 return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page); 243 } 244 245 static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr, 246 unsigned int length, enum rxe_mr_copy_dir dir) 247 { 248 unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova); 249 unsigned long index = rxe_mr_iova_to_index(mr, iova); 250 unsigned int bytes; 251 struct page *page; 252 void *va; 253 254 while (length) { 255 page = xa_load(&mr->page_list, index); 256 if (!page) 257 return -EFAULT; 258 259 bytes = min_t(unsigned int, length, 260 mr_page_size(mr) - page_offset); 261 va = kmap_local_page(page); 262 if (dir == RXE_FROM_MR_OBJ) 263 memcpy(addr, va + page_offset, bytes); 264 else 265 memcpy(va + page_offset, addr, bytes); 266 kunmap_local(va); 267 268 page_offset = 0; 269 addr += bytes; 270 length -= bytes; 271 index++; 272 } 273 274 return 0; 275 } 276 277 static void rxe_mr_copy_dma(struct rxe_mr *mr, u64 dma_addr, void *addr, 278 unsigned int length, enum rxe_mr_copy_dir dir) 279 { 280 unsigned int page_offset = dma_addr & (PAGE_SIZE - 1); 281 unsigned int bytes; 282 struct page *page; 283 u8 *va; 284 285 while (length) { 286 page = ib_virt_dma_to_page(dma_addr); 287 bytes = min_t(unsigned int, length, 288 PAGE_SIZE - page_offset); 289 va = kmap_local_page(page); 290 291 if (dir == RXE_TO_MR_OBJ) 292 memcpy(va + page_offset, addr, bytes); 293 else 294 memcpy(addr, va + page_offset, bytes); 295 296 kunmap_local(va); 297 page_offset = 0; 298 dma_addr += bytes; 299 addr += bytes; 300 length -= bytes; 301 } 302 } 303 304 int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, 305 unsigned int length, enum rxe_mr_copy_dir dir) 306 { 307 int err; 308 309 if (length == 0) 310 return 0; 311 312 if (WARN_ON(!mr)) 313 return -EINVAL; 314 315 if (mr->ibmr.type == IB_MR_TYPE_DMA) { 316 rxe_mr_copy_dma(mr, iova, addr, length, dir); 317 return 0; 318 } 319 320 err = mr_check_range(mr, iova, length); 321 if (unlikely(err)) { 322 rxe_dbg_mr(mr, "iova out of range\n"); 323 return err; 324 } 325 326 return rxe_mr_copy_xarray(mr, iova, addr, length, dir); 327 } 328 329 /* copy data in or out of a wqe, i.e. sg list 330 * under the control of a dma descriptor 331 */ 332 int copy_data( 333 struct rxe_pd *pd, 334 int access, 335 struct rxe_dma_info *dma, 336 void *addr, 337 int length, 338 enum rxe_mr_copy_dir dir) 339 { 340 int bytes; 341 struct rxe_sge *sge = &dma->sge[dma->cur_sge]; 342 int offset = dma->sge_offset; 343 int resid = dma->resid; 344 struct rxe_mr *mr = NULL; 345 u64 iova; 346 int err; 347 348 if (length == 0) 349 return 0; 350 351 if (length > resid) { 352 err = -EINVAL; 353 goto err2; 354 } 355 356 if (sge->length && (offset < sge->length)) { 357 mr = lookup_mr(pd, access, sge->lkey, RXE_LOOKUP_LOCAL); 358 if (!mr) { 359 err = -EINVAL; 360 goto err1; 361 } 362 } 363 364 while (length > 0) { 365 bytes = length; 366 367 if (offset >= sge->length) { 368 if (mr) { 369 rxe_put(mr); 370 mr = NULL; 371 } 372 sge++; 373 dma->cur_sge++; 374 offset = 0; 375 376 if (dma->cur_sge >= dma->num_sge) { 377 err = -ENOSPC; 378 goto err2; 379 } 380 381 if (sge->length) { 382 mr = lookup_mr(pd, access, sge->lkey, 383 RXE_LOOKUP_LOCAL); 384 if (!mr) { 385 err = -EINVAL; 386 goto err1; 387 } 388 } else { 389 continue; 390 } 391 } 392 393 if (bytes > sge->length - offset) 394 bytes = sge->length - offset; 395 396 if (bytes > 0) { 397 iova = sge->addr + offset; 398 err = rxe_mr_copy(mr, iova, addr, bytes, dir); 399 if (err) 400 goto err2; 401 402 offset += bytes; 403 resid -= bytes; 404 length -= bytes; 405 addr += bytes; 406 } 407 } 408 409 dma->sge_offset = offset; 410 dma->resid = resid; 411 412 if (mr) 413 rxe_put(mr); 414 415 return 0; 416 417 err2: 418 if (mr) 419 rxe_put(mr); 420 err1: 421 return err; 422 } 423 424 int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) 425 { 426 unsigned int page_offset; 427 unsigned long index; 428 struct page *page; 429 unsigned int bytes; 430 int err; 431 u8 *va; 432 433 /* mr must be valid even if length is zero */ 434 if (WARN_ON(!mr)) 435 return -EINVAL; 436 437 if (length == 0) 438 return 0; 439 440 if (mr->ibmr.type == IB_MR_TYPE_DMA) 441 return -EFAULT; 442 443 err = mr_check_range(mr, iova, length); 444 if (err) 445 return err; 446 447 while (length > 0) { 448 index = rxe_mr_iova_to_index(mr, iova); 449 page = xa_load(&mr->page_list, index); 450 page_offset = rxe_mr_iova_to_page_offset(mr, iova); 451 if (!page) 452 return -EFAULT; 453 bytes = min_t(unsigned int, length, 454 mr_page_size(mr) - page_offset); 455 456 va = kmap_local_page(page); 457 arch_wb_cache_pmem(va + page_offset, bytes); 458 kunmap_local(va); 459 460 length -= bytes; 461 iova += bytes; 462 page_offset = 0; 463 } 464 465 return 0; 466 } 467 468 /* Guarantee atomicity of atomic operations at the machine level. */ 469 static DEFINE_SPINLOCK(atomic_ops_lock); 470 471 int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, 472 u64 compare, u64 swap_add, u64 *orig_val) 473 { 474 unsigned int page_offset; 475 struct page *page; 476 u64 value; 477 u64 *va; 478 479 if (unlikely(mr->state != RXE_MR_STATE_VALID)) { 480 rxe_dbg_mr(mr, "mr not in valid state\n"); 481 return RESPST_ERR_RKEY_VIOLATION; 482 } 483 484 if (mr->ibmr.type == IB_MR_TYPE_DMA) { 485 page_offset = iova & (PAGE_SIZE - 1); 486 page = ib_virt_dma_to_page(iova); 487 } else { 488 unsigned long index; 489 int err; 490 491 err = mr_check_range(mr, iova, sizeof(value)); 492 if (err) { 493 rxe_dbg_mr(mr, "iova out of range\n"); 494 return RESPST_ERR_RKEY_VIOLATION; 495 } 496 page_offset = rxe_mr_iova_to_page_offset(mr, iova); 497 index = rxe_mr_iova_to_index(mr, iova); 498 page = xa_load(&mr->page_list, index); 499 if (!page) 500 return RESPST_ERR_RKEY_VIOLATION; 501 } 502 503 if (unlikely(page_offset & 0x7)) { 504 rxe_dbg_mr(mr, "iova not aligned\n"); 505 return RESPST_ERR_MISALIGNED_ATOMIC; 506 } 507 508 va = kmap_local_page(page); 509 510 spin_lock_bh(&atomic_ops_lock); 511 value = *orig_val = va[page_offset >> 3]; 512 513 if (opcode == IB_OPCODE_RC_COMPARE_SWAP) { 514 if (value == compare) 515 va[page_offset >> 3] = swap_add; 516 } else { 517 value += swap_add; 518 va[page_offset >> 3] = value; 519 } 520 spin_unlock_bh(&atomic_ops_lock); 521 522 kunmap_local(va); 523 524 return 0; 525 } 526 527 #if defined CONFIG_64BIT 528 /* only implemented or called for 64 bit architectures */ 529 int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) 530 { 531 unsigned int page_offset; 532 struct page *page; 533 u64 *va; 534 535 /* See IBA oA19-28 */ 536 if (unlikely(mr->state != RXE_MR_STATE_VALID)) { 537 rxe_dbg_mr(mr, "mr not in valid state\n"); 538 return RESPST_ERR_RKEY_VIOLATION; 539 } 540 541 if (mr->ibmr.type == IB_MR_TYPE_DMA) { 542 page_offset = iova & (PAGE_SIZE - 1); 543 page = ib_virt_dma_to_page(iova); 544 } else { 545 unsigned long index; 546 int err; 547 548 /* See IBA oA19-28 */ 549 err = mr_check_range(mr, iova, sizeof(value)); 550 if (unlikely(err)) { 551 rxe_dbg_mr(mr, "iova out of range\n"); 552 return RESPST_ERR_RKEY_VIOLATION; 553 } 554 page_offset = rxe_mr_iova_to_page_offset(mr, iova); 555 index = rxe_mr_iova_to_index(mr, iova); 556 page = xa_load(&mr->page_list, index); 557 if (!page) 558 return RESPST_ERR_RKEY_VIOLATION; 559 } 560 561 /* See IBA A19.4.2 */ 562 if (unlikely(page_offset & 0x7)) { 563 rxe_dbg_mr(mr, "misaligned address\n"); 564 return RESPST_ERR_MISALIGNED_ATOMIC; 565 } 566 567 va = kmap_local_page(page); 568 569 /* Do atomic write after all prior operations have completed */ 570 smp_store_release(&va[page_offset >> 3], value); 571 572 kunmap_local(va); 573 574 return 0; 575 } 576 #else 577 int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) 578 { 579 return RESPST_ERR_UNSUPPORTED_OPCODE; 580 } 581 #endif 582 583 int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) 584 { 585 struct rxe_sge *sge = &dma->sge[dma->cur_sge]; 586 int offset = dma->sge_offset; 587 int resid = dma->resid; 588 589 while (length) { 590 unsigned int bytes; 591 592 if (offset >= sge->length) { 593 sge++; 594 dma->cur_sge++; 595 offset = 0; 596 if (dma->cur_sge >= dma->num_sge) 597 return -ENOSPC; 598 } 599 600 bytes = length; 601 602 if (bytes > sge->length - offset) 603 bytes = sge->length - offset; 604 605 offset += bytes; 606 resid -= bytes; 607 length -= bytes; 608 } 609 610 dma->sge_offset = offset; 611 dma->resid = resid; 612 613 return 0; 614 } 615 616 struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, 617 enum rxe_mr_lookup_type type) 618 { 619 struct rxe_mr *mr; 620 struct rxe_dev *rxe = to_rdev(pd->ibpd.device); 621 int index = key >> 8; 622 623 mr = rxe_pool_get_index(&rxe->mr_pool, index); 624 if (!mr) 625 return NULL; 626 627 if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) || 628 (type == RXE_LOOKUP_REMOTE && mr->rkey != key) || 629 mr_pd(mr) != pd || ((access & mr->access) != access) || 630 mr->state != RXE_MR_STATE_VALID)) { 631 rxe_put(mr); 632 mr = NULL; 633 } 634 635 return mr; 636 } 637 638 int rxe_invalidate_mr(struct rxe_qp *qp, u32 key) 639 { 640 struct rxe_dev *rxe = to_rdev(qp->ibqp.device); 641 struct rxe_mr *mr; 642 int remote; 643 int ret; 644 645 mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8); 646 if (!mr) { 647 rxe_dbg_qp(qp, "No MR for key %#x\n", key); 648 ret = -EINVAL; 649 goto err; 650 } 651 652 remote = mr->access & RXE_ACCESS_REMOTE; 653 if (remote ? (key != mr->rkey) : (key != mr->lkey)) { 654 rxe_dbg_mr(mr, "wr key (%#x) doesn't match mr key (%#x)\n", 655 key, (remote ? mr->rkey : mr->lkey)); 656 ret = -EINVAL; 657 goto err_drop_ref; 658 } 659 660 if (atomic_read(&mr->num_mw) > 0) { 661 rxe_dbg_mr(mr, "Attempt to invalidate an MR while bound to MWs\n"); 662 ret = -EINVAL; 663 goto err_drop_ref; 664 } 665 666 if (unlikely(mr->ibmr.type != IB_MR_TYPE_MEM_REG)) { 667 rxe_dbg_mr(mr, "Type (%d) is wrong\n", mr->ibmr.type); 668 ret = -EINVAL; 669 goto err_drop_ref; 670 } 671 672 mr->state = RXE_MR_STATE_FREE; 673 ret = 0; 674 675 err_drop_ref: 676 rxe_put(mr); 677 err: 678 return ret; 679 } 680 681 /* user can (re)register fast MR by executing a REG_MR WQE. 682 * user is expected to hold a reference on the ib mr until the 683 * WQE completes. 684 * Once a fast MR is created this is the only way to change the 685 * private keys. It is the responsibility of the user to maintain 686 * the ib mr keys in sync with rxe mr keys. 687 */ 688 int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe) 689 { 690 struct rxe_mr *mr = to_rmr(wqe->wr.wr.reg.mr); 691 u32 key = wqe->wr.wr.reg.key; 692 u32 access = wqe->wr.wr.reg.access; 693 694 /* user can only register MR in free state */ 695 if (unlikely(mr->state != RXE_MR_STATE_FREE)) { 696 rxe_dbg_mr(mr, "mr->lkey = 0x%x not free\n", mr->lkey); 697 return -EINVAL; 698 } 699 700 /* user can only register mr with qp in same protection domain */ 701 if (unlikely(qp->ibqp.pd != mr->ibmr.pd)) { 702 rxe_dbg_mr(mr, "qp->pd and mr->pd don't match\n"); 703 return -EINVAL; 704 } 705 706 /* user is only allowed to change key portion of l/rkey */ 707 if (unlikely((mr->lkey & ~0xff) != (key & ~0xff))) { 708 rxe_dbg_mr(mr, "key = 0x%x has wrong index mr->lkey = 0x%x\n", 709 key, mr->lkey); 710 return -EINVAL; 711 } 712 713 mr->access = access; 714 mr->lkey = key; 715 mr->rkey = key; 716 mr->ibmr.iova = wqe->wr.wr.reg.mr->iova; 717 mr->state = RXE_MR_STATE_VALID; 718 719 return 0; 720 } 721 722 void rxe_mr_cleanup(struct rxe_pool_elem *elem) 723 { 724 struct rxe_mr *mr = container_of(elem, typeof(*mr), elem); 725 726 rxe_put(mr_pd(mr)); 727 ib_umem_release(mr->umem); 728 729 if (mr->ibmr.type != IB_MR_TYPE_DMA) 730 xa_destroy(&mr->page_list); 731 } 732