/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */


#include <linux/kref.h>
#include <linux/random.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <rdma/ib_verbs.h>
#include "mlx5_ib.h"

/*
 * We can't use an array for xlt_emergency_page because dma_map_single doesn't
 * work on kernel modules memory
 */
void *xlt_emergency_page;
static DEFINE_MUTEX(xlt_emergency_page_mutex);

enum {
	MAX_PENDING_REG_MR = 8,
};

#define MLX5_UMR_ALIGN 2048

static void
create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned int page_size, bool populate);

static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
					  struct ib_pd *pd)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);

	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, mkc, lr, 1);

	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
		MLX5_SET(mkc, mkc, relaxed_ordering_write,
			 !!(acc & IB_ACCESS_RELAXED_ORDERING));
	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
		MLX5_SET(mkc, mkc, relaxed_ordering_read,
			 !!(acc & IB_ACCESS_RELAXED_ORDERING));

	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET64(mkc, mkc, start_addr, start_addr);
}

static void
assign_mkey_variant(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
		    u32 *in)
{
	u8 key = atomic_inc_return(&dev->mkey_var);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, mkey_7_0, key);
	mkey->key = key;
}

static int
mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
		    u32 *in, int inlen)
{
	assign_mkey_variant(dev,
			    mkey, in);
	return mlx5_core_create_mkey(dev->mdev, mkey, in, inlen);
}

static int
mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
		       struct mlx5_core_mkey *mkey,
		       struct mlx5_async_ctx *async_ctx,
		       u32 *in, int inlen, u32 *out, int outlen,
		       struct mlx5_async_work *context)
{
	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
	assign_mkey_variant(dev, mkey, in);
	return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
				create_mkey_callback, context);
}

static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
static int mr_cache_max_order(struct mlx5_ib_dev *dev);
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);

static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
{
	return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
}

static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));

	return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
}

static void create_mkey_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5_ib_mr *mr =
		container_of(context, struct mlx5_ib_mr, cb_work);
	struct mlx5_cache_ent *ent = mr->cache_ent;
	struct mlx5_ib_dev *dev = ent->dev;
	unsigned long flags;

	if (status) {
		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
		kfree(mr);
		spin_lock_irqsave(&ent->lock, flags);
		ent->pending--;
		WRITE_ONCE(dev->fill_delay, 1);
		spin_unlock_irqrestore(&ent->lock, flags);
		mod_timer(&dev->delay_timer, jiffies + HZ);
		return;
	}

	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.key |= mlx5_idx_to_mkey(
		MLX5_GET(create_mkey_out, mr->out, mkey_index));

	WRITE_ONCE(dev->cache.last_add, jiffies);

	spin_lock_irqsave(&ent->lock, flags);
	list_add_tail(&mr->list, &ent->head);
	ent->available_mrs++;
	ent->total_mrs++;
	/* If we are doing fill_to_high_water then keep going. */
	queue_adjust_cache_locked(ent);
	ent->pending--;
	spin_unlock_irqrestore(&ent->lock, flags);
}

static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
{
	struct mlx5_ib_mr *mr;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return NULL;
	mr->cache_ent = ent;

	set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);

	MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
	MLX5_SET(mkc, mkc, log_page_size, ent->page);
	return mr;
}

/* Asynchronously schedule new MRs to be populated in the cache.
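 * Each MR is created with an asynchronous CREATE_MKEY command; at most
 * MAX_PENDING_REG_MR commands may be outstanding per cache entry, and
 * completions are handled in create_mkey_callback().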
 */
static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
{
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err = 0;
	int i;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	for (i = 0; i < num; i++) {
		mr = alloc_cache_mr(ent, mkc);
		if (!mr) {
			err = -ENOMEM;
			break;
		}
		spin_lock_irq(&ent->lock);
		if (ent->pending >= MAX_PENDING_REG_MR) {
			err = -EAGAIN;
			spin_unlock_irq(&ent->lock);
			kfree(mr);
			break;
		}
		ent->pending++;
		spin_unlock_irq(&ent->lock);
		err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
					     &ent->dev->async_ctx, in, inlen,
					     mr->out, sizeof(mr->out),
					     &mr->cb_work);
		if (err) {
			spin_lock_irq(&ent->lock);
			ent->pending--;
			spin_unlock_irq(&ent->lock);
			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
			kfree(mr);
			break;
		}
	}

	kfree(in);
	return err;
}

/* Synchronously create a MR in the cache */
static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
{
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return ERR_PTR(-ENOMEM);
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	mr = alloc_cache_mr(ent, mkc);
	if (!mr) {
		err = -ENOMEM;
		goto free_in;
	}

	err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey, in, inlen);
	if (err)
		goto free_mr;

	mr->mmkey.type = MLX5_MKEY_MR;
	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
	spin_lock_irq(&ent->lock);
	ent->total_mrs++;
	spin_unlock_irq(&ent->lock);
	kfree(in);
	return mr;
free_mr:
	kfree(mr);
free_in:
	kfree(in);
	return ERR_PTR(err);
}

static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_mr *mr;

	lockdep_assert_held(&ent->lock);
	if (list_empty(&ent->head))
		return;
	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
	list_del(&mr->list);
	ent->available_mrs--;
	ent->total_mrs--;
	spin_unlock_irq(&ent->lock);
	mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey);
	kfree(mr);
	spin_lock_irq(&ent->lock);
}

static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
				bool limit_fill)
{
	int err;

	lockdep_assert_held(&ent->lock);

	while (true) {
		if (limit_fill)
			target = ent->limit * 2;
		if (target == ent->available_mrs + ent->pending)
			return 0;
		if (target > ent->available_mrs + ent->pending) {
			u32 todo = target - (ent->available_mrs + ent->pending);

			spin_unlock_irq(&ent->lock);
			err = add_keys(ent, todo);
			if (err == -EAGAIN)
				usleep_range(3000, 5000);
			spin_lock_irq(&ent->lock);
			if (err) {
				if (err != -EAGAIN)
					return err;
			} else
				return 0;
		} else {
			remove_cache_mr_locked(ent);
		}
	}
}

static ssize_t size_write(struct file *filp, const char __user *buf,
			  size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 target;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &target);
	if (err)
		return err;

	/*
	 * Target is the new value of total_mrs the user requests, however we
	 * cannot free MRs that
	 * are in use. Compute the target value for available_mrs.
	 */
	spin_lock_irq(&ent->lock);
	if (target < ent->total_mrs - ent->available_mrs) {
		err = -EINVAL;
		goto err_unlock;
	}
	target = target - (ent->total_mrs - ent->available_mrs);
	if (target < ent->limit || target > ent->limit * 2) {
		err = -EINVAL;
		goto err_unlock;
	}
	err = resize_available_mrs(ent, target, false);
	if (err)
		goto err_unlock;
	spin_unlock_irq(&ent->lock);

	return count;

err_unlock:
	spin_unlock_irq(&ent->lock);
	return err;
}

static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
			 loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations size_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= size_write,
	.read	= size_read,
};

static ssize_t limit_write(struct file *filp, const char __user *buf,
			   size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 var;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &var);
	if (err)
		return err;

	/*
	 * Upon set we immediately fill the cache to the high water mark
	 * implied by the limit.
	 */
	spin_lock_irq(&ent->lock);
	ent->limit = var;
	err = resize_available_mrs(ent, 0, true);
	spin_unlock_irq(&ent->lock);
	if (err)
		return err;
	return count;
}

static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
			  loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations limit_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= limit_write,
	.read	= limit_read,
};

static bool someone_adding(struct mlx5_mr_cache *cache)
{
	unsigned int i;

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		struct mlx5_cache_ent *ent = &cache->ent[i];
		bool ret;

		spin_lock_irq(&ent->lock);
		ret = ent->available_mrs < ent->limit;
		spin_unlock_irq(&ent->lock);
		if (ret)
			return true;
	}
	return false;
}

/*
 * Check if the bucket is outside the high/low water mark and schedule an async
 * update. The cache refill has hysteresis, once the low water mark is hit it is
 * refilled up to the high mark.
 */
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
{
	lockdep_assert_held(&ent->lock);

	if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
		return;
	if (ent->available_mrs < ent->limit) {
		ent->fill_to_high_water = true;
		queue_work(ent->dev->cache.wq, &ent->work);
	} else if (ent->fill_to_high_water &&
		   ent->available_mrs + ent->pending < 2 * ent->limit) {
		/*
		 * Once we start populating due to hitting a low water mark
		 * continue until we pass the high water mark.
		 */
		queue_work(ent->dev->cache.wq, &ent->work);
	} else if (ent->available_mrs == 2 * ent->limit) {
		ent->fill_to_high_water = false;
	} else if (ent->available_mrs > 2 * ent->limit) {
		/* Queue deletion of excess entries */
		ent->fill_to_high_water = false;
		if (ent->pending)
			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
					   msecs_to_jiffies(1000));
		else
			queue_work(ent->dev->cache.wq, &ent->work);
	}
}

static void __cache_work_func(struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_dev *dev = ent->dev;
	struct mlx5_mr_cache *cache = &dev->cache;
	int err;

	spin_lock_irq(&ent->lock);
	if (ent->disabled)
		goto out;

	if (ent->fill_to_high_water &&
	    ent->available_mrs + ent->pending < 2 * ent->limit &&
	    !READ_ONCE(dev->fill_delay)) {
		spin_unlock_irq(&ent->lock);
		err = add_keys(ent, 1);
		spin_lock_irq(&ent->lock);
		if (ent->disabled)
			goto out;
		if (err) {
			/*
			 * EAGAIN only happens if pending is positive, so we
			 * will be rescheduled from reg_mr_callback(). The only
			 * failure path here is ENOMEM.
			 */
			if (err != -EAGAIN) {
				mlx5_ib_warn(
					dev,
					"command failed order %d, err %d\n",
					ent->order, err);
				queue_delayed_work(cache->wq, &ent->dwork,
						   msecs_to_jiffies(1000));
			}
		}
	} else if (ent->available_mrs > 2 * ent->limit) {
		bool need_delay;

		/*
		 * The remove_cache_mr() logic is performed as a garbage
		 * collection task. Such a task is intended to be run when no
		 * other active processes are running.
		 *
		 * The need_resched() will return TRUE if there are user tasks
		 * to be activated in the near future.
		 *
		 * In such a case, we don't execute remove_cache_mr() and
		 * postpone the garbage collection work to the next cycle, in
		 * order to free CPU resources to other tasks.
527 */ 528 spin_unlock_irq(&ent->lock); 529 need_delay = need_resched() || someone_adding(cache) || 530 time_after(jiffies, 531 READ_ONCE(cache->last_add) + 300 * HZ); 532 spin_lock_irq(&ent->lock); 533 if (ent->disabled) 534 goto out; 535 if (need_delay) 536 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); 537 remove_cache_mr_locked(ent); 538 queue_adjust_cache_locked(ent); 539 } 540 out: 541 spin_unlock_irq(&ent->lock); 542 } 543 544 static void delayed_cache_work_func(struct work_struct *work) 545 { 546 struct mlx5_cache_ent *ent; 547 548 ent = container_of(work, struct mlx5_cache_ent, dwork.work); 549 __cache_work_func(ent); 550 } 551 552 static void cache_work_func(struct work_struct *work) 553 { 554 struct mlx5_cache_ent *ent; 555 556 ent = container_of(work, struct mlx5_cache_ent, work); 557 __cache_work_func(ent); 558 } 559 560 /* Allocate a special entry from the cache */ 561 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, 562 unsigned int entry, int access_flags) 563 { 564 struct mlx5_mr_cache *cache = &dev->cache; 565 struct mlx5_cache_ent *ent; 566 struct mlx5_ib_mr *mr; 567 568 if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY || 569 entry >= ARRAY_SIZE(cache->ent))) 570 return ERR_PTR(-EINVAL); 571 572 /* Matches access in alloc_cache_mr() */ 573 if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) 574 return ERR_PTR(-EOPNOTSUPP); 575 576 ent = &cache->ent[entry]; 577 spin_lock_irq(&ent->lock); 578 if (list_empty(&ent->head)) { 579 spin_unlock_irq(&ent->lock); 580 mr = create_cache_mr(ent); 581 if (IS_ERR(mr)) 582 return mr; 583 } else { 584 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); 585 list_del(&mr->list); 586 ent->available_mrs--; 587 queue_adjust_cache_locked(ent); 588 spin_unlock_irq(&ent->lock); 589 } 590 mr->access_flags = access_flags; 591 return mr; 592 } 593 594 /* Return a MR already available in the cache */ 595 static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent) 596 { 597 struct mlx5_ib_dev *dev = req_ent->dev; 598 struct mlx5_ib_mr *mr = NULL; 599 struct mlx5_cache_ent *ent = req_ent; 600 601 /* Try larger MR pools from the cache to satisfy the allocation */ 602 for (; ent != &dev->cache.ent[MR_CACHE_LAST_STD_ENTRY + 1]; ent++) { 603 mlx5_ib_dbg(dev, "order %u, cache index %zu\n", ent->order, 604 ent - dev->cache.ent); 605 606 spin_lock_irq(&ent->lock); 607 if (!list_empty(&ent->head)) { 608 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, 609 list); 610 list_del(&mr->list); 611 ent->available_mrs--; 612 queue_adjust_cache_locked(ent); 613 spin_unlock_irq(&ent->lock); 614 break; 615 } 616 queue_adjust_cache_locked(ent); 617 spin_unlock_irq(&ent->lock); 618 } 619 620 if (!mr) 621 req_ent->miss++; 622 623 return mr; 624 } 625 626 static void detach_mr_from_cache(struct mlx5_ib_mr *mr) 627 { 628 struct mlx5_cache_ent *ent = mr->cache_ent; 629 630 mr->cache_ent = NULL; 631 spin_lock_irq(&ent->lock); 632 ent->total_mrs--; 633 spin_unlock_irq(&ent->lock); 634 } 635 636 void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 637 { 638 struct mlx5_cache_ent *ent = mr->cache_ent; 639 640 if (!ent) 641 return; 642 643 if (mlx5_mr_cache_invalidate(mr)) { 644 detach_mr_from_cache(mr); 645 destroy_mkey(dev, mr); 646 kfree(mr); 647 return; 648 } 649 650 spin_lock_irq(&ent->lock); 651 list_add_tail(&mr->list, &ent->head); 652 ent->available_mrs++; 653 queue_adjust_cache_locked(ent); 654 spin_unlock_irq(&ent->lock); 655 } 656 657 static void clean_keys(struct mlx5_ib_dev *dev, int c) 658 { 659 struct 
	       mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[c];
	struct mlx5_ib_mr *tmp_mr;
	struct mlx5_ib_mr *mr;
	LIST_HEAD(del_list);

	cancel_delayed_work(&ent->dwork);
	while (1) {
		spin_lock_irq(&ent->lock);
		if (list_empty(&ent->head)) {
			spin_unlock_irq(&ent->lock);
			break;
		}
		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
		list_move(&mr->list, &del_list);
		ent->available_mrs--;
		ent->total_mrs--;
		spin_unlock_irq(&ent->lock);
		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
	}

	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
		list_del(&mr->list);
		kfree(mr);
	}
}

static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
{
	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	debugfs_remove_recursive(dev->cache.root);
	dev->cache.root = NULL;
}

static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	struct dentry *dir;
	int i;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		ent = &cache->ent[i];
		sprintf(ent->name, "%d", ent->order);
		dir = debugfs_create_dir(ent->name, cache->root);
		debugfs_create_file("size", 0600, dir, ent, &size_fops);
		debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
		debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
		debugfs_create_u32("miss", 0600, dir, &ent->miss);
	}
}

static void delay_time_func(struct timer_list *t)
{
	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);

	WRITE_ONCE(dev->fill_delay, 0);
}

int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int i;

	mutex_init(&dev->slow_path_mutex);
	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
	if (!cache->wq) {
		mlx5_ib_warn(dev, "failed to create work queue\n");
		return -ENOMEM;
	}

	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
	timer_setup(&dev->delay_timer, delay_time_func, 0);
	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		ent = &cache->ent[i];
		INIT_LIST_HEAD(&ent->head);
		spin_lock_init(&ent->lock);
		ent->order = i + 2;
		ent->dev = dev;
		ent->limit = 0;

		INIT_WORK(&ent->work, cache_work_func);
		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);

		if (i > MR_CACHE_LAST_STD_ENTRY) {
			mlx5_odp_init_mr_cache_entry(ent);
			continue;
		}

		if (ent->order > mr_cache_max_order(dev))
			continue;

		ent->page = PAGE_SHIFT;
		ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) /
			   MLX5_IB_UMR_OCTOWORD;
		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
		if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
		    mlx5_ib_can_load_pas_with_umr(dev, 0))
			ent->limit = dev->mdev->profile->mr_cache[i].limit;
		else
			ent->limit = 0;
		spin_lock_irq(&ent->lock);
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->lock);
	}

	mlx5_mr_cache_debugfs_init(dev);

	return 0;
}

int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
{
	unsigned int i;

	if (!dev->cache.wq)
		return 0;

	for (i = 0; i <
	     MAX_MR_CACHE_ENTRIES; i++) {
		struct mlx5_cache_ent *ent = &dev->cache.ent[i];

		spin_lock_irq(&ent->lock);
		ent->disabled = true;
		spin_unlock_irq(&ent->lock);
		cancel_work_sync(&ent->work);
		cancel_delayed_work_sync(&ent->dwork);
	}

	mlx5_mr_cache_debugfs_cleanup(dev);
	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
		clean_keys(dev, i);

	destroy_workqueue(dev->cache.wq);
	del_timer_sync(&dev->delay_timer);

	return 0;
}

struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, length64, 1);
	set_mkc_access_pd_addr_fields(mkc, acc, 0, pd);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

static int get_octo_len(u64 addr, u64 len, int page_shift)
{
	u64 page_size = 1ULL << page_shift;
	u64 offset;
	int npages;

	offset = addr & (page_size - 1);
	npages = ALIGN(len + offset, page_size) >> page_shift;
	return (npages + 1) / 2;
}

static int mr_cache_max_order(struct mlx5_ib_dev *dev)
{
	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		return MR_CACHE_LAST_STD_ENTRY + 2;
	return MLX5_MAX_UMR_SHIFT;
}

static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct mlx5_ib_umr_context *context =
		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);

	context->status = wc->status;
	complete(&context->done);
}

static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
{
	context->cqe.done = mlx5_ib_umr_done;
	context->status = -1;
	init_completion(&context->done);
}

static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
				  struct mlx5_umr_wr *umrwr)
{
	struct umr_common *umrc = &dev->umrc;
	const struct ib_send_wr *bad;
	int err;
	struct mlx5_ib_umr_context umr_context;

	mlx5_ib_init_umr_context(&umr_context);
	umrwr->wr.wr_cqe = &umr_context.cqe;

	down(&umrc->sem);
	err = ib_post_send(umrc->qp, &umrwr->wr, &bad);
	if (err) {
		mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
	} else {
		wait_for_completion(&umr_context.done);
		if (umr_context.status != IB_WC_SUCCESS) {
			mlx5_ib_warn(dev, "reg umr failed (%u)\n",
				     umr_context.status);
			err = -EFAULT;
		}
	}
	up(&umrc->sem);
	return err;
}

static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
						      unsigned int order)
{
	struct mlx5_mr_cache *cache = &dev->cache;

	if (order < cache->ent[0].order)
		return &cache->ent[0];
	order = order - cache->ent[0].order;
	if (order >
	    MR_CACHE_LAST_STD_ENTRY)
		return NULL;
	return &cache->ent[order];
}

static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			  u64 length, int access_flags)
{
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->ibmr.length = length;
	mr->ibmr.device = &dev->ib_dev;
	mr->access_flags = access_flags;
}

static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
					     struct ib_umem *umem, u64 iova,
					     int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_cache_ent *ent;
	struct mlx5_ib_mr *mr;
	unsigned int page_size;

	page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 0, iova);
	if (WARN_ON(!page_size))
		return ERR_PTR(-EINVAL);
	ent = mr_cache_ent_from_order(
		dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
	/*
	 * Matches access in alloc_cache_mr(). If the MR can't come from the
	 * cache then synchronously create an uncached one.
	 */
	if (!ent || ent->limit == 0 ||
	    !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) {
		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size, false);
		mutex_unlock(&dev->slow_path_mutex);
		return mr;
	}

	mr = get_cache_mr(ent);
	if (!mr) {
		mr = create_cache_mr(ent);
		/*
		 * The above already tried to do the same stuff as reg_create(),
		 * no reason to try it again.
		 */
		if (IS_ERR(mr))
			return mr;
	}

	mr->ibmr.pd = pd;
	mr->umem = umem;
	mr->access_flags = access_flags;
	mr->desc_size = sizeof(struct mlx5_mtt);
	mr->mmkey.iova = iova;
	mr->mmkey.size = umem->length;
	mr->mmkey.pd = to_mpd(pd)->pdn;
	mr->page_shift = order_base_2(page_size);
	mr->umem = umem;
	set_mr_fields(dev, mr, umem->length, access_flags);

	return mr;
}

#define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
			    MLX5_UMR_MTT_ALIGNMENT)
#define MLX5_SPARE_UMR_CHUNK 0x10000

/*
 * Allocate a temporary buffer to hold the per-page information to transfer to
 * HW. For efficiency this should be as large as it can be, but buffer
 * allocation failure is not allowed, so try smaller sizes.
 */
static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
{
	const size_t xlt_chunk_align = MLX5_UMR_MTT_ALIGNMENT / ent_size;
	size_t size;
	void *res = NULL;

	static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0);

	/*
	 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context, just that
	 * the allocation can't trigger any kind of reclaim.
	 */
	might_sleep();

	gfp_mask |= __GFP_ZERO;

	/*
	 * If the system already has a suitable high order page then just use
	 * that, but don't try hard to create one. This max is about 1M, so a
	 * free x86 huge page will satisfy it.
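	 * The fallbacks below try MLX5_SPARE_UMR_CHUNK, then a single page,
	 * and finally the preallocated xlt_emergency_page, serialized by
	 * xlt_emergency_page_mutex.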
1019 */ 1020 size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align), 1021 MLX5_MAX_UMR_CHUNK); 1022 *nents = size / ent_size; 1023 res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, 1024 get_order(size)); 1025 if (res) 1026 return res; 1027 1028 if (size > MLX5_SPARE_UMR_CHUNK) { 1029 size = MLX5_SPARE_UMR_CHUNK; 1030 *nents = get_order(size) / ent_size; 1031 res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, 1032 get_order(size)); 1033 if (res) 1034 return res; 1035 } 1036 1037 *nents = PAGE_SIZE / ent_size; 1038 res = (void *)__get_free_page(gfp_mask); 1039 if (res) 1040 return res; 1041 1042 mutex_lock(&xlt_emergency_page_mutex); 1043 memset(xlt_emergency_page, 0, PAGE_SIZE); 1044 return xlt_emergency_page; 1045 } 1046 1047 static void mlx5_ib_free_xlt(void *xlt, size_t length) 1048 { 1049 if (xlt == xlt_emergency_page) { 1050 mutex_unlock(&xlt_emergency_page_mutex); 1051 return; 1052 } 1053 1054 free_pages((unsigned long)xlt, get_order(length)); 1055 } 1056 1057 /* 1058 * Create a MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for 1059 * submission. 1060 */ 1061 static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr, 1062 struct mlx5_umr_wr *wr, struct ib_sge *sg, 1063 size_t nents, size_t ent_size, 1064 unsigned int flags) 1065 { 1066 struct mlx5_ib_dev *dev = mr_to_mdev(mr); 1067 struct device *ddev = &dev->mdev->pdev->dev; 1068 dma_addr_t dma; 1069 void *xlt; 1070 1071 xlt = mlx5_ib_alloc_xlt(&nents, ent_size, 1072 flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC : 1073 GFP_KERNEL); 1074 sg->length = nents * ent_size; 1075 dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE); 1076 if (dma_mapping_error(ddev, dma)) { 1077 mlx5_ib_err(dev, "unable to map DMA during XLT update.\n"); 1078 mlx5_ib_free_xlt(xlt, sg->length); 1079 return NULL; 1080 } 1081 sg->addr = dma; 1082 sg->lkey = dev->umrc.pd->local_dma_lkey; 1083 1084 memset(wr, 0, sizeof(*wr)); 1085 wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT; 1086 if (!(flags & MLX5_IB_UPD_XLT_ENABLE)) 1087 wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE; 1088 wr->wr.sg_list = sg; 1089 wr->wr.num_sge = 1; 1090 wr->wr.opcode = MLX5_IB_WR_UMR; 1091 wr->pd = mr->ibmr.pd; 1092 wr->mkey = mr->mmkey.key; 1093 wr->length = mr->mmkey.size; 1094 wr->virt_addr = mr->mmkey.iova; 1095 wr->access_flags = mr->access_flags; 1096 wr->page_shift = mr->page_shift; 1097 wr->xlt_size = sg->length; 1098 return xlt; 1099 } 1100 1101 static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt, 1102 struct ib_sge *sg) 1103 { 1104 struct device *ddev = &dev->mdev->pdev->dev; 1105 1106 dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE); 1107 mlx5_ib_free_xlt(xlt, sg->length); 1108 } 1109 1110 static unsigned int xlt_wr_final_send_flags(unsigned int flags) 1111 { 1112 unsigned int res = 0; 1113 1114 if (flags & MLX5_IB_UPD_XLT_ENABLE) 1115 res |= MLX5_IB_SEND_UMR_ENABLE_MR | 1116 MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS | 1117 MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; 1118 if (flags & MLX5_IB_UPD_XLT_PD || flags & MLX5_IB_UPD_XLT_ACCESS) 1119 res |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; 1120 if (flags & MLX5_IB_UPD_XLT_ADDR) 1121 res |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; 1122 return res; 1123 } 1124 1125 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, 1126 int page_shift, int flags) 1127 { 1128 struct mlx5_ib_dev *dev = mr_to_mdev(mr); 1129 struct device *ddev = &dev->mdev->pdev->dev; 1130 void *xlt; 1131 struct mlx5_umr_wr wr; 1132 struct ib_sge sg; 1133 int err = 0; 1134 int desc_size = (flags & 
			 MLX5_IB_UPD_XLT_INDIRECT)
				? sizeof(struct mlx5_klm)
				: sizeof(struct mlx5_mtt);
	const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
	const int page_mask = page_align - 1;
	size_t pages_mapped = 0;
	size_t pages_to_map = 0;
	size_t pages_iter;
	size_t size_to_map = 0;
	size_t orig_sg_length;

	if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
	    !umr_can_use_indirect_mkey(dev))
		return -EPERM;

	if (WARN_ON(!mr->umem->is_odp))
		return -EINVAL;

	/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
	 * so we need to align the offset and length accordingly
	 */
	if (idx & page_mask) {
		npages += idx & page_mask;
		idx &= ~page_mask;
	}
	pages_to_map = ALIGN(npages, page_align);

	xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags);
	if (!xlt)
		return -ENOMEM;
	pages_iter = sg.length / desc_size;
	orig_sg_length = sg.length;

	if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
		struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
		size_t max_pages = ib_umem_odp_num_pages(odp) - idx;

		pages_to_map = min_t(size_t, pages_to_map, max_pages);
	}

	wr.page_shift = page_shift;

	for (pages_mapped = 0;
	     pages_mapped < pages_to_map && !err;
	     pages_mapped += pages_iter, idx += pages_iter) {
		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
		size_to_map = npages * desc_size;
		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
					DMA_TO_DEVICE);
		mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
		dma_sync_single_for_device(ddev, sg.addr, sg.length,
					   DMA_TO_DEVICE);

		sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);

		if (pages_mapped + pages_iter >= pages_to_map)
			wr.wr.send_flags |= xlt_wr_final_send_flags(flags);

		wr.offset = idx * desc_size;
		wr.xlt_size = sg.length;

		err = mlx5_ib_post_send_wait(dev, &wr);
	}
	sg.length = orig_sg_length;
	mlx5_ib_unmap_free_xlt(dev, xlt, &sg);
	return err;
}

/*
 * Send the DMA list to the HW for a normal MR using UMR.
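 * The MTT array is staged in a temporary XLT buffer; each filled chunk is
 * posted as its own UMR WQE, and the final post carries the flags returned by
 * xlt_wr_final_send_flags().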
1204 */ 1205 static int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) 1206 { 1207 struct mlx5_ib_dev *dev = mr_to_mdev(mr); 1208 struct device *ddev = &dev->mdev->pdev->dev; 1209 struct ib_block_iter biter; 1210 struct mlx5_mtt *cur_mtt; 1211 struct mlx5_umr_wr wr; 1212 size_t orig_sg_length; 1213 struct mlx5_mtt *mtt; 1214 size_t final_size; 1215 struct ib_sge sg; 1216 int err = 0; 1217 1218 if (WARN_ON(mr->umem->is_odp)) 1219 return -EINVAL; 1220 1221 mtt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, 1222 ib_umem_num_dma_blocks(mr->umem, 1223 1 << mr->page_shift), 1224 sizeof(*mtt), flags); 1225 if (!mtt) 1226 return -ENOMEM; 1227 orig_sg_length = sg.length; 1228 1229 cur_mtt = mtt; 1230 rdma_for_each_block (mr->umem->sg_head.sgl, &biter, mr->umem->nmap, 1231 BIT(mr->page_shift)) { 1232 if (cur_mtt == (void *)mtt + sg.length) { 1233 dma_sync_single_for_device(ddev, sg.addr, sg.length, 1234 DMA_TO_DEVICE); 1235 err = mlx5_ib_post_send_wait(dev, &wr); 1236 if (err) 1237 goto err; 1238 dma_sync_single_for_cpu(ddev, sg.addr, sg.length, 1239 DMA_TO_DEVICE); 1240 wr.offset += sg.length; 1241 cur_mtt = mtt; 1242 } 1243 1244 cur_mtt->ptag = 1245 cpu_to_be64(rdma_block_iter_dma_address(&biter) | 1246 MLX5_IB_MTT_PRESENT); 1247 cur_mtt++; 1248 } 1249 1250 final_size = (void *)cur_mtt - (void *)mtt; 1251 sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT); 1252 memset(cur_mtt, 0, sg.length - final_size); 1253 wr.wr.send_flags |= xlt_wr_final_send_flags(flags); 1254 wr.xlt_size = sg.length; 1255 1256 dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); 1257 err = mlx5_ib_post_send_wait(dev, &wr); 1258 1259 err: 1260 sg.length = orig_sg_length; 1261 mlx5_ib_unmap_free_xlt(dev, mtt, &sg); 1262 return err; 1263 } 1264 1265 /* 1266 * If ibmr is NULL it will be allocated by reg_create. 1267 * Else, the given ibmr will be used. 1268 */ 1269 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, 1270 u64 iova, int access_flags, 1271 unsigned int page_size, bool populate) 1272 { 1273 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1274 struct mlx5_ib_mr *mr; 1275 __be64 *pas; 1276 void *mkc; 1277 int inlen; 1278 u32 *in; 1279 int err; 1280 bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); 1281 1282 if (!page_size) 1283 return ERR_PTR(-EINVAL); 1284 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1285 if (!mr) 1286 return ERR_PTR(-ENOMEM); 1287 1288 mr->ibmr.pd = pd; 1289 mr->access_flags = access_flags; 1290 mr->page_shift = order_base_2(page_size); 1291 1292 inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1293 if (populate) 1294 inlen += sizeof(*pas) * 1295 roundup(ib_umem_num_dma_blocks(umem, page_size), 2); 1296 in = kvzalloc(inlen, GFP_KERNEL); 1297 if (!in) { 1298 err = -ENOMEM; 1299 goto err_1; 1300 } 1301 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); 1302 if (populate) { 1303 if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) { 1304 err = -EINVAL; 1305 goto err_2; 1306 } 1307 mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas, 1308 pg_cap ? MLX5_IB_MTT_PRESENT : 0); 1309 } 1310 1311 /* The pg_access bit allows setting the access flags 1312 * in the page list submitted with the command. */ 1313 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap)); 1314 1315 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1316 set_mkc_access_pd_addr_fields(mkc, access_flags, iova, 1317 populate ? 
					      pd : dev->umrc.pd);
	MLX5_SET(mkc, mkc, free, !populate);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	MLX5_SET(mkc, mkc, umr_en, 1);

	MLX5_SET64(mkc, mkc, len, umem->length);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, translations_octword_size,
		 get_octo_len(iova, umem->length, mr->page_shift));
	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
	if (populate) {
		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	}

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err) {
		mlx5_ib_warn(dev, "create mkey failed\n");
		goto err_2;
	}
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->desc_size = sizeof(struct mlx5_mtt);
	mr->umem = umem;
	set_mr_fields(dev, mr, umem->length, access_flags);
	kvfree(in);

	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);

	return mr;

err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}

static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
				       u64 length, int acc, int mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
	MLX5_SET64(mkc, mkc, len, length);
	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);

	set_mr_fields(dev, mr, length, acc);

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

int mlx5_ib_advise_mr(struct ib_pd *pd,
		      enum ib_uverbs_advise_mr_advice advice,
		      u32 flags,
		      struct ib_sge *sg_list,
		      u32 num_sge,
		      struct uverbs_attr_bundle *attrs)
{
	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
		return -EOPNOTSUPP;

	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
					  sg_list, num_sge);
}

struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
				struct ib_dm_mr_attr *attr,
				struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dm *mdm = to_mdm(dm);
	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
	u64 start_addr = mdm->dev_addr + attr->offset;
	int mode;

	switch (mdm->type) {
	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
		start_addr -= pci_resource_start(dev->pdev, 0);
		break;
	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}

	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
				 attr->access_flags, mode);
}

static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
				    u64 iova, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	bool xlt_with_umr;
	int err;

	xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, umem->length);
	if (xlt_with_umr) {
		mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
	} else {
		unsigned int page_size = mlx5_umem_find_best_pgsz(
			umem, mkc, log_page_size, 0, iova);

		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size, true);
		mutex_unlock(&dev->slow_path_mutex);
	}
	if (IS_ERR(mr)) {
		ib_umem_release(umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

	if (xlt_with_umr) {
		/*
		 * If the MR was created with reg_create then it will be
		 * configured properly but left disabled. It is safe to go ahead
		 * and configure it again via UMR while enabling it.
		 */
		err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
		if (err) {
			dereg_mr(dev, mr);
			return ERR_PTR(err);
		}
	}
	return &mr->ibmr;
}

static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
					u64 iova, int access_flags,
					struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	if (!start && length == U64_MAX) {
		if (iova != 0)
			return ERR_PTR(-EINVAL);
		if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
			return ERR_PTR(-EINVAL);

		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), udata, access_flags);
		if (IS_ERR(mr))
			return ERR_CAST(mr);
		return &mr->ibmr;
	}

	/* ODP requires xlt update via umr to work.
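	 * Without UMR support for XLT updates the ODP page tables could not
	 * be maintained as pages are faulted in, so the request is rejected
	 * below.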
	 */
	if (!mlx5_ib_can_load_pas_with_umr(dev, length))
		return ERR_PTR(-EINVAL);

	odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
			      &mlx5_mn_ops);
	if (IS_ERR(odp))
		return ERR_CAST(odp);

	mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
	if (IS_ERR(mr)) {
		ib_umem_release(&odp->umem);
		return ERR_CAST(mr);
	}

	odp->private = mr;
	init_waitqueue_head(&mr->q_deferred_work);
	atomic_set(&mr->num_deferred_work, 0);
	err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
			      &mr->mmkey, GFP_KERNEL));
	if (err)
		goto err_dereg_mr;

	err = mlx5_ib_init_odp_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	dereg_mr(dev, mr);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 iova, int access_flags,
				  struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem *umem;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		    start, iova, length, access_flags);

	if (access_flags & IB_ACCESS_ON_DEMAND)
		return create_user_odp_mr(pd, start, length, iova, access_flags,
					  udata);
	umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
	if (IS_ERR(umem))
		return ERR_CAST(umem);
	return create_real_mr(pd, umem, iova, access_flags);
}

/**
 * mlx5_mr_cache_invalidate - Fence all DMA on the MR
 * @mr: The MR to fence
 *
 * Upon return the NIC will not be doing any DMA to the pages under the MR,
 * and any DMA in progress will be completed. Failure of this function
 * indicates the HW has failed catastrophically.
 */
int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr)
{
	struct mlx5_umr_wr umrwr = {};

	if (mr_to_mdev(mr)->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
		return 0;

	umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
			      MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
	umrwr.wr.opcode = MLX5_IB_WR_UMR;
	umrwr.pd = mr_to_mdev(mr)->umrc.pd;
	umrwr.mkey = mr->mmkey.key;
	umrwr.ignore_free_state = 1;

	return mlx5_ib_post_send_wait(mr_to_mdev(mr), &umrwr);
}

/*
 * True if the change in access flags can be done via UMR, only some access
 * flags can be updated.
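 * Only the local write, remote write, remote read and relaxed ordering bits
 * may differ between the current and target flags; any other difference means
 * UMR cannot be used.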
1598 */ 1599 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev, 1600 unsigned int current_access_flags, 1601 unsigned int target_access_flags) 1602 { 1603 unsigned int diffs = current_access_flags ^ target_access_flags; 1604 1605 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | 1606 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING)) 1607 return false; 1608 return mlx5_ib_can_reconfig_with_umr(dev, current_access_flags, 1609 target_access_flags); 1610 } 1611 1612 static int umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, 1613 int access_flags) 1614 { 1615 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1616 struct mlx5_umr_wr umrwr = { 1617 .wr = { 1618 .send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | 1619 MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS, 1620 .opcode = MLX5_IB_WR_UMR, 1621 }, 1622 .mkey = mr->mmkey.key, 1623 .pd = pd, 1624 .access_flags = access_flags, 1625 }; 1626 int err; 1627 1628 err = mlx5_ib_post_send_wait(dev, &umrwr); 1629 if (err) 1630 return err; 1631 1632 mr->access_flags = access_flags; 1633 mr->mmkey.pd = to_mpd(pd)->pdn; 1634 return 0; 1635 } 1636 1637 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, 1638 struct ib_umem *new_umem, 1639 int new_access_flags, u64 iova, 1640 unsigned long *page_size) 1641 { 1642 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1643 1644 /* We only track the allocated sizes of MRs from the cache */ 1645 if (!mr->cache_ent) 1646 return false; 1647 if (!mlx5_ib_can_load_pas_with_umr(dev, new_umem->length)) 1648 return false; 1649 1650 *page_size = 1651 mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); 1652 if (WARN_ON(!*page_size)) 1653 return false; 1654 return (1ULL << mr->cache_ent->order) >= 1655 ib_umem_num_dma_blocks(new_umem, *page_size); 1656 } 1657 1658 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd, 1659 int access_flags, int flags, struct ib_umem *new_umem, 1660 u64 iova, unsigned long page_size) 1661 { 1662 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1663 int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE; 1664 struct ib_umem *old_umem = mr->umem; 1665 int err; 1666 1667 /* 1668 * To keep everything simple the MR is revoked before we start to mess 1669 * with it. This ensure the change is atomic relative to any use of the 1670 * MR. 1671 */ 1672 err = mlx5_mr_cache_invalidate(mr); 1673 if (err) 1674 return err; 1675 1676 if (flags & IB_MR_REREG_PD) { 1677 mr->ibmr.pd = pd; 1678 mr->mmkey.pd = to_mpd(pd)->pdn; 1679 upd_flags |= MLX5_IB_UPD_XLT_PD; 1680 } 1681 if (flags & IB_MR_REREG_ACCESS) { 1682 mr->access_flags = access_flags; 1683 upd_flags |= MLX5_IB_UPD_XLT_ACCESS; 1684 } 1685 1686 mr->ibmr.length = new_umem->length; 1687 mr->mmkey.iova = iova; 1688 mr->mmkey.size = new_umem->length; 1689 mr->page_shift = order_base_2(page_size); 1690 mr->umem = new_umem; 1691 err = mlx5_ib_update_mr_pas(mr, upd_flags); 1692 if (err) { 1693 /* 1694 * The MR is revoked at this point so there is no issue to free 1695 * new_umem. 
1696 */ 1697 mr->umem = old_umem; 1698 return err; 1699 } 1700 1701 atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages); 1702 ib_umem_release(old_umem); 1703 atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages); 1704 return 0; 1705 } 1706 1707 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, 1708 u64 length, u64 iova, int new_access_flags, 1709 struct ib_pd *new_pd, 1710 struct ib_udata *udata) 1711 { 1712 struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); 1713 struct mlx5_ib_mr *mr = to_mmr(ib_mr); 1714 int err; 1715 1716 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) 1717 return ERR_PTR(-EOPNOTSUPP); 1718 1719 mlx5_ib_dbg( 1720 dev, 1721 "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1722 start, iova, length, new_access_flags); 1723 1724 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) 1725 return ERR_PTR(-EOPNOTSUPP); 1726 1727 if (!(flags & IB_MR_REREG_ACCESS)) 1728 new_access_flags = mr->access_flags; 1729 if (!(flags & IB_MR_REREG_PD)) 1730 new_pd = ib_mr->pd; 1731 1732 if (!(flags & IB_MR_REREG_TRANS)) { 1733 struct ib_umem *umem; 1734 1735 /* Fast path for PD/access change */ 1736 if (can_use_umr_rereg_access(dev, mr->access_flags, 1737 new_access_flags)) { 1738 err = umr_rereg_pd_access(mr, new_pd, new_access_flags); 1739 if (err) 1740 return ERR_PTR(err); 1741 return NULL; 1742 } 1743 /* DM or ODP MR's don't have a umem so we can't re-use it */ 1744 if (!mr->umem || is_odp_mr(mr)) 1745 goto recreate; 1746 1747 /* 1748 * Only one active MR can refer to a umem at one time, revoke 1749 * the old MR before assigning the umem to the new one. 1750 */ 1751 err = mlx5_mr_cache_invalidate(mr); 1752 if (err) 1753 return ERR_PTR(err); 1754 umem = mr->umem; 1755 mr->umem = NULL; 1756 atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); 1757 1758 return create_real_mr(new_pd, umem, mr->mmkey.iova, 1759 new_access_flags); 1760 } 1761 1762 /* 1763 * DM doesn't have a PAS list so we can't re-use it, odp does but the 1764 * logic around releasing the umem is different 1765 */ 1766 if (!mr->umem || is_odp_mr(mr)) 1767 goto recreate; 1768 1769 if (!(new_access_flags & IB_ACCESS_ON_DEMAND) && 1770 can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) { 1771 struct ib_umem *new_umem; 1772 unsigned long page_size; 1773 1774 new_umem = ib_umem_get(&dev->ib_dev, start, length, 1775 new_access_flags); 1776 if (IS_ERR(new_umem)) 1777 return ERR_CAST(new_umem); 1778 1779 /* Fast path for PAS change */ 1780 if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova, 1781 &page_size)) { 1782 err = umr_rereg_pas(mr, new_pd, new_access_flags, flags, 1783 new_umem, iova, page_size); 1784 if (err) { 1785 ib_umem_release(new_umem); 1786 return ERR_PTR(err); 1787 } 1788 return NULL; 1789 } 1790 return create_real_mr(new_pd, new_umem, iova, new_access_flags); 1791 } 1792 1793 /* 1794 * Everything else has no state we can preserve, just create a new MR 1795 * from scratch 1796 */ 1797 recreate: 1798 return mlx5_ib_reg_user_mr(new_pd, start, length, iova, 1799 new_access_flags, udata); 1800 } 1801 1802 static int 1803 mlx5_alloc_priv_descs(struct ib_device *device, 1804 struct mlx5_ib_mr *mr, 1805 int ndescs, 1806 int desc_size) 1807 { 1808 struct mlx5_ib_dev *dev = to_mdev(device); 1809 struct device *ddev = &dev->mdev->pdev->dev; 1810 int size = ndescs * desc_size; 1811 int add_size; 1812 int ret; 1813 1814 add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0); 1815 1816 
	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
	if (!mr->descs_alloc)
		return -ENOMEM;

	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);

	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, mr->desc_map)) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	kfree(mr->descs_alloc);

	return ret;
}

static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
	if (mr->descs) {
		struct ib_device *device = mr->ibmr.device;
		int size = mr->max_descs * mr->desc_size;
		struct mlx5_ib_dev *dev = to_mdev(device);

		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
				 DMA_TO_DEVICE);
		kfree(mr->descs_alloc);
		mr->descs = NULL;
	}
}

static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	if (mr->sig) {
		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_memory.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
				     mr->sig->psv_memory.psv_idx);
		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_wire.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
				     mr->sig->psv_wire.psv_idx);
		xa_erase(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key));
		kfree(mr->sig);
		mr->sig = NULL;
	}

	if (!mr->cache_ent) {
		destroy_mkey(dev, mr);
		mlx5_free_priv_descs(mr);
	}
}

static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	struct ib_umem *umem = mr->umem;

	/* Stop all DMA */
	if (is_odp_mr(mr))
		mlx5_ib_fence_odp_mr(mr);
	else
		clean_mr(dev, mr);

	if (umem) {
		if (!is_odp_mr(mr))
			atomic_sub(ib_umem_num_pages(umem),
				   &dev->mdev->priv.reg_pages);
		ib_umem_release(umem);
	}

	if (mr->cache_ent)
		mlx5_mr_cache_free(dev, mr);
	else
		kfree(mr);
}

int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);

	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
		dereg_mr(to_mdev(mmr->mtt_mr->ibmr.device), mmr->mtt_mr);
		dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr);
	}

	if (is_odp_mr(mmr) && to_ib_umem_odp(mmr->umem)->is_implicit_odp) {
		mlx5_ib_free_implicit_mr(mmr);
		return 0;
	}

	dereg_mr(to_mdev(ibmr->device), mmr);

	return 0;
}

static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
				   int access_mode, int page_shift)
{
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	/* This is only used from the kernel, so setting the PD is OK.
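	 * The mkey is created in the free state with umr_en set, so it can be
	 * populated and enabled later through UMR.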
	 */
	set_mkc_access_pd_addr_fields(mkc, 0, 0, pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, log_page_size, page_shift);
}

static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				  int ndescs, int desc_size, int page_shift,
				  int access_mode, u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int err;

	mr->access_mode = access_mode;
	mr->desc_size = desc_size;
	mr->max_descs = ndescs;

	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
	if (err)
		return err;

	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_free_descs;

	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	return 0;

err_free_descs:
	mlx5_free_priv_descs(mr);
	return err;
}

static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
				u32 max_num_sg, u32 max_num_meta_sg,
				int desc_size, int access_mode)
{
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
	int page_shift = 0;
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->ibmr.device = pd->device;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		page_shift = PAGE_SHIFT;

	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
				     access_mode, in, inlen);
	if (err)
		goto err_free_in;

	mr->umem = NULL;
	kfree(in);

	return mr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
				      inlen);
}

static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
}

static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				      int max_num_sg, int max_num_meta_sg,
				      u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 psv_index[2];
	void *mkc;
	int err;

	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
	if (!mr->sig)
		return -ENOMEM;

	/* create mem & wire PSVs */
	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
	if (err)
		goto err_free_sig;

	mr->sig->psv_memory.psv_idx = psv_index[0];
	mr->sig->psv_wire.psv_idx = psv_index[1];

	mr->sig->sig_status_checked = true;
	mr->sig->sig_err_exists = false;
	/* Next UMR, Arm SIGERR */
	++mr->sig->sigerr_count;
	mr->klm_mr =
	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_klm),
					 MLX5_MKC_ACCESS_MODE_KLMS);
	if (IS_ERR(mr->klm_mr)) {
		err = PTR_ERR(mr->klm_mr);
		goto err_destroy_psv;
	}
	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_mtt),
					 MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr->mtt_mr)) {
		err = PTR_ERR(mr->mtt_mr);
		goto err_free_klm_mr;
	}

	/* Set bsf descriptors for mkey */
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, bsf_en, 1);
	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);

	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
	if (err)
		goto err_free_mtt_mr;

	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			      mr->sig, GFP_KERNEL));
	if (err)
		goto err_free_descs;
	return 0;

err_free_descs:
	destroy_mkey(dev, mr);
	mlx5_free_priv_descs(mr);
err_free_mtt_mr:
	dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr);
	mr->mtt_mr = NULL;
err_free_klm_mr:
	dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr);
	mr->klm_mr = NULL;
err_destroy_psv:
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
			     mr->sig->psv_memory.psv_idx);
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
			     mr->sig->psv_wire.psv_idx);
err_free_sig:
	kfree(mr->sig);

	return err;
}

static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
					enum ib_mr_type mr_type, u32 max_num_sg,
					u32 max_num_meta_sg)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg, 4);
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mr->ibmr.device = pd->device;
	mr->umem = NULL;

	switch (mr_type) {
	case IB_MR_TYPE_MEM_REG:
		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_SG_GAPS:
		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_INTEGRITY:
		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
						 max_num_meta_sg, in, inlen);
		break;
	default:
		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
		err = -EINVAL;
	}

	if (err)
		goto err_free_in;

	kfree(in);

	return &mr->ibmr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
}

struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
					 u32 max_num_sg, u32 max_num_meta_sg)
{
	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
				  max_num_meta_sg);
}
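
/*
 * The two entry points above are reached through the generic verbs layer.
 * Purely as an illustration (not part of this driver, and with made-up
 * variable names), a kernel ULP that wants a fast-registration MR would
 * typically do something like:
 *
 *	struct ib_mr *mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, nents);
 *	...
 *	n = ib_map_mr_sg(mr, sgl, nents, NULL, PAGE_SIZE);
 *	(then post an IB_WR_REG_MR work request referencing mr, use its
 *	 lkey/rkey for I/O, and eventually call ib_dereg_mr(mr))
 *
 * which lands in mlx5_ib_alloc_mr(), mlx5_ib_map_mr_sg() and
 * mlx5_ib_dereg_mr() here.
 */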

int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mw *mw = to_mmw(ibmw);
	u32 *in = NULL;
	void *mkc;
	int ndescs;
	int err;
	struct mlx5_ib_alloc_mw req = {};
	struct {
		__u32	comp_mask;
		__u32	response_length;
	} resp = {};

	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
	if (err)
		return err;

	if (req.comp_mask || req.reserved1 || req.reserved2)
		return -EOPNOTSUPP;

	if (udata->inlen > sizeof(req) &&
	    !ib_is_udata_cleared(udata, sizeof(req),
				 udata->inlen - sizeof(req)))
		return -EOPNOTSUPP;

	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
	MLX5_SET(mkc, mkc, en_rinval, !!(ibmw->type == IB_MW_TYPE_2));
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
	if (err)
		goto free;

	mw->mmkey.type = MLX5_MKEY_MW;
	ibmw->rkey = mw->mmkey.key;
	mw->ndescs = ndescs;

	resp.response_length =
		min(offsetofend(typeof(resp), response_length), udata->outlen);
	if (resp.response_length) {
		err = ib_copy_to_udata(udata, &resp, resp.response_length);
		if (err)
			goto free_mkey;
	}

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
		err = xa_err(xa_store(&dev->odp_mkeys,
				      mlx5_base_mkey(mw->mmkey.key), &mw->mmkey,
				      GFP_KERNEL));
		if (err)
			goto free_mkey;
	}

	kfree(in);
	return 0;

free_mkey:
	mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
free:
	kfree(in);
	return err;
}
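
/*
 * MW teardown is ordered against the ODP page-fault path: the mkey is
 * first removed from the odp_mkeys xarray and the SRCU grace period is
 * awaited before the mkey itself is destroyed, so a concurrent page
 * fault can never look up a freed mkey.
 */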

int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
	struct mlx5_ib_dev *dev = to_mdev(mw->device);
	struct mlx5_ib_mw *mmw = to_mmw(mw);

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
		xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key));
		/*
		 * pagefault_single_data_segment() may be accessing mmw under
		 * SRCU if the user bound an ODP MR to this MW.
		 */
		synchronize_srcu(&dev->odp_srcu);
	}

	return mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
}

int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
	int ret = 0;

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");
		ret = -EINVAL;
		goto done;
	}

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
		if (!mmr->sig) {
			ret = -EINVAL;
			pr_err("signature status check requested on a non-signature enabled MR\n");
			goto done;
		}

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)
			goto done;

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));
		else {
			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;
		}

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
	}

done:
	return ret;
}

static int
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			int data_sg_nents, unsigned int *data_sg_offset,
			struct scatterlist *meta_sg, int meta_sg_nents,
			unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	unsigned int sg_offset = 0;
	int n = 0;

	mr->meta_length = 0;
	if (data_sg_nents == 1) {
		n++;
		mr->ndescs = 1;
		if (data_sg_offset)
			sg_offset = *data_sg_offset;
		mr->data_length = sg_dma_len(data_sg) - sg_offset;
		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
		if (meta_sg_nents == 1) {
			n++;
			mr->meta_ndescs = 1;
			if (meta_sg_offset)
				sg_offset = *meta_sg_offset;
			else
				sg_offset = 0;
			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
		}
		ibmr->length = mr->data_length + mr->meta_length;
	}

	return n;
}
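
/*
 * Build one KLM entry per SG element: each entry carries the DMA address,
 * the byte count and the PD's local_dma_lkey. Data entries are written
 * first; metadata entries (if any) follow them in the same descriptor
 * array.
 */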

static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p,
		   struct scatterlist *meta_sgl,
		   unsigned short meta_sg_nents,
		   unsigned int *meta_sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i, j = 0;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg) - sg_offset;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	mr->ndescs = i;
	mr->data_length = mr->ibmr.length;

	if (meta_sg_nents) {
		sg = meta_sgl;
		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
			if (unlikely(i + j >= mr->max_descs))
				break;
			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
						     sg_offset);
			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
							 sg_offset);
			klms[i + j].key = cpu_to_be32(lkey);
			mr->ibmr.length += sg_dma_len(sg) - sg_offset;

			sg_offset = 0;
		}
		if (meta_sg_offset_p)
			*meta_sg_offset_p = sg_offset;

		mr->meta_ndescs = j;
		mr->meta_length = mr->ibmr.length - mr->data_length;
	}

	return i + j;
}

static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->ndescs + mr->meta_ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->ndescs + mr->meta_ndescs++] =
		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
	int n;

	pi_mr->ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	pi_mr->ibmr.page_size = ibmr->page_size;
	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
			   mlx5_set_page);
	if (n != data_sg_nents)
		return n;

	pi_mr->data_iova = pi_mr->ibmr.iova;
	pi_mr->data_length = pi_mr->ibmr.length;
	pi_mr->ibmr.length = pi_mr->data_length;
	ibmr->length = pi_mr->data_length;

	if (meta_sg_nents) {
		u64 page_mask = ~((u64)ibmr->page_size - 1);
		u64 iova = pi_mr->data_iova;

		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
				    meta_sg_offset, mlx5_set_page_pi);

		pi_mr->meta_length = pi_mr->ibmr.length;
		/*
		 * PI address for the HW is the offset of the metadata address
		 * relative to the first data page address.
		 * It equals the first data page address + size of data pages +
		 * metadata offset within the first metadata page.
		 */
		pi_mr->pi_iova = (iova & page_mask) +
				 pi_mr->ndescs * ibmr->page_size +
				 (pi_mr->ibmr.iova & ~page_mask);
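		/*
		 * Worked example (illustrative numbers only): with a 4K page
		 * size, data starting at DMA address 0x11000 and covering two
		 * pages (ndescs == 2), and metadata starting at 0x30234, the
		 * line above yields 0x11000 + 2 * 0x1000 + 0x234 = 0x13234.
		 */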
		/*
		 * In order to use one MTT MR for data and metadata, we also
		 * register the gaps between the end of the data and the start
		 * of the metadata (the sig MR will verify that the HW accesses
		 * the right addresses). This mapping is safe because we use an
		 * internal mkey for the registration.
		 */
		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
		pi_mr->ibmr.iova = iova;
		ibmr->length += pi_mr->meta_length;
	}

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}

static int
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
	int n;

	pi_mr->ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
			       meta_sg, meta_sg_nents, meta_sg_offset);

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	/* This is a zero-based memory region */
	pi_mr->data_iova = 0;
	pi_mr->ibmr.iova = 0;
	pi_mr->pi_iova = pi_mr->data_length;
	ibmr->length = pi_mr->ibmr.length;

	return n;
}
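
/*
 * mlx5_ib_map_mr_sg_pi() below implements the ->map_mr_sg_pi verb. Purely
 * as an illustration (not part of this driver, with made-up variable
 * names), an integrity-aware ULP typically drives it through the generic
 * API roughly like this:
 *
 *	mr = ib_alloc_mr_integrity(pd, max_data_sg, max_meta_sg);
 *	...
 *	n = ib_map_mr_sg_pi(mr, data_sg, data_nents, NULL,
 *			    meta_sg, meta_nents, NULL, PAGE_SIZE);
 *	(then post an IB_WR_REG_MR_INTEGRITY work request, run the I/O, and
 *	 call ib_check_mr_status(mr, IB_MR_CHECK_SIG_STATUS, &status) to
 *	 learn about signature errors)
 */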
2561 */ 2562 pi_mr = mr->mtt_mr; 2563 n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents, 2564 data_sg_offset, meta_sg, meta_sg_nents, 2565 meta_sg_offset); 2566 if (n == data_sg_nents + meta_sg_nents) 2567 goto out; 2568 2569 pi_mr = mr->klm_mr; 2570 n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents, 2571 data_sg_offset, meta_sg, meta_sg_nents, 2572 meta_sg_offset); 2573 if (unlikely(n != data_sg_nents + meta_sg_nents)) 2574 return -ENOMEM; 2575 2576 out: 2577 /* This is zero-based memory region */ 2578 ibmr->iova = 0; 2579 mr->pi_mr = pi_mr; 2580 if (pi_mr) 2581 ibmr->sig_attrs->meta_length = pi_mr->meta_length; 2582 else 2583 ibmr->sig_attrs->meta_length = mr->meta_length; 2584 2585 return 0; 2586 } 2587 2588 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, 2589 unsigned int *sg_offset) 2590 { 2591 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2592 int n; 2593 2594 mr->ndescs = 0; 2595 2596 ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map, 2597 mr->desc_size * mr->max_descs, 2598 DMA_TO_DEVICE); 2599 2600 if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) 2601 n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0, 2602 NULL); 2603 else 2604 n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, 2605 mlx5_set_page); 2606 2607 ib_dma_sync_single_for_device(ibmr->device, mr->desc_map, 2608 mr->desc_size * mr->max_descs, 2609 DMA_TO_DEVICE); 2610 2611 return n; 2612 } 2613