/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 * Copyright (c) 2020, Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */


#include <linux/kref.h>
#include <linux/random.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <rdma/ib_verbs.h>
#include "dm.h"
#include "mlx5_ib.h"
#include "umr.h"

enum {
	MAX_PENDING_REG_MR = 8,
};

#define MLX5_UMR_ALIGN 2048

static void
create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned int page_size, bool populate);

static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
					  struct ib_pd *pd)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);

	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, mkc, lr, 1);

	if ((acc & IB_ACCESS_RELAXED_ORDERING) &&
	    pcie_relaxed_ordering_enabled(dev->mdev->pdev)) {
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
			MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
			MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
	}

	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET64(mkc, mkc, start_addr, start_addr);
}

static void assign_mkey_variant(struct mlx5_ib_dev *dev,
				struct mlx5_ib_mkey *mkey, u32 *in)
{
	u8 key = atomic_inc_return(&dev->mkey_var);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, mkey_7_0, key);
	mkey->key = key;
}

static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
			       struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
{
	int ret;

	assign_mkey_variant(dev, mkey, in);
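	/*
	 * mlx5_core_create_mkey() below combines the firmware-assigned mkey
	 * index with the variant byte programmed into mkey_7_0 above,
	 * roughly mkey->key = mlx5_idx_to_mkey(mkey_index) | variant; the
	 * async path in create_mkey_callback() performs the same OR
	 * explicitly.
	 */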
ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen); 103 if (!ret) 104 init_waitqueue_head(&mkey->wait); 105 106 return ret; 107 } 108 109 static int 110 mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev, 111 struct mlx5_ib_mkey *mkey, 112 struct mlx5_async_ctx *async_ctx, 113 u32 *in, int inlen, u32 *out, int outlen, 114 struct mlx5_async_work *context) 115 { 116 MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); 117 assign_mkey_variant(dev, mkey, in); 118 return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen, 119 create_mkey_callback, context); 120 } 121 122 static int mr_cache_max_order(struct mlx5_ib_dev *dev); 123 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent); 124 125 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 126 { 127 WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))); 128 129 return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key); 130 } 131 132 static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out) 133 { 134 if (status == -ENXIO) /* core driver is not available */ 135 return; 136 137 mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); 138 if (status != -EREMOTEIO) /* driver specific failure */ 139 return; 140 141 /* Failed in FW, print cmd out failure details */ 142 mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out); 143 } 144 145 static void create_mkey_callback(int status, struct mlx5_async_work *context) 146 { 147 struct mlx5_ib_mr *mr = 148 container_of(context, struct mlx5_ib_mr, cb_work); 149 struct mlx5_cache_ent *ent = mr->cache_ent; 150 struct mlx5_ib_dev *dev = ent->dev; 151 unsigned long flags; 152 153 if (status) { 154 create_mkey_warn(dev, status, mr->out); 155 kfree(mr); 156 spin_lock_irqsave(&ent->lock, flags); 157 ent->pending--; 158 WRITE_ONCE(dev->fill_delay, 1); 159 spin_unlock_irqrestore(&ent->lock, flags); 160 mod_timer(&dev->delay_timer, jiffies + HZ); 161 return; 162 } 163 164 mr->mmkey.type = MLX5_MKEY_MR; 165 mr->mmkey.key |= mlx5_idx_to_mkey( 166 MLX5_GET(create_mkey_out, mr->out, mkey_index)); 167 init_waitqueue_head(&mr->mmkey.wait); 168 169 WRITE_ONCE(dev->cache.last_add, jiffies); 170 171 spin_lock_irqsave(&ent->lock, flags); 172 list_add_tail(&mr->list, &ent->head); 173 ent->available_mrs++; 174 ent->total_mrs++; 175 /* If we are doing fill_to_high_water then keep going. 
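	 * The cache keeps available_mrs between ent->limit (the low water
	 * mark) and 2 * ent->limit (the high water mark);
	 * queue_adjust_cache_locked() below re-evaluates those bounds now
	 * that this MR has been added.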
*/ 176 queue_adjust_cache_locked(ent); 177 ent->pending--; 178 spin_unlock_irqrestore(&ent->lock, flags); 179 } 180 181 static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs) 182 { 183 int ret = 0; 184 185 switch (access_mode) { 186 case MLX5_MKC_ACCESS_MODE_MTT: 187 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / 188 sizeof(struct mlx5_mtt)); 189 break; 190 case MLX5_MKC_ACCESS_MODE_KSM: 191 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / 192 sizeof(struct mlx5_klm)); 193 break; 194 default: 195 WARN_ON(1); 196 } 197 return ret; 198 } 199 200 static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc) 201 { 202 struct mlx5_ib_mr *mr; 203 204 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 205 if (!mr) 206 return NULL; 207 mr->cache_ent = ent; 208 209 set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd); 210 MLX5_SET(mkc, mkc, free, 1); 211 MLX5_SET(mkc, mkc, umr_en, 1); 212 MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3); 213 MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7); 214 215 MLX5_SET(mkc, mkc, translations_octword_size, 216 get_mkc_octo_size(ent->access_mode, ent->ndescs)); 217 MLX5_SET(mkc, mkc, log_page_size, ent->page); 218 return mr; 219 } 220 221 /* Asynchronously schedule new MRs to be populated in the cache. */ 222 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num) 223 { 224 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 225 struct mlx5_ib_mr *mr; 226 void *mkc; 227 u32 *in; 228 int err = 0; 229 int i; 230 231 in = kzalloc(inlen, GFP_KERNEL); 232 if (!in) 233 return -ENOMEM; 234 235 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 236 for (i = 0; i < num; i++) { 237 mr = alloc_cache_mr(ent, mkc); 238 if (!mr) { 239 err = -ENOMEM; 240 break; 241 } 242 spin_lock_irq(&ent->lock); 243 if (ent->pending >= MAX_PENDING_REG_MR) { 244 err = -EAGAIN; 245 spin_unlock_irq(&ent->lock); 246 kfree(mr); 247 break; 248 } 249 ent->pending++; 250 spin_unlock_irq(&ent->lock); 251 err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey, 252 &ent->dev->async_ctx, in, inlen, 253 mr->out, sizeof(mr->out), 254 &mr->cb_work); 255 if (err) { 256 spin_lock_irq(&ent->lock); 257 ent->pending--; 258 spin_unlock_irq(&ent->lock); 259 mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err); 260 kfree(mr); 261 break; 262 } 263 } 264 265 kfree(in); 266 return err; 267 } 268 269 /* Synchronously create a MR in the cache */ 270 static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent) 271 { 272 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 273 struct mlx5_ib_mr *mr; 274 void *mkc; 275 u32 *in; 276 int err; 277 278 in = kzalloc(inlen, GFP_KERNEL); 279 if (!in) 280 return ERR_PTR(-ENOMEM); 281 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 282 283 mr = alloc_cache_mr(ent, mkc); 284 if (!mr) { 285 err = -ENOMEM; 286 goto free_in; 287 } 288 289 err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen); 290 if (err) 291 goto free_mr; 292 293 init_waitqueue_head(&mr->mmkey.wait); 294 mr->mmkey.type = MLX5_MKEY_MR; 295 WRITE_ONCE(ent->dev->cache.last_add, jiffies); 296 spin_lock_irq(&ent->lock); 297 ent->total_mrs++; 298 spin_unlock_irq(&ent->lock); 299 kfree(in); 300 return mr; 301 free_mr: 302 kfree(mr); 303 free_in: 304 kfree(in); 305 return ERR_PTR(err); 306 } 307 308 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent) 309 { 310 struct mlx5_ib_mr *mr; 311 312 lockdep_assert_held(&ent->lock); 313 if (list_empty(&ent->head)) 314 return; 315 mr = 
list_first_entry(&ent->head, struct mlx5_ib_mr, list); 316 list_del(&mr->list); 317 ent->available_mrs--; 318 ent->total_mrs--; 319 spin_unlock_irq(&ent->lock); 320 mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key); 321 kfree(mr); 322 spin_lock_irq(&ent->lock); 323 } 324 325 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target, 326 bool limit_fill) 327 { 328 int err; 329 330 lockdep_assert_held(&ent->lock); 331 332 while (true) { 333 if (limit_fill) 334 target = ent->limit * 2; 335 if (target == ent->available_mrs + ent->pending) 336 return 0; 337 if (target > ent->available_mrs + ent->pending) { 338 u32 todo = target - (ent->available_mrs + ent->pending); 339 340 spin_unlock_irq(&ent->lock); 341 err = add_keys(ent, todo); 342 if (err == -EAGAIN) 343 usleep_range(3000, 5000); 344 spin_lock_irq(&ent->lock); 345 if (err) { 346 if (err != -EAGAIN) 347 return err; 348 } else 349 return 0; 350 } else { 351 remove_cache_mr_locked(ent); 352 } 353 } 354 } 355 356 static ssize_t size_write(struct file *filp, const char __user *buf, 357 size_t count, loff_t *pos) 358 { 359 struct mlx5_cache_ent *ent = filp->private_data; 360 u32 target; 361 int err; 362 363 err = kstrtou32_from_user(buf, count, 0, &target); 364 if (err) 365 return err; 366 367 /* 368 * Target is the new value of total_mrs the user requests, however we 369 * cannot free MRs that are in use. Compute the target value for 370 * available_mrs. 371 */ 372 spin_lock_irq(&ent->lock); 373 if (target < ent->total_mrs - ent->available_mrs) { 374 err = -EINVAL; 375 goto err_unlock; 376 } 377 target = target - (ent->total_mrs - ent->available_mrs); 378 if (target < ent->limit || target > ent->limit*2) { 379 err = -EINVAL; 380 goto err_unlock; 381 } 382 err = resize_available_mrs(ent, target, false); 383 if (err) 384 goto err_unlock; 385 spin_unlock_irq(&ent->lock); 386 387 return count; 388 389 err_unlock: 390 spin_unlock_irq(&ent->lock); 391 return err; 392 } 393 394 static ssize_t size_read(struct file *filp, char __user *buf, size_t count, 395 loff_t *pos) 396 { 397 struct mlx5_cache_ent *ent = filp->private_data; 398 char lbuf[20]; 399 int err; 400 401 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs); 402 if (err < 0) 403 return err; 404 405 return simple_read_from_buffer(buf, count, pos, lbuf, err); 406 } 407 408 static const struct file_operations size_fops = { 409 .owner = THIS_MODULE, 410 .open = simple_open, 411 .write = size_write, 412 .read = size_read, 413 }; 414 415 static ssize_t limit_write(struct file *filp, const char __user *buf, 416 size_t count, loff_t *pos) 417 { 418 struct mlx5_cache_ent *ent = filp->private_data; 419 u32 var; 420 int err; 421 422 err = kstrtou32_from_user(buf, count, 0, &var); 423 if (err) 424 return err; 425 426 /* 427 * Upon set we immediately fill the cache to high water mark implied by 428 * the limit. 
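	 * resize_available_mrs(ent, 0, true) below recomputes the target as
	 * ent->limit * 2 and adds or removes cached entries until
	 * available_mrs + pending matches it.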
429 */ 430 spin_lock_irq(&ent->lock); 431 ent->limit = var; 432 err = resize_available_mrs(ent, 0, true); 433 spin_unlock_irq(&ent->lock); 434 if (err) 435 return err; 436 return count; 437 } 438 439 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count, 440 loff_t *pos) 441 { 442 struct mlx5_cache_ent *ent = filp->private_data; 443 char lbuf[20]; 444 int err; 445 446 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit); 447 if (err < 0) 448 return err; 449 450 return simple_read_from_buffer(buf, count, pos, lbuf, err); 451 } 452 453 static const struct file_operations limit_fops = { 454 .owner = THIS_MODULE, 455 .open = simple_open, 456 .write = limit_write, 457 .read = limit_read, 458 }; 459 460 static bool someone_adding(struct mlx5_mr_cache *cache) 461 { 462 unsigned int i; 463 464 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 465 struct mlx5_cache_ent *ent = &cache->ent[i]; 466 bool ret; 467 468 spin_lock_irq(&ent->lock); 469 ret = ent->available_mrs < ent->limit; 470 spin_unlock_irq(&ent->lock); 471 if (ret) 472 return true; 473 } 474 return false; 475 } 476 477 /* 478 * Check if the bucket is outside the high/low water mark and schedule an async 479 * update. The cache refill has hysteresis, once the low water mark is hit it is 480 * refilled up to the high mark. 481 */ 482 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) 483 { 484 lockdep_assert_held(&ent->lock); 485 486 if (ent->disabled || READ_ONCE(ent->dev->fill_delay)) 487 return; 488 if (ent->available_mrs < ent->limit) { 489 ent->fill_to_high_water = true; 490 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 491 } else if (ent->fill_to_high_water && 492 ent->available_mrs + ent->pending < 2 * ent->limit) { 493 /* 494 * Once we start populating due to hitting a low water mark 495 * continue until we pass the high water mark. 496 */ 497 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 498 } else if (ent->available_mrs == 2 * ent->limit) { 499 ent->fill_to_high_water = false; 500 } else if (ent->available_mrs > 2 * ent->limit) { 501 /* Queue deletion of excess entries */ 502 ent->fill_to_high_water = false; 503 if (ent->pending) 504 queue_delayed_work(ent->dev->cache.wq, &ent->dwork, 505 msecs_to_jiffies(1000)); 506 else 507 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 508 } 509 } 510 511 static void __cache_work_func(struct mlx5_cache_ent *ent) 512 { 513 struct mlx5_ib_dev *dev = ent->dev; 514 struct mlx5_mr_cache *cache = &dev->cache; 515 int err; 516 517 spin_lock_irq(&ent->lock); 518 if (ent->disabled) 519 goto out; 520 521 if (ent->fill_to_high_water && 522 ent->available_mrs + ent->pending < 2 * ent->limit && 523 !READ_ONCE(dev->fill_delay)) { 524 spin_unlock_irq(&ent->lock); 525 err = add_keys(ent, 1); 526 spin_lock_irq(&ent->lock); 527 if (ent->disabled) 528 goto out; 529 if (err) { 530 /* 531 * EAGAIN only happens if pending is positive, so we 532 * will be rescheduled from reg_mr_callback(). The only 533 * failure path here is ENOMEM. 534 */ 535 if (err != -EAGAIN) { 536 mlx5_ib_warn( 537 dev, 538 "command failed order %d, err %d\n", 539 ent->order, err); 540 queue_delayed_work(cache->wq, &ent->dwork, 541 msecs_to_jiffies(1000)); 542 } 543 } 544 } else if (ent->available_mrs > 2 * ent->limit) { 545 bool need_delay; 546 547 /* 548 * The remove_cache_mr() logic is performed as garbage 549 * collection task. Such task is intended to be run when no 550 * other active processes are running. 
551 * 552 * The need_resched() will return TRUE if there are user tasks 553 * to be activated in near future. 554 * 555 * In such case, we don't execute remove_cache_mr() and postpone 556 * the garbage collection work to try to run in next cycle, in 557 * order to free CPU resources to other tasks. 558 */ 559 spin_unlock_irq(&ent->lock); 560 need_delay = need_resched() || someone_adding(cache) || 561 !time_after(jiffies, 562 READ_ONCE(cache->last_add) + 300 * HZ); 563 spin_lock_irq(&ent->lock); 564 if (ent->disabled) 565 goto out; 566 if (need_delay) { 567 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); 568 goto out; 569 } 570 remove_cache_mr_locked(ent); 571 queue_adjust_cache_locked(ent); 572 } 573 out: 574 spin_unlock_irq(&ent->lock); 575 } 576 577 static void delayed_cache_work_func(struct work_struct *work) 578 { 579 struct mlx5_cache_ent *ent; 580 581 ent = container_of(work, struct mlx5_cache_ent, dwork.work); 582 __cache_work_func(ent); 583 } 584 585 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, 586 struct mlx5_cache_ent *ent, 587 int access_flags) 588 { 589 struct mlx5_ib_mr *mr; 590 591 /* Matches access in alloc_cache_mr() */ 592 if (!mlx5r_umr_can_reconfig(dev, 0, access_flags)) 593 return ERR_PTR(-EOPNOTSUPP); 594 595 spin_lock_irq(&ent->lock); 596 if (list_empty(&ent->head)) { 597 queue_adjust_cache_locked(ent); 598 ent->miss++; 599 spin_unlock_irq(&ent->lock); 600 mr = create_cache_mr(ent); 601 if (IS_ERR(mr)) 602 return mr; 603 } else { 604 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); 605 list_del(&mr->list); 606 ent->available_mrs--; 607 queue_adjust_cache_locked(ent); 608 spin_unlock_irq(&ent->lock); 609 610 mlx5_clear_mr(mr); 611 } 612 return mr; 613 } 614 615 static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 616 { 617 struct mlx5_cache_ent *ent = mr->cache_ent; 618 619 WRITE_ONCE(dev->cache.last_add, jiffies); 620 spin_lock_irq(&ent->lock); 621 list_add_tail(&mr->list, &ent->head); 622 ent->available_mrs++; 623 queue_adjust_cache_locked(ent); 624 spin_unlock_irq(&ent->lock); 625 } 626 627 static void clean_keys(struct mlx5_ib_dev *dev, int c) 628 { 629 struct mlx5_mr_cache *cache = &dev->cache; 630 struct mlx5_cache_ent *ent = &cache->ent[c]; 631 struct mlx5_ib_mr *tmp_mr; 632 struct mlx5_ib_mr *mr; 633 LIST_HEAD(del_list); 634 635 cancel_delayed_work(&ent->dwork); 636 while (1) { 637 spin_lock_irq(&ent->lock); 638 if (list_empty(&ent->head)) { 639 spin_unlock_irq(&ent->lock); 640 break; 641 } 642 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); 643 list_move(&mr->list, &del_list); 644 ent->available_mrs--; 645 ent->total_mrs--; 646 spin_unlock_irq(&ent->lock); 647 mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key); 648 } 649 650 list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { 651 list_del(&mr->list); 652 kfree(mr); 653 } 654 } 655 656 static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) 657 { 658 if (!mlx5_debugfs_root || dev->is_rep) 659 return; 660 661 debugfs_remove_recursive(dev->cache.root); 662 dev->cache.root = NULL; 663 } 664 665 static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) 666 { 667 struct mlx5_mr_cache *cache = &dev->cache; 668 struct mlx5_cache_ent *ent; 669 struct dentry *dir; 670 int i; 671 672 if (!mlx5_debugfs_root || dev->is_rep) 673 return; 674 675 cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev)); 676 677 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 678 ent = &cache->ent[i]; 679 sprintf(ent->name, "%d", 
ent->order); 680 dir = debugfs_create_dir(ent->name, cache->root); 681 debugfs_create_file("size", 0600, dir, ent, &size_fops); 682 debugfs_create_file("limit", 0600, dir, ent, &limit_fops); 683 debugfs_create_u32("cur", 0400, dir, &ent->available_mrs); 684 debugfs_create_u32("miss", 0600, dir, &ent->miss); 685 } 686 } 687 688 static void delay_time_func(struct timer_list *t) 689 { 690 struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer); 691 692 WRITE_ONCE(dev->fill_delay, 0); 693 } 694 695 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) 696 { 697 struct mlx5_mr_cache *cache = &dev->cache; 698 struct mlx5_cache_ent *ent; 699 int i; 700 701 mutex_init(&dev->slow_path_mutex); 702 cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); 703 if (!cache->wq) { 704 mlx5_ib_warn(dev, "failed to create work queue\n"); 705 return -ENOMEM; 706 } 707 708 mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); 709 timer_setup(&dev->delay_timer, delay_time_func, 0); 710 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 711 ent = &cache->ent[i]; 712 INIT_LIST_HEAD(&ent->head); 713 spin_lock_init(&ent->lock); 714 ent->order = i + 2; 715 ent->dev = dev; 716 ent->limit = 0; 717 718 INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); 719 720 if (i > MR_CACHE_LAST_STD_ENTRY) { 721 mlx5_odp_init_mr_cache_entry(ent); 722 continue; 723 } 724 725 if (ent->order > mr_cache_max_order(dev)) 726 continue; 727 728 ent->page = PAGE_SHIFT; 729 ent->ndescs = 1 << ent->order; 730 ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; 731 if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) && 732 !dev->is_rep && mlx5_core_is_pf(dev->mdev) && 733 mlx5r_umr_can_load_pas(dev, 0)) 734 ent->limit = dev->mdev->profile.mr_cache[i].limit; 735 else 736 ent->limit = 0; 737 spin_lock_irq(&ent->lock); 738 queue_adjust_cache_locked(ent); 739 spin_unlock_irq(&ent->lock); 740 } 741 742 mlx5_mr_cache_debugfs_init(dev); 743 744 return 0; 745 } 746 747 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) 748 { 749 unsigned int i; 750 751 if (!dev->cache.wq) 752 return 0; 753 754 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 755 struct mlx5_cache_ent *ent = &dev->cache.ent[i]; 756 757 spin_lock_irq(&ent->lock); 758 ent->disabled = true; 759 spin_unlock_irq(&ent->lock); 760 cancel_delayed_work_sync(&ent->dwork); 761 } 762 763 mlx5_mr_cache_debugfs_cleanup(dev); 764 mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); 765 766 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) 767 clean_keys(dev, i); 768 769 destroy_workqueue(dev->cache.wq); 770 del_timer_sync(&dev->delay_timer); 771 772 return 0; 773 } 774 775 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) 776 { 777 struct mlx5_ib_dev *dev = to_mdev(pd->device); 778 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 779 struct mlx5_ib_mr *mr; 780 void *mkc; 781 u32 *in; 782 int err; 783 784 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 785 if (!mr) 786 return ERR_PTR(-ENOMEM); 787 788 in = kzalloc(inlen, GFP_KERNEL); 789 if (!in) { 790 err = -ENOMEM; 791 goto err_free; 792 } 793 794 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 795 796 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); 797 MLX5_SET(mkc, mkc, length64, 1); 798 set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0, 799 pd); 800 801 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 802 if (err) 803 goto err_in; 804 805 kfree(in); 806 mr->mmkey.type = MLX5_MKEY_MR; 807 mr->ibmr.lkey = mr->mmkey.key; 808 mr->ibmr.rkey = mr->mmkey.key; 809 mr->umem = NULL; 810 811 return &mr->ibmr; 812 813 
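	/*
	 * Error unwind: mkey creation failed or was never attempted, so only
	 * the command input buffer and the MR structure need to be freed.
	 */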
err_in: 814 kfree(in); 815 816 err_free: 817 kfree(mr); 818 819 return ERR_PTR(err); 820 } 821 822 static int get_octo_len(u64 addr, u64 len, int page_shift) 823 { 824 u64 page_size = 1ULL << page_shift; 825 u64 offset; 826 int npages; 827 828 offset = addr & (page_size - 1); 829 npages = ALIGN(len + offset, page_size) >> page_shift; 830 return (npages + 1) / 2; 831 } 832 833 static int mr_cache_max_order(struct mlx5_ib_dev *dev) 834 { 835 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) 836 return MR_CACHE_LAST_STD_ENTRY + 2; 837 return MLX5_MAX_UMR_SHIFT; 838 } 839 840 static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev, 841 unsigned int order) 842 { 843 struct mlx5_mr_cache *cache = &dev->cache; 844 845 if (order < cache->ent[0].order) 846 return &cache->ent[0]; 847 order = order - cache->ent[0].order; 848 if (order > MR_CACHE_LAST_STD_ENTRY) 849 return NULL; 850 return &cache->ent[order]; 851 } 852 853 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, 854 u64 length, int access_flags, u64 iova) 855 { 856 mr->ibmr.lkey = mr->mmkey.key; 857 mr->ibmr.rkey = mr->mmkey.key; 858 mr->ibmr.length = length; 859 mr->ibmr.device = &dev->ib_dev; 860 mr->ibmr.iova = iova; 861 mr->access_flags = access_flags; 862 } 863 864 static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, 865 u64 iova) 866 { 867 /* 868 * The alignment of iova has already been checked upon entering 869 * UVERBS_METHOD_REG_DMABUF_MR 870 */ 871 umem->iova = iova; 872 return PAGE_SIZE; 873 } 874 875 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, 876 struct ib_umem *umem, u64 iova, 877 int access_flags) 878 { 879 struct mlx5_ib_dev *dev = to_mdev(pd->device); 880 struct mlx5_cache_ent *ent; 881 struct mlx5_ib_mr *mr; 882 unsigned int page_size; 883 884 if (umem->is_dmabuf) 885 page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); 886 else 887 page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 888 0, iova); 889 if (WARN_ON(!page_size)) 890 return ERR_PTR(-EINVAL); 891 ent = mr_cache_ent_from_order( 892 dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size))); 893 /* 894 * Matches access in alloc_cache_mr(). If the MR can't come from the 895 * cache then synchronously create an uncached one. 896 */ 897 if (!ent || ent->limit == 0 || 898 !mlx5r_umr_can_reconfig(dev, 0, access_flags)) { 899 mutex_lock(&dev->slow_path_mutex); 900 mr = reg_create(pd, umem, iova, access_flags, page_size, false); 901 mutex_unlock(&dev->slow_path_mutex); 902 return mr; 903 } 904 905 mr = mlx5_mr_cache_alloc(dev, ent, access_flags); 906 if (IS_ERR(mr)) 907 return mr; 908 909 mr->ibmr.pd = pd; 910 mr->umem = umem; 911 mr->page_shift = order_base_2(page_size); 912 set_mr_fields(dev, mr, umem->length, access_flags, iova); 913 914 return mr; 915 } 916 917 /* 918 * If ibmr is NULL it will be allocated by reg_create. 919 * Else, the given ibmr will be used. 
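 * The populate flag selects between writing the page list (PAS) inline
 * in the CREATE_MKEY command and creating the mkey in the free state so
 * that the translation can be loaded later via UMR (note the
 * "MLX5_SET(mkc, mkc, free, !populate)" below).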
920 */ 921 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, 922 u64 iova, int access_flags, 923 unsigned int page_size, bool populate) 924 { 925 struct mlx5_ib_dev *dev = to_mdev(pd->device); 926 struct mlx5_ib_mr *mr; 927 __be64 *pas; 928 void *mkc; 929 int inlen; 930 u32 *in; 931 int err; 932 bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); 933 934 if (!page_size) 935 return ERR_PTR(-EINVAL); 936 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 937 if (!mr) 938 return ERR_PTR(-ENOMEM); 939 940 mr->ibmr.pd = pd; 941 mr->access_flags = access_flags; 942 mr->page_shift = order_base_2(page_size); 943 944 inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 945 if (populate) 946 inlen += sizeof(*pas) * 947 roundup(ib_umem_num_dma_blocks(umem, page_size), 2); 948 in = kvzalloc(inlen, GFP_KERNEL); 949 if (!in) { 950 err = -ENOMEM; 951 goto err_1; 952 } 953 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); 954 if (populate) { 955 if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) { 956 err = -EINVAL; 957 goto err_2; 958 } 959 mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas, 960 pg_cap ? MLX5_IB_MTT_PRESENT : 0); 961 } 962 963 /* The pg_access bit allows setting the access flags 964 * in the page list submitted with the command. */ 965 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap)); 966 967 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 968 set_mkc_access_pd_addr_fields(mkc, access_flags, iova, 969 populate ? pd : dev->umrc.pd); 970 MLX5_SET(mkc, mkc, free, !populate); 971 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); 972 MLX5_SET(mkc, mkc, umr_en, 1); 973 974 MLX5_SET64(mkc, mkc, len, umem->length); 975 MLX5_SET(mkc, mkc, bsf_octword_size, 0); 976 MLX5_SET(mkc, mkc, translations_octword_size, 977 get_octo_len(iova, umem->length, mr->page_shift)); 978 MLX5_SET(mkc, mkc, log_page_size, mr->page_shift); 979 if (populate) { 980 MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 981 get_octo_len(iova, umem->length, mr->page_shift)); 982 } 983 984 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 985 if (err) { 986 mlx5_ib_warn(dev, "create mkey failed\n"); 987 goto err_2; 988 } 989 mr->mmkey.type = MLX5_MKEY_MR; 990 mr->umem = umem; 991 set_mr_fields(dev, mr, umem->length, access_flags, iova); 992 kvfree(in); 993 994 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); 995 996 return mr; 997 998 err_2: 999 kvfree(in); 1000 err_1: 1001 kfree(mr); 1002 return ERR_PTR(err); 1003 } 1004 1005 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr, 1006 u64 length, int acc, int mode) 1007 { 1008 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1009 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1010 struct mlx5_ib_mr *mr; 1011 void *mkc; 1012 u32 *in; 1013 int err; 1014 1015 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1016 if (!mr) 1017 return ERR_PTR(-ENOMEM); 1018 1019 in = kzalloc(inlen, GFP_KERNEL); 1020 if (!in) { 1021 err = -ENOMEM; 1022 goto err_free; 1023 } 1024 1025 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1026 1027 MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3); 1028 MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7); 1029 MLX5_SET64(mkc, mkc, len, length); 1030 set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd); 1031 1032 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1033 if (err) 1034 goto err_in; 1035 1036 kfree(in); 1037 1038 set_mr_fields(dev, mr, length, acc, start_addr); 1039 1040 return &mr->ibmr; 1041 1042 err_in: 1043 kfree(in); 1044 1045 err_free: 1046 kfree(mr); 
1047 1048 return ERR_PTR(err); 1049 } 1050 1051 int mlx5_ib_advise_mr(struct ib_pd *pd, 1052 enum ib_uverbs_advise_mr_advice advice, 1053 u32 flags, 1054 struct ib_sge *sg_list, 1055 u32 num_sge, 1056 struct uverbs_attr_bundle *attrs) 1057 { 1058 if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH && 1059 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && 1060 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT) 1061 return -EOPNOTSUPP; 1062 1063 return mlx5_ib_advise_mr_prefetch(pd, advice, flags, 1064 sg_list, num_sge); 1065 } 1066 1067 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, 1068 struct ib_dm_mr_attr *attr, 1069 struct uverbs_attr_bundle *attrs) 1070 { 1071 struct mlx5_ib_dm *mdm = to_mdm(dm); 1072 struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev; 1073 u64 start_addr = mdm->dev_addr + attr->offset; 1074 int mode; 1075 1076 switch (mdm->type) { 1077 case MLX5_IB_UAPI_DM_TYPE_MEMIC: 1078 if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS) 1079 return ERR_PTR(-EINVAL); 1080 1081 mode = MLX5_MKC_ACCESS_MODE_MEMIC; 1082 start_addr -= pci_resource_start(dev->pdev, 0); 1083 break; 1084 case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: 1085 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: 1086 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM: 1087 if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS) 1088 return ERR_PTR(-EINVAL); 1089 1090 mode = MLX5_MKC_ACCESS_MODE_SW_ICM; 1091 break; 1092 default: 1093 return ERR_PTR(-EINVAL); 1094 } 1095 1096 return mlx5_ib_get_dm_mr(pd, start_addr, attr->length, 1097 attr->access_flags, mode); 1098 } 1099 1100 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, 1101 u64 iova, int access_flags) 1102 { 1103 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1104 struct mlx5_ib_mr *mr = NULL; 1105 bool xlt_with_umr; 1106 int err; 1107 1108 xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); 1109 if (xlt_with_umr) { 1110 mr = alloc_cacheable_mr(pd, umem, iova, access_flags); 1111 } else { 1112 unsigned int page_size = mlx5_umem_find_best_pgsz( 1113 umem, mkc, log_page_size, 0, iova); 1114 1115 mutex_lock(&dev->slow_path_mutex); 1116 mr = reg_create(pd, umem, iova, access_flags, page_size, true); 1117 mutex_unlock(&dev->slow_path_mutex); 1118 } 1119 if (IS_ERR(mr)) { 1120 ib_umem_release(umem); 1121 return ERR_CAST(mr); 1122 } 1123 1124 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); 1125 1126 atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); 1127 1128 if (xlt_with_umr) { 1129 /* 1130 * If the MR was created with reg_create then it will be 1131 * configured properly but left disabled. It is safe to go ahead 1132 * and configure it again via UMR while enabling it. 
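		 * The same holds for MRs taken from the cache, whose mkeys
		 * are also created free: mlx5r_umr_update_mr_pas() below
		 * loads the PAS list and enables the mkey in one go
		 * (MLX5_IB_UPD_XLT_ENABLE).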
1133 */ 1134 err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE); 1135 if (err) { 1136 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1137 return ERR_PTR(err); 1138 } 1139 } 1140 return &mr->ibmr; 1141 } 1142 1143 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, 1144 u64 iova, int access_flags, 1145 struct ib_udata *udata) 1146 { 1147 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1148 struct ib_umem_odp *odp; 1149 struct mlx5_ib_mr *mr; 1150 int err; 1151 1152 if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) 1153 return ERR_PTR(-EOPNOTSUPP); 1154 1155 err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq); 1156 if (err) 1157 return ERR_PTR(err); 1158 if (!start && length == U64_MAX) { 1159 if (iova != 0) 1160 return ERR_PTR(-EINVAL); 1161 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) 1162 return ERR_PTR(-EINVAL); 1163 1164 mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags); 1165 if (IS_ERR(mr)) 1166 return ERR_CAST(mr); 1167 return &mr->ibmr; 1168 } 1169 1170 /* ODP requires xlt update via umr to work. */ 1171 if (!mlx5r_umr_can_load_pas(dev, length)) 1172 return ERR_PTR(-EINVAL); 1173 1174 odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags, 1175 &mlx5_mn_ops); 1176 if (IS_ERR(odp)) 1177 return ERR_CAST(odp); 1178 1179 mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags); 1180 if (IS_ERR(mr)) { 1181 ib_umem_release(&odp->umem); 1182 return ERR_CAST(mr); 1183 } 1184 xa_init(&mr->implicit_children); 1185 1186 odp->private = mr; 1187 err = mlx5r_store_odp_mkey(dev, &mr->mmkey); 1188 if (err) 1189 goto err_dereg_mr; 1190 1191 err = mlx5_ib_init_odp_mr(mr); 1192 if (err) 1193 goto err_dereg_mr; 1194 return &mr->ibmr; 1195 1196 err_dereg_mr: 1197 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1198 return ERR_PTR(err); 1199 } 1200 1201 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, 1202 u64 iova, int access_flags, 1203 struct ib_udata *udata) 1204 { 1205 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1206 struct ib_umem *umem; 1207 1208 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) 1209 return ERR_PTR(-EOPNOTSUPP); 1210 1211 mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1212 start, iova, length, access_flags); 1213 1214 if (access_flags & IB_ACCESS_ON_DEMAND) 1215 return create_user_odp_mr(pd, start, length, iova, access_flags, 1216 udata); 1217 umem = ib_umem_get(&dev->ib_dev, start, length, access_flags); 1218 if (IS_ERR(umem)) 1219 return ERR_CAST(umem); 1220 return create_real_mr(pd, umem, iova, access_flags); 1221 } 1222 1223 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) 1224 { 1225 struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; 1226 struct mlx5_ib_mr *mr = umem_dmabuf->private; 1227 1228 dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); 1229 1230 if (!umem_dmabuf->sgt) 1231 return; 1232 1233 mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); 1234 ib_umem_dmabuf_unmap_pages(umem_dmabuf); 1235 } 1236 1237 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { 1238 .allow_peer2peer = 1, 1239 .move_notify = mlx5_ib_dmabuf_invalidate_cb, 1240 }; 1241 1242 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, 1243 u64 length, u64 virt_addr, 1244 int fd, int access_flags, 1245 struct ib_udata *udata) 1246 { 1247 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1248 struct mlx5_ib_mr *mr = NULL; 1249 struct ib_umem_dmabuf *umem_dmabuf; 1250 int err; 1251 1252 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || 1253 
!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) 1254 return ERR_PTR(-EOPNOTSUPP); 1255 1256 mlx5_ib_dbg(dev, 1257 "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n", 1258 offset, virt_addr, length, fd, access_flags); 1259 1260 /* dmabuf requires xlt update via umr to work. */ 1261 if (!mlx5r_umr_can_load_pas(dev, length)) 1262 return ERR_PTR(-EINVAL); 1263 1264 umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, 1265 access_flags, 1266 &mlx5_ib_dmabuf_attach_ops); 1267 if (IS_ERR(umem_dmabuf)) { 1268 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", 1269 PTR_ERR(umem_dmabuf)); 1270 return ERR_CAST(umem_dmabuf); 1271 } 1272 1273 mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, 1274 access_flags); 1275 if (IS_ERR(mr)) { 1276 ib_umem_release(&umem_dmabuf->umem); 1277 return ERR_CAST(mr); 1278 } 1279 1280 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); 1281 1282 atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); 1283 umem_dmabuf->private = mr; 1284 err = mlx5r_store_odp_mkey(dev, &mr->mmkey); 1285 if (err) 1286 goto err_dereg_mr; 1287 1288 err = mlx5_ib_init_dmabuf_mr(mr); 1289 if (err) 1290 goto err_dereg_mr; 1291 return &mr->ibmr; 1292 1293 err_dereg_mr: 1294 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1295 return ERR_PTR(err); 1296 } 1297 1298 /* 1299 * True if the change in access flags can be done via UMR, only some access 1300 * flags can be updated. 1301 */ 1302 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev, 1303 unsigned int current_access_flags, 1304 unsigned int target_access_flags) 1305 { 1306 unsigned int diffs = current_access_flags ^ target_access_flags; 1307 1308 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | 1309 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING)) 1310 return false; 1311 return mlx5r_umr_can_reconfig(dev, current_access_flags, 1312 target_access_flags); 1313 } 1314 1315 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, 1316 struct ib_umem *new_umem, 1317 int new_access_flags, u64 iova, 1318 unsigned long *page_size) 1319 { 1320 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1321 1322 /* We only track the allocated sizes of MRs from the cache */ 1323 if (!mr->cache_ent) 1324 return false; 1325 if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) 1326 return false; 1327 1328 *page_size = 1329 mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); 1330 if (WARN_ON(!*page_size)) 1331 return false; 1332 return (1ULL << mr->cache_ent->order) >= 1333 ib_umem_num_dma_blocks(new_umem, *page_size); 1334 } 1335 1336 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd, 1337 int access_flags, int flags, struct ib_umem *new_umem, 1338 u64 iova, unsigned long page_size) 1339 { 1340 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1341 int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE; 1342 struct ib_umem *old_umem = mr->umem; 1343 int err; 1344 1345 /* 1346 * To keep everything simple the MR is revoked before we start to mess 1347 * with it. This ensure the change is atomic relative to any use of the 1348 * MR. 
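	 * Revoking disables the mkey; it is only re-enabled by the
	 * mlx5r_umr_update_mr_pas() call further down (upd_flags includes
	 * MLX5_IB_UPD_XLT_ENABLE), so no user can observe a half-updated
	 * translation.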
1349 */ 1350 err = mlx5r_umr_revoke_mr(mr); 1351 if (err) 1352 return err; 1353 1354 if (flags & IB_MR_REREG_PD) { 1355 mr->ibmr.pd = pd; 1356 upd_flags |= MLX5_IB_UPD_XLT_PD; 1357 } 1358 if (flags & IB_MR_REREG_ACCESS) { 1359 mr->access_flags = access_flags; 1360 upd_flags |= MLX5_IB_UPD_XLT_ACCESS; 1361 } 1362 1363 mr->ibmr.length = new_umem->length; 1364 mr->ibmr.iova = iova; 1365 mr->ibmr.length = new_umem->length; 1366 mr->page_shift = order_base_2(page_size); 1367 mr->umem = new_umem; 1368 err = mlx5r_umr_update_mr_pas(mr, upd_flags); 1369 if (err) { 1370 /* 1371 * The MR is revoked at this point so there is no issue to free 1372 * new_umem. 1373 */ 1374 mr->umem = old_umem; 1375 return err; 1376 } 1377 1378 atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages); 1379 ib_umem_release(old_umem); 1380 atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages); 1381 return 0; 1382 } 1383 1384 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, 1385 u64 length, u64 iova, int new_access_flags, 1386 struct ib_pd *new_pd, 1387 struct ib_udata *udata) 1388 { 1389 struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); 1390 struct mlx5_ib_mr *mr = to_mmr(ib_mr); 1391 int err; 1392 1393 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) 1394 return ERR_PTR(-EOPNOTSUPP); 1395 1396 mlx5_ib_dbg( 1397 dev, 1398 "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1399 start, iova, length, new_access_flags); 1400 1401 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) 1402 return ERR_PTR(-EOPNOTSUPP); 1403 1404 if (!(flags & IB_MR_REREG_ACCESS)) 1405 new_access_flags = mr->access_flags; 1406 if (!(flags & IB_MR_REREG_PD)) 1407 new_pd = ib_mr->pd; 1408 1409 if (!(flags & IB_MR_REREG_TRANS)) { 1410 struct ib_umem *umem; 1411 1412 /* Fast path for PD/access change */ 1413 if (can_use_umr_rereg_access(dev, mr->access_flags, 1414 new_access_flags)) { 1415 err = mlx5r_umr_rereg_pd_access(mr, new_pd, 1416 new_access_flags); 1417 if (err) 1418 return ERR_PTR(err); 1419 return NULL; 1420 } 1421 /* DM or ODP MR's don't have a normal umem so we can't re-use it */ 1422 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) 1423 goto recreate; 1424 1425 /* 1426 * Only one active MR can refer to a umem at one time, revoke 1427 * the old MR before assigning the umem to the new one. 
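		 * After the revoke the umem is detached from this MR and
		 * handed to create_real_mr(), which registers it from
		 * scratch under new_pd with the new access flags.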
1428 */ 1429 err = mlx5r_umr_revoke_mr(mr); 1430 if (err) 1431 return ERR_PTR(err); 1432 umem = mr->umem; 1433 mr->umem = NULL; 1434 atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); 1435 1436 return create_real_mr(new_pd, umem, mr->ibmr.iova, 1437 new_access_flags); 1438 } 1439 1440 /* 1441 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does 1442 * but the logic around releasing the umem is different 1443 */ 1444 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) 1445 goto recreate; 1446 1447 if (!(new_access_flags & IB_ACCESS_ON_DEMAND) && 1448 can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) { 1449 struct ib_umem *new_umem; 1450 unsigned long page_size; 1451 1452 new_umem = ib_umem_get(&dev->ib_dev, start, length, 1453 new_access_flags); 1454 if (IS_ERR(new_umem)) 1455 return ERR_CAST(new_umem); 1456 1457 /* Fast path for PAS change */ 1458 if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova, 1459 &page_size)) { 1460 err = umr_rereg_pas(mr, new_pd, new_access_flags, flags, 1461 new_umem, iova, page_size); 1462 if (err) { 1463 ib_umem_release(new_umem); 1464 return ERR_PTR(err); 1465 } 1466 return NULL; 1467 } 1468 return create_real_mr(new_pd, new_umem, iova, new_access_flags); 1469 } 1470 1471 /* 1472 * Everything else has no state we can preserve, just create a new MR 1473 * from scratch 1474 */ 1475 recreate: 1476 return mlx5_ib_reg_user_mr(new_pd, start, length, iova, 1477 new_access_flags, udata); 1478 } 1479 1480 static int 1481 mlx5_alloc_priv_descs(struct ib_device *device, 1482 struct mlx5_ib_mr *mr, 1483 int ndescs, 1484 int desc_size) 1485 { 1486 struct mlx5_ib_dev *dev = to_mdev(device); 1487 struct device *ddev = &dev->mdev->pdev->dev; 1488 int size = ndescs * desc_size; 1489 int add_size; 1490 int ret; 1491 1492 add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0); 1493 1494 mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL); 1495 if (!mr->descs_alloc) 1496 return -ENOMEM; 1497 1498 mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN); 1499 1500 mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE); 1501 if (dma_mapping_error(ddev, mr->desc_map)) { 1502 ret = -ENOMEM; 1503 goto err; 1504 } 1505 1506 return 0; 1507 err: 1508 kfree(mr->descs_alloc); 1509 1510 return ret; 1511 } 1512 1513 static void 1514 mlx5_free_priv_descs(struct mlx5_ib_mr *mr) 1515 { 1516 if (!mr->umem && mr->descs) { 1517 struct ib_device *device = mr->ibmr.device; 1518 int size = mr->max_descs * mr->desc_size; 1519 struct mlx5_ib_dev *dev = to_mdev(device); 1520 1521 dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size, 1522 DMA_TO_DEVICE); 1523 kfree(mr->descs_alloc); 1524 mr->descs = NULL; 1525 } 1526 } 1527 1528 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) 1529 { 1530 struct mlx5_ib_mr *mr = to_mmr(ibmr); 1531 struct mlx5_ib_dev *dev = to_mdev(ibmr->device); 1532 int rc; 1533 1534 /* 1535 * Any async use of the mr must hold the refcount, once the refcount 1536 * goes to zero no other thread, such as ODP page faults, prefetch, any 1537 * UMR activity, etc can touch the mkey. Thus it is safe to destroy it. 
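	 * The teardown below follows that order: wait for ODP users to drop
	 * their references, release integrity (signature) resources, revoke
	 * or destroy the mkey to stop DMA, release the umem, and finally
	 * return the MR to the cache or free it.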
1538 */ 1539 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && 1540 refcount_read(&mr->mmkey.usecount) != 0 && 1541 xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))) 1542 mlx5r_deref_wait_odp_mkey(&mr->mmkey); 1543 1544 if (ibmr->type == IB_MR_TYPE_INTEGRITY) { 1545 xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), 1546 mr->sig, NULL, GFP_KERNEL); 1547 1548 if (mr->mtt_mr) { 1549 rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL); 1550 if (rc) 1551 return rc; 1552 mr->mtt_mr = NULL; 1553 } 1554 if (mr->klm_mr) { 1555 rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL); 1556 if (rc) 1557 return rc; 1558 mr->klm_mr = NULL; 1559 } 1560 1561 if (mlx5_core_destroy_psv(dev->mdev, 1562 mr->sig->psv_memory.psv_idx)) 1563 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", 1564 mr->sig->psv_memory.psv_idx); 1565 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) 1566 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", 1567 mr->sig->psv_wire.psv_idx); 1568 kfree(mr->sig); 1569 mr->sig = NULL; 1570 } 1571 1572 /* Stop DMA */ 1573 if (mr->cache_ent) { 1574 if (mlx5r_umr_revoke_mr(mr)) { 1575 spin_lock_irq(&mr->cache_ent->lock); 1576 mr->cache_ent->total_mrs--; 1577 spin_unlock_irq(&mr->cache_ent->lock); 1578 mr->cache_ent = NULL; 1579 } 1580 } 1581 if (!mr->cache_ent) { 1582 rc = destroy_mkey(to_mdev(mr->ibmr.device), mr); 1583 if (rc) 1584 return rc; 1585 } 1586 1587 if (mr->umem) { 1588 bool is_odp = is_odp_mr(mr); 1589 1590 if (!is_odp) 1591 atomic_sub(ib_umem_num_pages(mr->umem), 1592 &dev->mdev->priv.reg_pages); 1593 ib_umem_release(mr->umem); 1594 if (is_odp) 1595 mlx5_ib_free_odp_mr(mr); 1596 } 1597 1598 if (mr->cache_ent) { 1599 mlx5_mr_cache_free(dev, mr); 1600 } else { 1601 mlx5_free_priv_descs(mr); 1602 kfree(mr); 1603 } 1604 return 0; 1605 } 1606 1607 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, 1608 int access_mode, int page_shift) 1609 { 1610 void *mkc; 1611 1612 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1613 1614 /* This is only used from the kernel, so setting the PD is OK. 
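	 * IB_ACCESS_RELAXED_ORDERING is only a request here;
	 * set_mkc_access_pd_addr_fields() applies it only when the PCI
	 * device and the relaxed_ordering_{read,write} capabilities allow
	 * it.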
*/ 1615 set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd); 1616 MLX5_SET(mkc, mkc, free, 1); 1617 MLX5_SET(mkc, mkc, translations_octword_size, ndescs); 1618 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); 1619 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); 1620 MLX5_SET(mkc, mkc, umr_en, 1); 1621 MLX5_SET(mkc, mkc, log_page_size, page_shift); 1622 } 1623 1624 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 1625 int ndescs, int desc_size, int page_shift, 1626 int access_mode, u32 *in, int inlen) 1627 { 1628 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1629 int err; 1630 1631 mr->access_mode = access_mode; 1632 mr->desc_size = desc_size; 1633 mr->max_descs = ndescs; 1634 1635 err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size); 1636 if (err) 1637 return err; 1638 1639 mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift); 1640 1641 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1642 if (err) 1643 goto err_free_descs; 1644 1645 mr->mmkey.type = MLX5_MKEY_MR; 1646 mr->ibmr.lkey = mr->mmkey.key; 1647 mr->ibmr.rkey = mr->mmkey.key; 1648 1649 return 0; 1650 1651 err_free_descs: 1652 mlx5_free_priv_descs(mr); 1653 return err; 1654 } 1655 1656 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd, 1657 u32 max_num_sg, u32 max_num_meta_sg, 1658 int desc_size, int access_mode) 1659 { 1660 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1661 int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4); 1662 int page_shift = 0; 1663 struct mlx5_ib_mr *mr; 1664 u32 *in; 1665 int err; 1666 1667 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1668 if (!mr) 1669 return ERR_PTR(-ENOMEM); 1670 1671 mr->ibmr.pd = pd; 1672 mr->ibmr.device = pd->device; 1673 1674 in = kzalloc(inlen, GFP_KERNEL); 1675 if (!in) { 1676 err = -ENOMEM; 1677 goto err_free; 1678 } 1679 1680 if (access_mode == MLX5_MKC_ACCESS_MODE_MTT) 1681 page_shift = PAGE_SHIFT; 1682 1683 err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift, 1684 access_mode, in, inlen); 1685 if (err) 1686 goto err_free_in; 1687 1688 mr->umem = NULL; 1689 kfree(in); 1690 1691 return mr; 1692 1693 err_free_in: 1694 kfree(in); 1695 err_free: 1696 kfree(mr); 1697 return ERR_PTR(err); 1698 } 1699 1700 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 1701 int ndescs, u32 *in, int inlen) 1702 { 1703 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt), 1704 PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in, 1705 inlen); 1706 } 1707 1708 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 1709 int ndescs, u32 *in, int inlen) 1710 { 1711 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm), 1712 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); 1713 } 1714 1715 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 1716 int max_num_sg, int max_num_meta_sg, 1717 u32 *in, int inlen) 1718 { 1719 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1720 u32 psv_index[2]; 1721 void *mkc; 1722 int err; 1723 1724 mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); 1725 if (!mr->sig) 1726 return -ENOMEM; 1727 1728 /* create mem & wire PSVs */ 1729 err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index); 1730 if (err) 1731 goto err_free_sig; 1732 1733 mr->sig->psv_memory.psv_idx = psv_index[0]; 1734 mr->sig->psv_wire.psv_idx = psv_index[1]; 1735 1736 mr->sig->sig_status_checked = true; 1737 mr->sig->sig_err_exists = false; 1738 /* Next UMR, Arm SIGERR */ 1739 
++mr->sig->sigerr_count; 1740 mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, 1741 sizeof(struct mlx5_klm), 1742 MLX5_MKC_ACCESS_MODE_KLMS); 1743 if (IS_ERR(mr->klm_mr)) { 1744 err = PTR_ERR(mr->klm_mr); 1745 goto err_destroy_psv; 1746 } 1747 mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, 1748 sizeof(struct mlx5_mtt), 1749 MLX5_MKC_ACCESS_MODE_MTT); 1750 if (IS_ERR(mr->mtt_mr)) { 1751 err = PTR_ERR(mr->mtt_mr); 1752 goto err_free_klm_mr; 1753 } 1754 1755 /* Set bsf descriptors for mkey */ 1756 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1757 MLX5_SET(mkc, mkc, bsf_en, 1); 1758 MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); 1759 1760 err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0, 1761 MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); 1762 if (err) 1763 goto err_free_mtt_mr; 1764 1765 err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), 1766 mr->sig, GFP_KERNEL)); 1767 if (err) 1768 goto err_free_descs; 1769 return 0; 1770 1771 err_free_descs: 1772 destroy_mkey(dev, mr); 1773 mlx5_free_priv_descs(mr); 1774 err_free_mtt_mr: 1775 mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL); 1776 mr->mtt_mr = NULL; 1777 err_free_klm_mr: 1778 mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL); 1779 mr->klm_mr = NULL; 1780 err_destroy_psv: 1781 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx)) 1782 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", 1783 mr->sig->psv_memory.psv_idx); 1784 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) 1785 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", 1786 mr->sig->psv_wire.psv_idx); 1787 err_free_sig: 1788 kfree(mr->sig); 1789 1790 return err; 1791 } 1792 1793 static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, 1794 enum ib_mr_type mr_type, u32 max_num_sg, 1795 u32 max_num_meta_sg) 1796 { 1797 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1798 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1799 int ndescs = ALIGN(max_num_sg, 4); 1800 struct mlx5_ib_mr *mr; 1801 u32 *in; 1802 int err; 1803 1804 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1805 if (!mr) 1806 return ERR_PTR(-ENOMEM); 1807 1808 in = kzalloc(inlen, GFP_KERNEL); 1809 if (!in) { 1810 err = -ENOMEM; 1811 goto err_free; 1812 } 1813 1814 mr->ibmr.device = pd->device; 1815 mr->umem = NULL; 1816 1817 switch (mr_type) { 1818 case IB_MR_TYPE_MEM_REG: 1819 err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen); 1820 break; 1821 case IB_MR_TYPE_SG_GAPS: 1822 err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen); 1823 break; 1824 case IB_MR_TYPE_INTEGRITY: 1825 err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg, 1826 max_num_meta_sg, in, inlen); 1827 break; 1828 default: 1829 mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type); 1830 err = -EINVAL; 1831 } 1832 1833 if (err) 1834 goto err_free_in; 1835 1836 kfree(in); 1837 1838 return &mr->ibmr; 1839 1840 err_free_in: 1841 kfree(in); 1842 err_free: 1843 kfree(mr); 1844 return ERR_PTR(err); 1845 } 1846 1847 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, 1848 u32 max_num_sg) 1849 { 1850 return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0); 1851 } 1852 1853 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd, 1854 u32 max_num_sg, u32 max_num_meta_sg) 1855 { 1856 return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg, 1857 max_num_meta_sg); 1858 } 1859 1860 int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) 1861 { 1862 struct mlx5_ib_dev *dev = to_mdev(ibmw->device); 1863 int inlen = 
MLX5_ST_SZ_BYTES(create_mkey_in); 1864 struct mlx5_ib_mw *mw = to_mmw(ibmw); 1865 unsigned int ndescs; 1866 u32 *in = NULL; 1867 void *mkc; 1868 int err; 1869 struct mlx5_ib_alloc_mw req = {}; 1870 struct { 1871 __u32 comp_mask; 1872 __u32 response_length; 1873 } resp = {}; 1874 1875 err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); 1876 if (err) 1877 return err; 1878 1879 if (req.comp_mask || req.reserved1 || req.reserved2) 1880 return -EOPNOTSUPP; 1881 1882 if (udata->inlen > sizeof(req) && 1883 !ib_is_udata_cleared(udata, sizeof(req), 1884 udata->inlen - sizeof(req))) 1885 return -EOPNOTSUPP; 1886 1887 ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4); 1888 1889 in = kzalloc(inlen, GFP_KERNEL); 1890 if (!in) { 1891 err = -ENOMEM; 1892 goto free; 1893 } 1894 1895 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1896 1897 MLX5_SET(mkc, mkc, free, 1); 1898 MLX5_SET(mkc, mkc, translations_octword_size, ndescs); 1899 MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn); 1900 MLX5_SET(mkc, mkc, umr_en, 1); 1901 MLX5_SET(mkc, mkc, lr, 1); 1902 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS); 1903 MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2))); 1904 MLX5_SET(mkc, mkc, qpn, 0xffffff); 1905 1906 err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen); 1907 if (err) 1908 goto free; 1909 1910 mw->mmkey.type = MLX5_MKEY_MW; 1911 ibmw->rkey = mw->mmkey.key; 1912 mw->mmkey.ndescs = ndescs; 1913 1914 resp.response_length = 1915 min(offsetofend(typeof(resp), response_length), udata->outlen); 1916 if (resp.response_length) { 1917 err = ib_copy_to_udata(udata, &resp, resp.response_length); 1918 if (err) 1919 goto free_mkey; 1920 } 1921 1922 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { 1923 err = mlx5r_store_odp_mkey(dev, &mw->mmkey); 1924 if (err) 1925 goto free_mkey; 1926 } 1927 1928 kfree(in); 1929 return 0; 1930 1931 free_mkey: 1932 mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key); 1933 free: 1934 kfree(in); 1935 return err; 1936 } 1937 1938 int mlx5_ib_dealloc_mw(struct ib_mw *mw) 1939 { 1940 struct mlx5_ib_dev *dev = to_mdev(mw->device); 1941 struct mlx5_ib_mw *mmw = to_mmw(mw); 1942 1943 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && 1944 xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key))) 1945 /* 1946 * pagefault_single_data_segment() may be accessing mmw 1947 * if the user bound an ODP MR to this MW. 
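		 * mlx5r_deref_wait_odp_mkey() below waits for any such
		 * reference to be dropped before the mkey is destroyed.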
1948 */ 1949 mlx5r_deref_wait_odp_mkey(&mmw->mmkey); 1950 1951 return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key); 1952 } 1953 1954 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, 1955 struct ib_mr_status *mr_status) 1956 { 1957 struct mlx5_ib_mr *mmr = to_mmr(ibmr); 1958 int ret = 0; 1959 1960 if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { 1961 pr_err("Invalid status check mask\n"); 1962 ret = -EINVAL; 1963 goto done; 1964 } 1965 1966 mr_status->fail_status = 0; 1967 if (check_mask & IB_MR_CHECK_SIG_STATUS) { 1968 if (!mmr->sig) { 1969 ret = -EINVAL; 1970 pr_err("signature status check requested on a non-signature enabled MR\n"); 1971 goto done; 1972 } 1973 1974 mmr->sig->sig_status_checked = true; 1975 if (!mmr->sig->sig_err_exists) 1976 goto done; 1977 1978 if (ibmr->lkey == mmr->sig->err_item.key) 1979 memcpy(&mr_status->sig_err, &mmr->sig->err_item, 1980 sizeof(mr_status->sig_err)); 1981 else { 1982 mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; 1983 mr_status->sig_err.sig_err_offset = 0; 1984 mr_status->sig_err.key = mmr->sig->err_item.key; 1985 } 1986 1987 mmr->sig->sig_err_exists = false; 1988 mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; 1989 } 1990 1991 done: 1992 return ret; 1993 } 1994 1995 static int 1996 mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, 1997 int data_sg_nents, unsigned int *data_sg_offset, 1998 struct scatterlist *meta_sg, int meta_sg_nents, 1999 unsigned int *meta_sg_offset) 2000 { 2001 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2002 unsigned int sg_offset = 0; 2003 int n = 0; 2004 2005 mr->meta_length = 0; 2006 if (data_sg_nents == 1) { 2007 n++; 2008 mr->mmkey.ndescs = 1; 2009 if (data_sg_offset) 2010 sg_offset = *data_sg_offset; 2011 mr->data_length = sg_dma_len(data_sg) - sg_offset; 2012 mr->data_iova = sg_dma_address(data_sg) + sg_offset; 2013 if (meta_sg_nents == 1) { 2014 n++; 2015 mr->meta_ndescs = 1; 2016 if (meta_sg_offset) 2017 sg_offset = *meta_sg_offset; 2018 else 2019 sg_offset = 0; 2020 mr->meta_length = sg_dma_len(meta_sg) - sg_offset; 2021 mr->pi_iova = sg_dma_address(meta_sg) + sg_offset; 2022 } 2023 ibmr->length = mr->data_length + mr->meta_length; 2024 } 2025 2026 return n; 2027 } 2028 2029 static int 2030 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, 2031 struct scatterlist *sgl, 2032 unsigned short sg_nents, 2033 unsigned int *sg_offset_p, 2034 struct scatterlist *meta_sgl, 2035 unsigned short meta_sg_nents, 2036 unsigned int *meta_sg_offset_p) 2037 { 2038 struct scatterlist *sg = sgl; 2039 struct mlx5_klm *klms = mr->descs; 2040 unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; 2041 u32 lkey = mr->ibmr.pd->local_dma_lkey; 2042 int i, j = 0; 2043 2044 mr->ibmr.iova = sg_dma_address(sg) + sg_offset; 2045 mr->ibmr.length = 0; 2046 2047 for_each_sg(sgl, sg, sg_nents, i) { 2048 if (unlikely(i >= mr->max_descs)) 2049 break; 2050 klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset); 2051 klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset); 2052 klms[i].key = cpu_to_be32(lkey); 2053 mr->ibmr.length += sg_dma_len(sg) - sg_offset; 2054 2055 sg_offset = 0; 2056 } 2057 2058 if (sg_offset_p) 2059 *sg_offset_p = sg_offset; 2060 2061 mr->mmkey.ndescs = i; 2062 mr->data_length = mr->ibmr.length; 2063 2064 if (meta_sg_nents) { 2065 sg = meta_sgl; 2066 sg_offset = meta_sg_offset_p ? 

static int
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
                        int data_sg_nents, unsigned int *data_sg_offset,
                        struct scatterlist *meta_sg, int meta_sg_nents,
                        unsigned int *meta_sg_offset)
{
        struct mlx5_ib_mr *mr = to_mmr(ibmr);
        unsigned int sg_offset = 0;
        int n = 0;

        mr->meta_length = 0;
        if (data_sg_nents == 1) {
                n++;
                mr->mmkey.ndescs = 1;
                if (data_sg_offset)
                        sg_offset = *data_sg_offset;
                mr->data_length = sg_dma_len(data_sg) - sg_offset;
                mr->data_iova = sg_dma_address(data_sg) + sg_offset;
                if (meta_sg_nents == 1) {
                        n++;
                        mr->meta_ndescs = 1;
                        if (meta_sg_offset)
                                sg_offset = *meta_sg_offset;
                        else
                                sg_offset = 0;
                        mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
                        mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
                }
                ibmr->length = mr->data_length + mr->meta_length;
        }

        return n;
}

static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
                   struct scatterlist *sgl,
                   unsigned short sg_nents,
                   unsigned int *sg_offset_p,
                   struct scatterlist *meta_sgl,
                   unsigned short meta_sg_nents,
                   unsigned int *meta_sg_offset_p)
{
        struct scatterlist *sg = sgl;
        struct mlx5_klm *klms = mr->descs;
        unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
        u32 lkey = mr->ibmr.pd->local_dma_lkey;
        int i, j = 0;

        mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
        mr->ibmr.length = 0;

        for_each_sg(sgl, sg, sg_nents, i) {
                if (unlikely(i >= mr->max_descs))
                        break;
                klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
                klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
                klms[i].key = cpu_to_be32(lkey);
                mr->ibmr.length += sg_dma_len(sg) - sg_offset;

                sg_offset = 0;
        }

        if (sg_offset_p)
                *sg_offset_p = sg_offset;

        mr->mmkey.ndescs = i;
        mr->data_length = mr->ibmr.length;

        if (meta_sg_nents) {
                sg = meta_sgl;
                sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
                for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
                        if (unlikely(i + j >= mr->max_descs))
                                break;
                        klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
                                                     sg_offset);
                        klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
                                                         sg_offset);
                        klms[i + j].key = cpu_to_be32(lkey);
                        mr->ibmr.length += sg_dma_len(sg) - sg_offset;

                        sg_offset = 0;
                }
                if (meta_sg_offset_p)
                        *meta_sg_offset_p = sg_offset;

                mr->meta_ndescs = j;
                mr->meta_length = mr->ibmr.length - mr->data_length;
        }

        return i + j;
}

static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
        struct mlx5_ib_mr *mr = to_mmr(ibmr);
        __be64 *descs;

        if (unlikely(mr->mmkey.ndescs == mr->max_descs))
                return -ENOMEM;

        descs = mr->descs;
        descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

        return 0;
}

static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
        struct mlx5_ib_mr *mr = to_mmr(ibmr);
        __be64 *descs;

        if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
                return -ENOMEM;

        descs = mr->descs;
        descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
                cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

        return 0;
}
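
/*
 * Worked example (hypothetical values) of the translation built by
 * mlx5_ib_sg_to_klms() above: for a DMA-mapped data sg list of two entries,
 * {dma 0x10000, len 0x2000} and {dma 0x30000, len 0x1000}, with an initial
 * *sg_offset_p of 0x100 and no metadata, the function produces
 *
 *	klms[0] = { .va = 0x10100, .bcount = 0x1f00, .key = local_dma_lkey }
 *	klms[1] = { .va = 0x30000, .bcount = 0x1000, .key = local_dma_lkey }
 *
 * with mr->ibmr.iova = 0x10100, mr->ibmr.length = 0x2f00 and
 * mr->mmkey.ndescs = 2; the offset is applied only to the first entry and is
 * reset to zero for the rest of the list.
 */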

static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
                         int data_sg_nents, unsigned int *data_sg_offset,
                         struct scatterlist *meta_sg, int meta_sg_nents,
                         unsigned int *meta_sg_offset)
{
        struct mlx5_ib_mr *mr = to_mmr(ibmr);
        struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
        int n;

        pi_mr->mmkey.ndescs = 0;
        pi_mr->meta_ndescs = 0;
        pi_mr->meta_length = 0;

        ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
                                   pi_mr->desc_size * pi_mr->max_descs,
                                   DMA_TO_DEVICE);

        pi_mr->ibmr.page_size = ibmr->page_size;
        n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
                           mlx5_set_page);
        if (n != data_sg_nents)
                return n;

        pi_mr->data_iova = pi_mr->ibmr.iova;
        pi_mr->data_length = pi_mr->ibmr.length;
        pi_mr->ibmr.length = pi_mr->data_length;
        ibmr->length = pi_mr->data_length;

        if (meta_sg_nents) {
                u64 page_mask = ~((u64)ibmr->page_size - 1);
                u64 iova = pi_mr->data_iova;

                n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
                                    meta_sg_offset, mlx5_set_page_pi);

                pi_mr->meta_length = pi_mr->ibmr.length;
                /*
                 * The PI address for the HW is the offset of the metadata
                 * address relative to the first data page address. It equals
                 * the first data page address + the size of the data pages +
                 * the metadata offset within the first metadata page.
                 */
                pi_mr->pi_iova = (iova & page_mask) +
                                 pi_mr->mmkey.ndescs * ibmr->page_size +
                                 (pi_mr->ibmr.iova & ~page_mask);
                /*
                 * In order to use one MTT MR for data and metadata, we also
                 * register the gaps between the end of the data and the start
                 * of the metadata (the sig MR will verify that the HW accesses
                 * the right addresses). This mapping is safe because we use an
                 * internal mkey for the registration.
                 */
                pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
                pi_mr->ibmr.iova = iova;
                ibmr->length += pi_mr->meta_length;
        }

        ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
                                      pi_mr->desc_size * pi_mr->max_descs,
                                      DMA_TO_DEVICE);

        return n;
}
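
/*
 * Worked example (hypothetical values) of the pi_iova computation in
 * mlx5_ib_map_mtt_mr_sg_pi() above, assuming a 4K page_size
 * (page_mask = ~0xfffULL): with data_iova = 0x10200 and data_length = 0x2e00
 * the data covers three MTT pages (mmkey.ndescs = 3), and with the metadata
 * starting at 0x55080 (ibmr.iova after the second ib_sg_to_pages() call),
 *
 *	pi_iova = (0x10200 & ~0xfff) + 3 * 0x1000 + (0x55080 & 0xfff)
 *		= 0x10000 + 0x3000 + 0x80 = 0x13080
 *
 * and, for meta_length = 0x200, the registered length becomes
 * 0x13080 + 0x200 - 0x10200 = 0x3080, i.e. the data, the in-page gap before
 * the metadata, and the metadata itself are all covered by the one MTT MR.
 */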

static int
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
                         int data_sg_nents, unsigned int *data_sg_offset,
                         struct scatterlist *meta_sg, int meta_sg_nents,
                         unsigned int *meta_sg_offset)
{
        struct mlx5_ib_mr *mr = to_mmr(ibmr);
        struct mlx5_ib_mr *pi_mr = mr->klm_mr;
        int n;

        pi_mr->mmkey.ndescs = 0;
        pi_mr->meta_ndescs = 0;
        pi_mr->meta_length = 0;

        ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
                                   pi_mr->desc_size * pi_mr->max_descs,
                                   DMA_TO_DEVICE);

        n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
                               meta_sg, meta_sg_nents, meta_sg_offset);

        ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
                                      pi_mr->desc_size * pi_mr->max_descs,
                                      DMA_TO_DEVICE);

        /* This is a zero-based memory region */
        pi_mr->data_iova = 0;
        pi_mr->ibmr.iova = 0;
        pi_mr->pi_iova = pi_mr->data_length;
        ibmr->length = pi_mr->ibmr.length;

        return n;
}

int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
                         int data_sg_nents, unsigned int *data_sg_offset,
                         struct scatterlist *meta_sg, int meta_sg_nents,
                         unsigned int *meta_sg_offset)
{
        struct mlx5_ib_mr *mr = to_mmr(ibmr);
        struct mlx5_ib_mr *pi_mr = NULL;
        int n;

        WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);

        mr->mmkey.ndescs = 0;
        mr->data_length = 0;
        mr->data_iova = 0;
        mr->meta_ndescs = 0;
        mr->pi_iova = 0;
        /*
         * As a performance optimization, avoid the UMR operation for
         * registering the data/metadata buffers when possible: first try to
         * map the sg lists to PA descriptors with local_dma_lkey, and fall
         * back to UMR only if that fails.
         */
        n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
                                    data_sg_offset, meta_sg, meta_sg_nents,
                                    meta_sg_offset);
        if (n == data_sg_nents + meta_sg_nents)
                goto out;
        /*
         * As a performance optimization, avoid mapping the sg lists to KLM
         * descriptors when possible: first try to map them to MTT descriptors
         * and fall back to KLM only if that fails. The HW works more
         * efficiently with MTT descriptors (especially under high load), so
         * use KLM (indirect access) only when it is mandatory.
         */
        pi_mr = mr->mtt_mr;
        n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
                                     data_sg_offset, meta_sg, meta_sg_nents,
                                     meta_sg_offset);
        if (n == data_sg_nents + meta_sg_nents)
                goto out;

        pi_mr = mr->klm_mr;
        n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
                                     data_sg_offset, meta_sg, meta_sg_nents,
                                     meta_sg_offset);
        if (unlikely(n != data_sg_nents + meta_sg_nents))
                return -ENOMEM;

out:
        /* This is a zero-based memory region */
        ibmr->iova = 0;
        mr->pi_mr = pi_mr;
        if (pi_mr)
                ibmr->sig_attrs->meta_length = pi_mr->meta_length;
        else
                ibmr->sig_attrs->meta_length = mr->meta_length;

        return 0;
}

int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
                      unsigned int *sg_offset)
{
        struct mlx5_ib_mr *mr = to_mmr(ibmr);
        int n;

        mr->mmkey.ndescs = 0;

        ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
                                   mr->desc_size * mr->max_descs,
                                   DMA_TO_DEVICE);

        if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
                n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
                                       NULL);
        else
                n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
                                   mlx5_set_page);

        ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
                                      mr->desc_size * mr->max_descs,
                                      DMA_TO_DEVICE);

        return n;
}
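
/*
 * Illustrative sketch (not part of this driver): mlx5_ib_map_mr_sg() above is
 * reached through the ->map_mr_sg device operation when a ULP builds a fast
 * registration. Assuming an MR from ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, ...)
 * and an already DMA-mapped scatterlist (function name hypothetical), the
 * core-verbs call looks roughly like:
 *
 *	static int example_fast_reg_map(struct ib_mr *mr,
 *					struct scatterlist *sg, int nents)
 *	{
 *		int mapped;
 *
 *		// Build the page/KLM list using PAGE_SIZE sized blocks.
 *		mapped = ib_map_mr_sg(mr, sg, nents, NULL, PAGE_SIZE);
 *		if (mapped != nents)
 *			return mapped < 0 ? mapped : -EINVAL;
 *
 *		// The mapping is then made live by posting an IB_WR_REG_MR
 *		// work request carrying mr->lkey, mr->iova and mr->length.
 *		return 0;
 *	}
 */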