1 /* 2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. 3 * Copyright (c) 2020, Intel Corporation. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 35 #include <linux/kref.h> 36 #include <linux/random.h> 37 #include <linux/debugfs.h> 38 #include <linux/export.h> 39 #include <linux/delay.h> 40 #include <linux/dma-buf.h> 41 #include <linux/dma-resv.h> 42 #include <rdma/ib_umem_odp.h> 43 #include "dm.h" 44 #include "mlx5_ib.h" 45 #include "umr.h" 46 #include "data_direct.h" 47 48 enum { 49 MAX_PENDING_REG_MR = 8, 50 }; 51 52 #define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4 53 #define MLX5_UMR_ALIGN 2048 54 55 static void 56 create_mkey_callback(int status, struct mlx5_async_work *context); 57 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, 58 u64 iova, int access_flags, 59 unsigned long page_size, bool populate, 60 int access_mode); 61 static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr); 62 63 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, 64 struct ib_pd *pd) 65 { 66 struct mlx5_ib_dev *dev = to_mdev(pd->device); 67 68 MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC)); 69 MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE)); 70 MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ)); 71 MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE)); 72 MLX5_SET(mkc, mkc, lr, 1); 73 74 if (acc & IB_ACCESS_RELAXED_ORDERING) { 75 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write)) 76 MLX5_SET(mkc, mkc, relaxed_ordering_write, 1); 77 78 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) || 79 (MLX5_CAP_GEN(dev->mdev, 80 relaxed_ordering_read_pci_enabled) && 81 pcie_relaxed_ordering_enabled(dev->mdev->pdev))) 82 MLX5_SET(mkc, mkc, relaxed_ordering_read, 1); 83 } 84 85 MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); 86 MLX5_SET(mkc, mkc, qpn, 0xffffff); 87 MLX5_SET64(mkc, mkc, start_addr, start_addr); 88 } 89 90 static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in) 91 { 92 u8 key = atomic_inc_return(&dev->mkey_var); 93 void *mkc; 94 95 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 96 MLX5_SET(mkc, mkc, mkey_7_0, key); 97 *mkey = key; 98 } 99 100 static int 
mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, 101 struct mlx5_ib_mkey *mkey, u32 *in, int inlen) 102 { 103 int ret; 104 105 assign_mkey_variant(dev, &mkey->key, in); 106 ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen); 107 if (!ret) 108 init_waitqueue_head(&mkey->wait); 109 110 return ret; 111 } 112 113 static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create) 114 { 115 struct mlx5_ib_dev *dev = async_create->ent->dev; 116 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 117 size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out); 118 119 MLX5_SET(create_mkey_in, async_create->in, opcode, 120 MLX5_CMD_OP_CREATE_MKEY); 121 assign_mkey_variant(dev, &async_create->mkey, async_create->in); 122 return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen, 123 async_create->out, outlen, create_mkey_callback, 124 &async_create->cb_work); 125 } 126 127 static int mkey_cache_max_order(struct mlx5_ib_dev *dev); 128 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent); 129 130 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 131 { 132 WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))); 133 134 return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key); 135 } 136 137 static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out) 138 { 139 if (status == -ENXIO) /* core driver is not available */ 140 return; 141 142 mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); 143 if (status != -EREMOTEIO) /* driver specific failure */ 144 return; 145 146 /* Failed in FW, print cmd out failure details */ 147 mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out); 148 } 149 150 static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey) 151 { 152 unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE; 153 struct mlx5_mkeys_page *page; 154 155 lockdep_assert_held(&ent->mkeys_queue.lock); 156 if (ent->mkeys_queue.ci >= 157 ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) { 158 page = kzalloc(sizeof(*page), GFP_ATOMIC); 159 if (!page) 160 return -ENOMEM; 161 ent->mkeys_queue.num_pages++; 162 list_add_tail(&page->list, &ent->mkeys_queue.pages_list); 163 } else { 164 page = list_last_entry(&ent->mkeys_queue.pages_list, 165 struct mlx5_mkeys_page, list); 166 } 167 168 page->mkeys[tmp] = mkey; 169 ent->mkeys_queue.ci++; 170 return 0; 171 } 172 173 static int pop_mkey_locked(struct mlx5_cache_ent *ent) 174 { 175 unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE; 176 struct mlx5_mkeys_page *last_page; 177 u32 mkey; 178 179 lockdep_assert_held(&ent->mkeys_queue.lock); 180 last_page = list_last_entry(&ent->mkeys_queue.pages_list, 181 struct mlx5_mkeys_page, list); 182 mkey = last_page->mkeys[tmp]; 183 last_page->mkeys[tmp] = 0; 184 ent->mkeys_queue.ci--; 185 if (ent->mkeys_queue.num_pages > 1 && !tmp) { 186 list_del(&last_page->list); 187 ent->mkeys_queue.num_pages--; 188 kfree(last_page); 189 } 190 return mkey; 191 } 192 193 static void create_mkey_callback(int status, struct mlx5_async_work *context) 194 { 195 struct mlx5r_async_create_mkey *mkey_out = 196 container_of(context, struct mlx5r_async_create_mkey, cb_work); 197 struct mlx5_cache_ent *ent = mkey_out->ent; 198 struct mlx5_ib_dev *dev = ent->dev; 199 unsigned long flags; 200 201 if (status) { 202 create_mkey_warn(dev, status, mkey_out->out); 203 kfree(mkey_out); 204 spin_lock_irqsave(&ent->mkeys_queue.lock, flags); 205 ent->pending--; 206 WRITE_ONCE(dev->fill_delay, 1); 207 spin_unlock_irqrestore(&ent->mkeys_queue.lock, 
flags); 208 mod_timer(&dev->delay_timer, jiffies + HZ); 209 return; 210 } 211 212 mkey_out->mkey |= mlx5_idx_to_mkey( 213 MLX5_GET(create_mkey_out, mkey_out->out, mkey_index)); 214 WRITE_ONCE(dev->cache.last_add, jiffies); 215 216 spin_lock_irqsave(&ent->mkeys_queue.lock, flags); 217 push_mkey_locked(ent, mkey_out->mkey); 218 ent->pending--; 219 /* If we are doing fill_to_high_water then keep going. */ 220 queue_adjust_cache_locked(ent); 221 spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags); 222 kfree(mkey_out); 223 } 224 225 static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs) 226 { 227 int ret = 0; 228 229 switch (access_mode) { 230 case MLX5_MKC_ACCESS_MODE_MTT: 231 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / 232 sizeof(struct mlx5_mtt)); 233 break; 234 case MLX5_MKC_ACCESS_MODE_KSM: 235 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / 236 sizeof(struct mlx5_klm)); 237 break; 238 default: 239 WARN_ON(1); 240 } 241 return ret; 242 } 243 244 static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) 245 { 246 set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0, 247 ent->dev->umrc.pd); 248 MLX5_SET(mkc, mkc, free, 1); 249 MLX5_SET(mkc, mkc, umr_en, 1); 250 MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3); 251 MLX5_SET(mkc, mkc, access_mode_4_2, 252 (ent->rb_key.access_mode >> 2) & 0x7); 253 MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats); 254 255 MLX5_SET(mkc, mkc, translations_octword_size, 256 get_mkc_octo_size(ent->rb_key.access_mode, 257 ent->rb_key.ndescs)); 258 MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); 259 } 260 261 /* Asynchronously schedule new MRs to be populated in the cache. */ 262 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num) 263 { 264 struct mlx5r_async_create_mkey *async_create; 265 void *mkc; 266 int err = 0; 267 int i; 268 269 for (i = 0; i < num; i++) { 270 async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey), 271 GFP_KERNEL); 272 if (!async_create) 273 return -ENOMEM; 274 mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in, 275 memory_key_mkey_entry); 276 set_cache_mkc(ent, mkc); 277 async_create->ent = ent; 278 279 spin_lock_irq(&ent->mkeys_queue.lock); 280 if (ent->pending >= MAX_PENDING_REG_MR) { 281 err = -EAGAIN; 282 goto free_async_create; 283 } 284 ent->pending++; 285 spin_unlock_irq(&ent->mkeys_queue.lock); 286 287 err = mlx5_ib_create_mkey_cb(async_create); 288 if (err) { 289 mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err); 290 goto err_create_mkey; 291 } 292 } 293 294 return 0; 295 296 err_create_mkey: 297 spin_lock_irq(&ent->mkeys_queue.lock); 298 ent->pending--; 299 free_async_create: 300 spin_unlock_irq(&ent->mkeys_queue.lock); 301 kfree(async_create); 302 return err; 303 } 304 305 /* Synchronously create a MR in the cache */ 306 static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey) 307 { 308 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 309 void *mkc; 310 u32 *in; 311 int err; 312 313 in = kzalloc(inlen, GFP_KERNEL); 314 if (!in) 315 return -ENOMEM; 316 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 317 set_cache_mkc(ent, mkc); 318 319 err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen); 320 if (err) 321 goto free_in; 322 323 WRITE_ONCE(ent->dev->cache.last_add, jiffies); 324 free_in: 325 kfree(in); 326 return err; 327 } 328 329 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent) 330 { 331 u32 mkey; 332 333 lockdep_assert_held(&ent->mkeys_queue.lock); 334 if (!ent->mkeys_queue.ci) 335 
return; 336 mkey = pop_mkey_locked(ent); 337 spin_unlock_irq(&ent->mkeys_queue.lock); 338 mlx5_core_destroy_mkey(ent->dev->mdev, mkey); 339 spin_lock_irq(&ent->mkeys_queue.lock); 340 } 341 342 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target, 343 bool limit_fill) 344 __acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock) 345 { 346 int err; 347 348 lockdep_assert_held(&ent->mkeys_queue.lock); 349 350 while (true) { 351 if (limit_fill) 352 target = ent->limit * 2; 353 if (target == ent->pending + ent->mkeys_queue.ci) 354 return 0; 355 if (target > ent->pending + ent->mkeys_queue.ci) { 356 u32 todo = target - (ent->pending + ent->mkeys_queue.ci); 357 358 spin_unlock_irq(&ent->mkeys_queue.lock); 359 err = add_keys(ent, todo); 360 if (err == -EAGAIN) 361 usleep_range(3000, 5000); 362 spin_lock_irq(&ent->mkeys_queue.lock); 363 if (err) { 364 if (err != -EAGAIN) 365 return err; 366 } else 367 return 0; 368 } else { 369 remove_cache_mr_locked(ent); 370 } 371 } 372 } 373 374 static ssize_t size_write(struct file *filp, const char __user *buf, 375 size_t count, loff_t *pos) 376 { 377 struct mlx5_cache_ent *ent = filp->private_data; 378 u32 target; 379 int err; 380 381 err = kstrtou32_from_user(buf, count, 0, &target); 382 if (err) 383 return err; 384 385 /* 386 * Target is the new value of total_mrs the user requests, however we 387 * cannot free MRs that are in use. Compute the target value for stored 388 * mkeys. 389 */ 390 spin_lock_irq(&ent->mkeys_queue.lock); 391 if (target < ent->in_use) { 392 err = -EINVAL; 393 goto err_unlock; 394 } 395 target = target - ent->in_use; 396 if (target < ent->limit || target > ent->limit*2) { 397 err = -EINVAL; 398 goto err_unlock; 399 } 400 err = resize_available_mrs(ent, target, false); 401 if (err) 402 goto err_unlock; 403 spin_unlock_irq(&ent->mkeys_queue.lock); 404 405 return count; 406 407 err_unlock: 408 spin_unlock_irq(&ent->mkeys_queue.lock); 409 return err; 410 } 411 412 static ssize_t size_read(struct file *filp, char __user *buf, size_t count, 413 loff_t *pos) 414 { 415 struct mlx5_cache_ent *ent = filp->private_data; 416 char lbuf[20]; 417 int err; 418 419 err = snprintf(lbuf, sizeof(lbuf), "%ld\n", 420 ent->mkeys_queue.ci + ent->in_use); 421 if (err < 0) 422 return err; 423 424 return simple_read_from_buffer(buf, count, pos, lbuf, err); 425 } 426 427 static const struct file_operations size_fops = { 428 .owner = THIS_MODULE, 429 .open = simple_open, 430 .write = size_write, 431 .read = size_read, 432 }; 433 434 static ssize_t limit_write(struct file *filp, const char __user *buf, 435 size_t count, loff_t *pos) 436 { 437 struct mlx5_cache_ent *ent = filp->private_data; 438 u32 var; 439 int err; 440 441 err = kstrtou32_from_user(buf, count, 0, &var); 442 if (err) 443 return err; 444 445 /* 446 * Upon set we immediately fill the cache to high water mark implied by 447 * the limit. 
448 */ 449 spin_lock_irq(&ent->mkeys_queue.lock); 450 ent->limit = var; 451 err = resize_available_mrs(ent, 0, true); 452 spin_unlock_irq(&ent->mkeys_queue.lock); 453 if (err) 454 return err; 455 return count; 456 } 457 458 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count, 459 loff_t *pos) 460 { 461 struct mlx5_cache_ent *ent = filp->private_data; 462 char lbuf[20]; 463 int err; 464 465 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit); 466 if (err < 0) 467 return err; 468 469 return simple_read_from_buffer(buf, count, pos, lbuf, err); 470 } 471 472 static const struct file_operations limit_fops = { 473 .owner = THIS_MODULE, 474 .open = simple_open, 475 .write = limit_write, 476 .read = limit_read, 477 }; 478 479 static bool someone_adding(struct mlx5_mkey_cache *cache) 480 { 481 struct mlx5_cache_ent *ent; 482 struct rb_node *node; 483 bool ret; 484 485 mutex_lock(&cache->rb_lock); 486 for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) { 487 ent = rb_entry(node, struct mlx5_cache_ent, node); 488 spin_lock_irq(&ent->mkeys_queue.lock); 489 ret = ent->mkeys_queue.ci < ent->limit; 490 spin_unlock_irq(&ent->mkeys_queue.lock); 491 if (ret) { 492 mutex_unlock(&cache->rb_lock); 493 return true; 494 } 495 } 496 mutex_unlock(&cache->rb_lock); 497 return false; 498 } 499 500 /* 501 * Check if the bucket is outside the high/low water mark and schedule an async 502 * update. The cache refill has hysteresis, once the low water mark is hit it is 503 * refilled up to the high mark. 504 */ 505 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) 506 { 507 lockdep_assert_held(&ent->mkeys_queue.lock); 508 509 if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp) 510 return; 511 if (ent->mkeys_queue.ci < ent->limit) { 512 ent->fill_to_high_water = true; 513 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 514 } else if (ent->fill_to_high_water && 515 ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) { 516 /* 517 * Once we start populating due to hitting a low water mark 518 * continue until we pass the high water mark. 
519 */ 520 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 521 } else if (ent->mkeys_queue.ci == 2 * ent->limit) { 522 ent->fill_to_high_water = false; 523 } else if (ent->mkeys_queue.ci > 2 * ent->limit) { 524 /* Queue deletion of excess entries */ 525 ent->fill_to_high_water = false; 526 if (ent->pending) 527 queue_delayed_work(ent->dev->cache.wq, &ent->dwork, 528 secs_to_jiffies(1)); 529 else 530 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 531 } 532 } 533 534 static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) 535 { 536 u32 mkey; 537 538 spin_lock_irq(&ent->mkeys_queue.lock); 539 while (ent->mkeys_queue.ci) { 540 mkey = pop_mkey_locked(ent); 541 spin_unlock_irq(&ent->mkeys_queue.lock); 542 mlx5_core_destroy_mkey(dev->mdev, mkey); 543 spin_lock_irq(&ent->mkeys_queue.lock); 544 } 545 ent->tmp_cleanup_scheduled = false; 546 spin_unlock_irq(&ent->mkeys_queue.lock); 547 } 548 549 static void __cache_work_func(struct mlx5_cache_ent *ent) 550 { 551 struct mlx5_ib_dev *dev = ent->dev; 552 struct mlx5_mkey_cache *cache = &dev->cache; 553 int err; 554 555 spin_lock_irq(&ent->mkeys_queue.lock); 556 if (ent->disabled) 557 goto out; 558 559 if (ent->fill_to_high_water && 560 ent->mkeys_queue.ci + ent->pending < 2 * ent->limit && 561 !READ_ONCE(dev->fill_delay)) { 562 spin_unlock_irq(&ent->mkeys_queue.lock); 563 err = add_keys(ent, 1); 564 spin_lock_irq(&ent->mkeys_queue.lock); 565 if (ent->disabled) 566 goto out; 567 if (err) { 568 /* 569 * EAGAIN only happens if there are pending MRs, so we 570 * will be rescheduled when storing them. The only 571 * failure path here is ENOMEM. 572 */ 573 if (err != -EAGAIN) { 574 mlx5_ib_warn( 575 dev, 576 "add keys command failed, err %d\n", 577 err); 578 queue_delayed_work(cache->wq, &ent->dwork, 579 secs_to_jiffies(1)); 580 } 581 } 582 } else if (ent->mkeys_queue.ci > 2 * ent->limit) { 583 bool need_delay; 584 585 /* 586 * The remove_cache_mr() logic is performed as garbage 587 * collection task. Such task is intended to be run when no 588 * other active processes are running. 589 * 590 * The need_resched() will return TRUE if there are user tasks 591 * to be activated in near future. 592 * 593 * In such case, we don't execute remove_cache_mr() and postpone 594 * the garbage collection work to try to run in next cycle, in 595 * order to free CPU resources to other tasks. 
596 */ 597 spin_unlock_irq(&ent->mkeys_queue.lock); 598 need_delay = need_resched() || someone_adding(cache) || 599 !time_after(jiffies, 600 READ_ONCE(cache->last_add) + 300 * HZ); 601 spin_lock_irq(&ent->mkeys_queue.lock); 602 if (ent->disabled) 603 goto out; 604 if (need_delay) { 605 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); 606 goto out; 607 } 608 remove_cache_mr_locked(ent); 609 queue_adjust_cache_locked(ent); 610 } 611 out: 612 spin_unlock_irq(&ent->mkeys_queue.lock); 613 } 614 615 static void delayed_cache_work_func(struct work_struct *work) 616 { 617 struct mlx5_cache_ent *ent; 618 619 ent = container_of(work, struct mlx5_cache_ent, dwork.work); 620 /* temp entries are never filled, only cleaned */ 621 if (ent->is_tmp) 622 clean_keys(ent->dev, ent); 623 else 624 __cache_work_func(ent); 625 } 626 627 static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1, 628 struct mlx5r_cache_rb_key key2) 629 { 630 int res; 631 632 res = key1.ats - key2.ats; 633 if (res) 634 return res; 635 636 res = key1.access_mode - key2.access_mode; 637 if (res) 638 return res; 639 640 res = key1.access_flags - key2.access_flags; 641 if (res) 642 return res; 643 644 /* 645 * keep ndescs the last in the compare table since the find function 646 * searches for an exact match on all properties and only closest 647 * match in size. 648 */ 649 return key1.ndescs - key2.ndescs; 650 } 651 652 static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache, 653 struct mlx5_cache_ent *ent) 654 { 655 struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL; 656 struct mlx5_cache_ent *cur; 657 int cmp; 658 659 /* Figure out where to put new node */ 660 while (*new) { 661 cur = rb_entry(*new, struct mlx5_cache_ent, node); 662 parent = *new; 663 cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key); 664 if (cmp > 0) 665 new = &((*new)->rb_left); 666 if (cmp < 0) 667 new = &((*new)->rb_right); 668 if (cmp == 0) 669 return -EEXIST; 670 } 671 672 /* Add new node and rebalance tree. */ 673 rb_link_node(&ent->node, parent, new); 674 rb_insert_color(&ent->node, &cache->rb_root); 675 676 return 0; 677 } 678 679 static struct mlx5_cache_ent * 680 mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, 681 struct mlx5r_cache_rb_key rb_key) 682 { 683 struct rb_node *node = dev->cache.rb_root.rb_node; 684 struct mlx5_cache_ent *cur, *smallest = NULL; 685 u64 ndescs_limit; 686 int cmp; 687 688 /* 689 * Find the smallest ent with order >= requested_order. 690 */ 691 while (node) { 692 cur = rb_entry(node, struct mlx5_cache_ent, node); 693 cmp = cache_ent_key_cmp(cur->rb_key, rb_key); 694 if (cmp > 0) { 695 smallest = cur; 696 node = node->rb_left; 697 } 698 if (cmp < 0) 699 node = node->rb_right; 700 if (cmp == 0) 701 return cur; 702 } 703 704 /* 705 * Limit the usage of mkeys larger than twice the required size while 706 * also allowing the usage of smallest cache entry for small MRs. 707 */ 708 ndescs_limit = max_t(u64, rb_key.ndescs * 2, 709 MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS); 710 711 return (smallest && 712 smallest->rb_key.access_mode == rb_key.access_mode && 713 smallest->rb_key.access_flags == rb_key.access_flags && 714 smallest->rb_key.ats == rb_key.ats && 715 smallest->rb_key.ndescs <= ndescs_limit) ? 
716 smallest : 717 NULL; 718 } 719 720 static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, 721 struct mlx5_cache_ent *ent) 722 { 723 struct mlx5_ib_mr *mr; 724 int err; 725 726 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 727 if (!mr) 728 return ERR_PTR(-ENOMEM); 729 730 spin_lock_irq(&ent->mkeys_queue.lock); 731 ent->in_use++; 732 733 if (!ent->mkeys_queue.ci) { 734 queue_adjust_cache_locked(ent); 735 ent->miss++; 736 spin_unlock_irq(&ent->mkeys_queue.lock); 737 err = create_cache_mkey(ent, &mr->mmkey.key); 738 if (err) { 739 spin_lock_irq(&ent->mkeys_queue.lock); 740 ent->in_use--; 741 spin_unlock_irq(&ent->mkeys_queue.lock); 742 kfree(mr); 743 return ERR_PTR(err); 744 } 745 } else { 746 mr->mmkey.key = pop_mkey_locked(ent); 747 queue_adjust_cache_locked(ent); 748 spin_unlock_irq(&ent->mkeys_queue.lock); 749 } 750 mr->mmkey.cache_ent = ent; 751 mr->mmkey.type = MLX5_MKEY_MR; 752 mr->mmkey.rb_key = ent->rb_key; 753 mr->mmkey.cacheable = true; 754 init_waitqueue_head(&mr->mmkey.wait); 755 return mr; 756 } 757 758 static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev, 759 int access_flags) 760 { 761 int ret = 0; 762 763 if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) && 764 MLX5_CAP_GEN(dev->mdev, atomic) && 765 MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled)) 766 ret |= IB_ACCESS_REMOTE_ATOMIC; 767 768 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) && 769 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) && 770 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)) 771 ret |= IB_ACCESS_RELAXED_ORDERING; 772 773 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) && 774 (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) || 775 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled)) && 776 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)) 777 ret |= IB_ACCESS_RELAXED_ORDERING; 778 779 return ret; 780 } 781 782 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, 783 int access_flags, int access_mode, 784 int ndescs) 785 { 786 struct mlx5r_cache_rb_key rb_key = { 787 .ndescs = ndescs, 788 .access_mode = access_mode, 789 .access_flags = get_unchangeable_access_flags(dev, access_flags) 790 }; 791 struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key); 792 793 if (!ent) 794 return ERR_PTR(-EOPNOTSUPP); 795 796 return _mlx5_mr_cache_alloc(dev, ent); 797 } 798 799 static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) 800 { 801 if (!mlx5_debugfs_root || dev->is_rep) 802 return; 803 804 debugfs_remove_recursive(dev->cache.fs_root); 805 dev->cache.fs_root = NULL; 806 } 807 808 static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev, 809 struct mlx5_cache_ent *ent) 810 { 811 int order = order_base_2(ent->rb_key.ndescs); 812 struct dentry *dir; 813 814 if (!mlx5_debugfs_root || dev->is_rep) 815 return; 816 817 if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM) 818 order = MLX5_IMR_KSM_CACHE_ENTRY + 2; 819 820 sprintf(ent->name, "%d", order); 821 dir = debugfs_create_dir(ent->name, dev->cache.fs_root); 822 debugfs_create_file("size", 0600, dir, ent, &size_fops); 823 debugfs_create_file("limit", 0600, dir, ent, &limit_fops); 824 debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci); 825 debugfs_create_u32("miss", 0600, dir, &ent->miss); 826 } 827 828 static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev) 829 { 830 struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev); 831 struct mlx5_mkey_cache *cache = &dev->cache; 832 833 if (!mlx5_debugfs_root || dev->is_rep) 834 return; 835 836 cache->fs_root 
= debugfs_create_dir("mr_cache", dbg_root); 837 } 838 839 static void delay_time_func(struct timer_list *t) 840 { 841 struct mlx5_ib_dev *dev = timer_container_of(dev, t, delay_timer); 842 843 WRITE_ONCE(dev->fill_delay, 0); 844 } 845 846 static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent) 847 { 848 struct mlx5_mkeys_page *page; 849 850 page = kzalloc(sizeof(*page), GFP_KERNEL); 851 if (!page) 852 return -ENOMEM; 853 INIT_LIST_HEAD(&ent->mkeys_queue.pages_list); 854 spin_lock_init(&ent->mkeys_queue.lock); 855 list_add_tail(&page->list, &ent->mkeys_queue.pages_list); 856 ent->mkeys_queue.num_pages++; 857 return 0; 858 } 859 860 static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent) 861 { 862 struct mlx5_mkeys_page *page; 863 864 WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1); 865 page = list_last_entry(&ent->mkeys_queue.pages_list, 866 struct mlx5_mkeys_page, list); 867 list_del(&page->list); 868 kfree(page); 869 } 870 871 struct mlx5_cache_ent * 872 mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, 873 struct mlx5r_cache_rb_key rb_key, 874 bool persistent_entry) 875 { 876 struct mlx5_cache_ent *ent; 877 int order; 878 int ret; 879 880 ent = kzalloc(sizeof(*ent), GFP_KERNEL); 881 if (!ent) 882 return ERR_PTR(-ENOMEM); 883 884 ret = mlx5r_mkeys_init(ent); 885 if (ret) 886 goto mkeys_err; 887 ent->rb_key = rb_key; 888 ent->dev = dev; 889 ent->is_tmp = !persistent_entry; 890 891 INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); 892 893 ret = mlx5_cache_ent_insert(&dev->cache, ent); 894 if (ret) 895 goto ent_insert_err; 896 897 if (persistent_entry) { 898 if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM) 899 order = MLX5_IMR_KSM_CACHE_ENTRY; 900 else 901 order = order_base_2(rb_key.ndescs) - 2; 902 903 if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) && 904 !dev->is_rep && mlx5_core_is_pf(dev->mdev) && 905 mlx5r_umr_can_load_pas(dev, 0)) 906 ent->limit = dev->mdev->profile.mr_cache[order].limit; 907 else 908 ent->limit = 0; 909 910 mlx5_mkey_cache_debugfs_add_ent(dev, ent); 911 } 912 913 return ent; 914 ent_insert_err: 915 mlx5r_mkeys_uninit(ent); 916 mkeys_err: 917 kfree(ent); 918 return ERR_PTR(ret); 919 } 920 921 static void mlx5r_destroy_cache_entries(struct mlx5_ib_dev *dev) 922 { 923 struct rb_root *root = &dev->cache.rb_root; 924 struct mlx5_cache_ent *ent; 925 struct rb_node *node; 926 927 mutex_lock(&dev->cache.rb_lock); 928 node = rb_first(root); 929 while (node) { 930 ent = rb_entry(node, struct mlx5_cache_ent, node); 931 node = rb_next(node); 932 clean_keys(dev, ent); 933 rb_erase(&ent->node, root); 934 mlx5r_mkeys_uninit(ent); 935 kfree(ent); 936 } 937 mutex_unlock(&dev->cache.rb_lock); 938 } 939 940 int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) 941 { 942 struct mlx5_mkey_cache *cache = &dev->cache; 943 struct rb_root *root = &dev->cache.rb_root; 944 struct mlx5r_cache_rb_key rb_key = { 945 .access_mode = MLX5_MKC_ACCESS_MODE_MTT, 946 }; 947 struct mlx5_cache_ent *ent; 948 struct rb_node *node; 949 int ret; 950 int i; 951 952 mutex_init(&dev->slow_path_mutex); 953 mutex_init(&dev->cache.rb_lock); 954 dev->cache.rb_root = RB_ROOT; 955 cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); 956 if (!cache->wq) { 957 mlx5_ib_warn(dev, "failed to create work queue\n"); 958 return -ENOMEM; 959 } 960 961 mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); 962 timer_setup(&dev->delay_timer, delay_time_func, 0); 963 mlx5_mkey_cache_debugfs_init(dev); 964 mutex_lock(&cache->rb_lock); 965 for (i = 0; i <= mkey_cache_max_order(dev); 
i++) { 966 rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i; 967 ent = mlx5r_cache_create_ent_locked(dev, rb_key, true); 968 if (IS_ERR(ent)) { 969 ret = PTR_ERR(ent); 970 goto err; 971 } 972 } 973 974 ret = mlx5_odp_init_mkey_cache(dev); 975 if (ret) 976 goto err; 977 978 mutex_unlock(&cache->rb_lock); 979 for (node = rb_first(root); node; node = rb_next(node)) { 980 ent = rb_entry(node, struct mlx5_cache_ent, node); 981 spin_lock_irq(&ent->mkeys_queue.lock); 982 queue_adjust_cache_locked(ent); 983 spin_unlock_irq(&ent->mkeys_queue.lock); 984 } 985 986 return 0; 987 988 err: 989 mutex_unlock(&cache->rb_lock); 990 mlx5_mkey_cache_debugfs_cleanup(dev); 991 mlx5r_destroy_cache_entries(dev); 992 destroy_workqueue(cache->wq); 993 mlx5_ib_warn(dev, "failed to create mkey cache entry\n"); 994 return ret; 995 } 996 997 void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) 998 { 999 struct rb_root *root = &dev->cache.rb_root; 1000 struct mlx5_cache_ent *ent; 1001 struct rb_node *node; 1002 1003 if (!dev->cache.wq) 1004 return; 1005 1006 mutex_lock(&dev->cache.rb_lock); 1007 for (node = rb_first(root); node; node = rb_next(node)) { 1008 ent = rb_entry(node, struct mlx5_cache_ent, node); 1009 spin_lock_irq(&ent->mkeys_queue.lock); 1010 ent->disabled = true; 1011 spin_unlock_irq(&ent->mkeys_queue.lock); 1012 cancel_delayed_work(&ent->dwork); 1013 } 1014 mutex_unlock(&dev->cache.rb_lock); 1015 1016 /* 1017 * After all entries are disabled and will not reschedule on WQ, 1018 * flush it and all async commands. 1019 */ 1020 flush_workqueue(dev->cache.wq); 1021 1022 mlx5_mkey_cache_debugfs_cleanup(dev); 1023 mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); 1024 1025 /* At this point all entries are disabled and have no concurrent work. */ 1026 mlx5r_destroy_cache_entries(dev); 1027 1028 destroy_workqueue(dev->cache.wq); 1029 timer_delete_sync(&dev->delay_timer); 1030 } 1031 1032 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) 1033 { 1034 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1035 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1036 struct mlx5_ib_mr *mr; 1037 void *mkc; 1038 u32 *in; 1039 int err; 1040 1041 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1042 if (!mr) 1043 return ERR_PTR(-ENOMEM); 1044 1045 in = kzalloc(inlen, GFP_KERNEL); 1046 if (!in) { 1047 err = -ENOMEM; 1048 goto err_free; 1049 } 1050 1051 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1052 1053 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); 1054 MLX5_SET(mkc, mkc, length64, 1); 1055 set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0, 1056 pd); 1057 MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats)); 1058 1059 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1060 if (err) 1061 goto err_in; 1062 1063 kfree(in); 1064 mr->mmkey.type = MLX5_MKEY_MR; 1065 mr->ibmr.lkey = mr->mmkey.key; 1066 mr->ibmr.rkey = mr->mmkey.key; 1067 mr->umem = NULL; 1068 1069 return &mr->ibmr; 1070 1071 err_in: 1072 kfree(in); 1073 1074 err_free: 1075 kfree(mr); 1076 1077 return ERR_PTR(err); 1078 } 1079 1080 static int get_octo_len(u64 addr, u64 len, int page_shift) 1081 { 1082 u64 page_size = 1ULL << page_shift; 1083 u64 offset; 1084 int npages; 1085 1086 offset = addr & (page_size - 1); 1087 npages = ALIGN(len + offset, page_size) >> page_shift; 1088 return (npages + 1) / 2; 1089 } 1090 1091 static int mkey_cache_max_order(struct mlx5_ib_dev *dev) 1092 { 1093 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) 1094 return MKEY_CACHE_LAST_STD_ENTRY; 1095 
return MLX5_MAX_UMR_SHIFT; 1096 } 1097 1098 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, 1099 u64 length, int access_flags, u64 iova) 1100 { 1101 mr->ibmr.lkey = mr->mmkey.key; 1102 mr->ibmr.rkey = mr->mmkey.key; 1103 mr->ibmr.length = length; 1104 mr->ibmr.device = &dev->ib_dev; 1105 mr->ibmr.iova = iova; 1106 mr->access_flags = access_flags; 1107 } 1108 1109 static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, 1110 u64 iova) 1111 { 1112 /* 1113 * The alignment of iova has already been checked upon entering 1114 * UVERBS_METHOD_REG_DMABUF_MR 1115 */ 1116 umem->iova = iova; 1117 return PAGE_SIZE; 1118 } 1119 1120 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, 1121 struct ib_umem *umem, u64 iova, 1122 int access_flags, int access_mode) 1123 { 1124 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1125 struct mlx5r_cache_rb_key rb_key = {}; 1126 struct mlx5_cache_ent *ent; 1127 struct mlx5_ib_mr *mr; 1128 unsigned long page_size; 1129 1130 if (umem->is_dmabuf) 1131 page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); 1132 else 1133 page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova, 1134 access_mode); 1135 if (WARN_ON(!page_size)) 1136 return ERR_PTR(-EINVAL); 1137 1138 rb_key.access_mode = access_mode; 1139 rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size); 1140 rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags); 1141 rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags); 1142 ent = mkey_cache_ent_from_rb_key(dev, rb_key); 1143 /* 1144 * If the MR can't come from the cache then synchronously create an uncached 1145 * one. 1146 */ 1147 if (!ent) { 1148 mutex_lock(&dev->slow_path_mutex); 1149 mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode); 1150 mutex_unlock(&dev->slow_path_mutex); 1151 if (IS_ERR(mr)) 1152 return mr; 1153 mr->mmkey.rb_key = rb_key; 1154 mr->mmkey.cacheable = true; 1155 return mr; 1156 } 1157 1158 mr = _mlx5_mr_cache_alloc(dev, ent); 1159 if (IS_ERR(mr)) 1160 return mr; 1161 1162 mr->ibmr.pd = pd; 1163 mr->umem = umem; 1164 mr->page_shift = order_base_2(page_size); 1165 set_mr_fields(dev, mr, umem->length, access_flags, iova); 1166 1167 return mr; 1168 } 1169 1170 static struct ib_mr * 1171 reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags, 1172 u32 crossed_lkey) 1173 { 1174 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1175 int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING; 1176 struct mlx5_ib_mr *mr; 1177 void *mkc; 1178 int inlen; 1179 u32 *in; 1180 int err; 1181 1182 if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey)) 1183 return ERR_PTR(-EOPNOTSUPP); 1184 1185 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1186 if (!mr) 1187 return ERR_PTR(-ENOMEM); 1188 1189 inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1190 in = kvzalloc(inlen, GFP_KERNEL); 1191 if (!in) { 1192 err = -ENOMEM; 1193 goto err_1; 1194 } 1195 1196 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1197 MLX5_SET(mkc, mkc, crossing_target_vhca_id, 1198 MLX5_CAP_GEN(dev->mdev, vhca_id)); 1199 MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey); 1200 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); 1201 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); 1202 1203 /* for this crossing mkey IOVA should be 0 and len should be IOVA + len */ 1204 set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd); 1205 MLX5_SET64(mkc, mkc, len, iova + length); 1206 1207 MLX5_SET(mkc, mkc, free, 0); 1208 MLX5_SET(mkc, mkc, umr_en, 0); 1209 err = 
mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1210 if (err) 1211 goto err_2; 1212 1213 mr->mmkey.type = MLX5_MKEY_MR; 1214 set_mr_fields(dev, mr, length, access_flags, iova); 1215 mr->ibmr.pd = pd; 1216 kvfree(in); 1217 mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key); 1218 1219 return &mr->ibmr; 1220 err_2: 1221 kvfree(in); 1222 err_1: 1223 kfree(mr); 1224 return ERR_PTR(err); 1225 } 1226 1227 /* 1228 * If ibmr is NULL it will be allocated by reg_create. 1229 * Else, the given ibmr will be used. 1230 */ 1231 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, 1232 u64 iova, int access_flags, 1233 unsigned long page_size, bool populate, 1234 int access_mode) 1235 { 1236 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1237 struct mlx5_ib_mr *mr; 1238 __be64 *pas; 1239 void *mkc; 1240 int inlen; 1241 u32 *in; 1242 int err; 1243 bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) && 1244 (access_mode == MLX5_MKC_ACCESS_MODE_MTT); 1245 bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); 1246 1247 if (!page_size) 1248 return ERR_PTR(-EINVAL); 1249 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1250 if (!mr) 1251 return ERR_PTR(-ENOMEM); 1252 1253 mr->ibmr.pd = pd; 1254 mr->access_flags = access_flags; 1255 mr->page_shift = order_base_2(page_size); 1256 1257 inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1258 if (populate) 1259 inlen += sizeof(*pas) * 1260 roundup(ib_umem_num_dma_blocks(umem, page_size), 2); 1261 in = kvzalloc(inlen, GFP_KERNEL); 1262 if (!in) { 1263 err = -ENOMEM; 1264 goto err_1; 1265 } 1266 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); 1267 if (populate) { 1268 if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) { 1269 err = -EINVAL; 1270 goto err_2; 1271 } 1272 mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas, 1273 pg_cap ? MLX5_IB_MTT_PRESENT : 0); 1274 } 1275 1276 /* The pg_access bit allows setting the access flags 1277 * in the page list submitted with the command. 1278 */ 1279 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap)); 1280 1281 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1282 set_mkc_access_pd_addr_fields(mkc, access_flags, iova, 1283 populate ? 
pd : dev->umrc.pd); 1284 /* In case a data direct flow, overwrite the pdn field by its internal kernel PD */ 1285 if (umem->is_dmabuf && ksm_mode) 1286 MLX5_SET(mkc, mkc, pd, dev->ddr.pdn); 1287 1288 MLX5_SET(mkc, mkc, free, !populate); 1289 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode); 1290 MLX5_SET(mkc, mkc, umr_en, 1); 1291 1292 MLX5_SET64(mkc, mkc, len, umem->length); 1293 MLX5_SET(mkc, mkc, bsf_octword_size, 0); 1294 if (ksm_mode) 1295 MLX5_SET(mkc, mkc, translations_octword_size, 1296 get_octo_len(iova, umem->length, mr->page_shift) * 2); 1297 else 1298 MLX5_SET(mkc, mkc, translations_octword_size, 1299 get_octo_len(iova, umem->length, mr->page_shift)); 1300 MLX5_SET(mkc, mkc, log_page_size, mr->page_shift); 1301 if (mlx5_umem_needs_ats(dev, umem, access_flags)) 1302 MLX5_SET(mkc, mkc, ma_translation_mode, 1); 1303 if (populate) { 1304 MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 1305 get_octo_len(iova, umem->length, mr->page_shift)); 1306 } 1307 1308 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1309 if (err) { 1310 mlx5_ib_warn(dev, "create mkey failed\n"); 1311 goto err_2; 1312 } 1313 mr->mmkey.type = MLX5_MKEY_MR; 1314 mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift); 1315 mr->umem = umem; 1316 set_mr_fields(dev, mr, umem->length, access_flags, iova); 1317 kvfree(in); 1318 1319 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); 1320 1321 return mr; 1322 1323 err_2: 1324 kvfree(in); 1325 err_1: 1326 kfree(mr); 1327 return ERR_PTR(err); 1328 } 1329 1330 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr, 1331 u64 length, int acc, int mode) 1332 { 1333 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1334 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1335 struct mlx5_ib_mr *mr; 1336 void *mkc; 1337 u32 *in; 1338 int err; 1339 1340 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1341 if (!mr) 1342 return ERR_PTR(-ENOMEM); 1343 1344 in = kzalloc(inlen, GFP_KERNEL); 1345 if (!in) { 1346 err = -ENOMEM; 1347 goto err_free; 1348 } 1349 1350 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1351 1352 MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3); 1353 MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7); 1354 MLX5_SET64(mkc, mkc, len, length); 1355 set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd); 1356 1357 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1358 if (err) 1359 goto err_in; 1360 1361 kfree(in); 1362 1363 set_mr_fields(dev, mr, length, acc, start_addr); 1364 1365 return &mr->ibmr; 1366 1367 err_in: 1368 kfree(in); 1369 1370 err_free: 1371 kfree(mr); 1372 1373 return ERR_PTR(err); 1374 } 1375 1376 int mlx5_ib_advise_mr(struct ib_pd *pd, 1377 enum ib_uverbs_advise_mr_advice advice, 1378 u32 flags, 1379 struct ib_sge *sg_list, 1380 u32 num_sge, 1381 struct uverbs_attr_bundle *attrs) 1382 { 1383 if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH && 1384 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && 1385 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT) 1386 return -EOPNOTSUPP; 1387 1388 return mlx5_ib_advise_mr_prefetch(pd, advice, flags, 1389 sg_list, num_sge); 1390 } 1391 1392 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, 1393 struct ib_dm_mr_attr *attr, 1394 struct uverbs_attr_bundle *attrs) 1395 { 1396 struct mlx5_ib_dm *mdm = to_mdm(dm); 1397 struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev; 1398 u64 start_addr = mdm->dev_addr + attr->offset; 1399 int mode; 1400 1401 switch (mdm->type) { 1402 case MLX5_IB_UAPI_DM_TYPE_MEMIC: 1403 if 
(attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS) 1404 return ERR_PTR(-EINVAL); 1405 1406 mode = MLX5_MKC_ACCESS_MODE_MEMIC; 1407 start_addr -= pci_resource_start(dev->pdev, 0); 1408 break; 1409 case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: 1410 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: 1411 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM: 1412 case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM: 1413 if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS) 1414 return ERR_PTR(-EINVAL); 1415 1416 mode = MLX5_MKC_ACCESS_MODE_SW_ICM; 1417 break; 1418 default: 1419 return ERR_PTR(-EINVAL); 1420 } 1421 1422 return mlx5_ib_get_dm_mr(pd, start_addr, attr->length, 1423 attr->access_flags, mode); 1424 } 1425 1426 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, 1427 u64 iova, int access_flags) 1428 { 1429 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1430 struct mlx5_ib_mr *mr = NULL; 1431 bool xlt_with_umr; 1432 int err; 1433 1434 xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); 1435 if (xlt_with_umr) { 1436 mr = alloc_cacheable_mr(pd, umem, iova, access_flags, 1437 MLX5_MKC_ACCESS_MODE_MTT); 1438 } else { 1439 unsigned long page_size = mlx5_umem_mkc_find_best_pgsz( 1440 dev, umem, iova, MLX5_MKC_ACCESS_MODE_MTT); 1441 1442 mutex_lock(&dev->slow_path_mutex); 1443 mr = reg_create(pd, umem, iova, access_flags, page_size, 1444 true, MLX5_MKC_ACCESS_MODE_MTT); 1445 mutex_unlock(&dev->slow_path_mutex); 1446 } 1447 if (IS_ERR(mr)) { 1448 ib_umem_release(umem); 1449 return ERR_CAST(mr); 1450 } 1451 1452 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); 1453 1454 atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); 1455 1456 if (xlt_with_umr) { 1457 /* 1458 * If the MR was created with reg_create then it will be 1459 * configured properly but left disabled. It is safe to go ahead 1460 * and configure it again via UMR while enabling it. 1461 */ 1462 err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE); 1463 if (err) { 1464 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1465 return ERR_PTR(err); 1466 } 1467 } 1468 return &mr->ibmr; 1469 } 1470 1471 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, 1472 u64 iova, int access_flags, 1473 struct ib_udata *udata) 1474 { 1475 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1476 struct ib_umem_odp *odp; 1477 struct mlx5_ib_mr *mr; 1478 int err; 1479 1480 if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) 1481 return ERR_PTR(-EOPNOTSUPP); 1482 1483 err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq); 1484 if (err) 1485 return ERR_PTR(err); 1486 if (!start && length == U64_MAX) { 1487 if (iova != 0) 1488 return ERR_PTR(-EINVAL); 1489 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) 1490 return ERR_PTR(-EINVAL); 1491 1492 mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags); 1493 if (IS_ERR(mr)) 1494 return ERR_CAST(mr); 1495 return &mr->ibmr; 1496 } 1497 1498 /* ODP requires xlt update via umr to work. 
*/ 1499 if (!mlx5r_umr_can_load_pas(dev, length)) 1500 return ERR_PTR(-EINVAL); 1501 1502 odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags, 1503 &mlx5_mn_ops); 1504 if (IS_ERR(odp)) 1505 return ERR_CAST(odp); 1506 1507 mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags, 1508 MLX5_MKC_ACCESS_MODE_MTT); 1509 if (IS_ERR(mr)) { 1510 ib_umem_release(&odp->umem); 1511 return ERR_CAST(mr); 1512 } 1513 xa_init(&mr->implicit_children); 1514 1515 odp->private = mr; 1516 err = mlx5r_store_odp_mkey(dev, &mr->mmkey); 1517 if (err) 1518 goto err_dereg_mr; 1519 1520 err = mlx5_ib_init_odp_mr(mr); 1521 if (err) 1522 goto err_dereg_mr; 1523 return &mr->ibmr; 1524 1525 err_dereg_mr: 1526 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1527 return ERR_PTR(err); 1528 } 1529 1530 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, 1531 u64 iova, int access_flags, 1532 struct ib_udata *udata) 1533 { 1534 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1535 struct ib_umem *umem; 1536 int err; 1537 1538 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) 1539 return ERR_PTR(-EOPNOTSUPP); 1540 1541 mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1542 start, iova, length, access_flags); 1543 1544 err = mlx5r_umr_resource_init(dev); 1545 if (err) 1546 return ERR_PTR(err); 1547 1548 if (access_flags & IB_ACCESS_ON_DEMAND) 1549 return create_user_odp_mr(pd, start, length, iova, access_flags, 1550 udata); 1551 umem = ib_umem_get(&dev->ib_dev, start, length, access_flags); 1552 if (IS_ERR(umem)) 1553 return ERR_CAST(umem); 1554 return create_real_mr(pd, umem, iova, access_flags); 1555 } 1556 1557 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) 1558 { 1559 struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; 1560 struct mlx5_ib_mr *mr = umem_dmabuf->private; 1561 1562 dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); 1563 1564 if (!umem_dmabuf->sgt || !mr) 1565 return; 1566 1567 mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); 1568 ib_umem_dmabuf_unmap_pages(umem_dmabuf); 1569 } 1570 1571 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { 1572 .allow_peer2peer = 1, 1573 .move_notify = mlx5_ib_dmabuf_invalidate_cb, 1574 }; 1575 1576 static struct ib_mr * 1577 reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, 1578 u64 offset, u64 length, u64 virt_addr, 1579 int fd, int access_flags, int access_mode) 1580 { 1581 bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); 1582 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1583 struct mlx5_ib_mr *mr = NULL; 1584 struct ib_umem_dmabuf *umem_dmabuf; 1585 int err; 1586 1587 err = mlx5r_umr_resource_init(dev); 1588 if (err) 1589 return ERR_PTR(err); 1590 1591 if (!pinned_mode) 1592 umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, 1593 offset, length, fd, 1594 access_flags, 1595 &mlx5_ib_dmabuf_attach_ops); 1596 else 1597 umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev, 1598 dma_device, offset, length, 1599 fd, access_flags); 1600 1601 if (IS_ERR(umem_dmabuf)) { 1602 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", 1603 PTR_ERR(umem_dmabuf)); 1604 return ERR_CAST(umem_dmabuf); 1605 } 1606 1607 mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, 1608 access_flags, access_mode); 1609 if (IS_ERR(mr)) { 1610 ib_umem_release(&umem_dmabuf->umem); 1611 return ERR_CAST(mr); 1612 } 1613 1614 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); 1615 1616 atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); 1617 
umem_dmabuf->private = mr; 1618 if (!pinned_mode) { 1619 err = mlx5r_store_odp_mkey(dev, &mr->mmkey); 1620 if (err) 1621 goto err_dereg_mr; 1622 } else { 1623 mr->data_direct = true; 1624 } 1625 1626 err = mlx5_ib_init_dmabuf_mr(mr); 1627 if (err) 1628 goto err_dereg_mr; 1629 return &mr->ibmr; 1630 1631 err_dereg_mr: 1632 __mlx5_ib_dereg_mr(&mr->ibmr); 1633 return ERR_PTR(err); 1634 } 1635 1636 static struct ib_mr * 1637 reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset, 1638 u64 length, u64 virt_addr, 1639 int fd, int access_flags) 1640 { 1641 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1642 struct mlx5_data_direct_dev *data_direct_dev; 1643 struct ib_mr *crossing_mr; 1644 struct ib_mr *crossed_mr; 1645 int ret = 0; 1646 1647 /* As of HW behaviour the IOVA must be page aligned in KSM mode */ 1648 if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND)) 1649 return ERR_PTR(-EOPNOTSUPP); 1650 1651 mutex_lock(&dev->data_direct_lock); 1652 data_direct_dev = dev->data_direct_dev; 1653 if (!data_direct_dev) { 1654 ret = -EINVAL; 1655 goto end; 1656 } 1657 1658 /* The device's 'data direct mkey' was created without RO flags to 1659 * simplify things and allow for a single mkey per device. 1660 * Since RO is not a must, mask it out accordingly. 1661 */ 1662 access_flags &= ~IB_ACCESS_RELAXED_ORDERING; 1663 crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev, 1664 offset, length, virt_addr, fd, 1665 access_flags, MLX5_MKC_ACCESS_MODE_KSM); 1666 if (IS_ERR(crossed_mr)) { 1667 ret = PTR_ERR(crossed_mr); 1668 goto end; 1669 } 1670 1671 mutex_lock(&dev->slow_path_mutex); 1672 crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags, 1673 crossed_mr->lkey); 1674 mutex_unlock(&dev->slow_path_mutex); 1675 if (IS_ERR(crossing_mr)) { 1676 __mlx5_ib_dereg_mr(crossed_mr); 1677 ret = PTR_ERR(crossing_mr); 1678 goto end; 1679 } 1680 1681 list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list); 1682 to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr); 1683 to_mmr(crossing_mr)->data_direct = true; 1684 end: 1685 mutex_unlock(&dev->data_direct_lock); 1686 return ret ? ERR_PTR(ret) : crossing_mr; 1687 } 1688 1689 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, 1690 u64 length, u64 virt_addr, 1691 int fd, int access_flags, 1692 struct uverbs_attr_bundle *attrs) 1693 { 1694 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1695 int mlx5_access_flags = 0; 1696 int err; 1697 1698 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || 1699 !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) 1700 return ERR_PTR(-EOPNOTSUPP); 1701 1702 if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) { 1703 err = uverbs_get_flags32(&mlx5_access_flags, attrs, 1704 MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, 1705 MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT); 1706 if (err) 1707 return ERR_PTR(err); 1708 } 1709 1710 mlx5_ib_dbg(dev, 1711 "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n", 1712 offset, virt_addr, length, fd, access_flags, mlx5_access_flags); 1713 1714 /* dmabuf requires xlt update via umr to work. 
*/ 1715 if (!mlx5r_umr_can_load_pas(dev, length)) 1716 return ERR_PTR(-EINVAL); 1717 1718 if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT) 1719 return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr, 1720 fd, access_flags); 1721 1722 return reg_user_mr_dmabuf(pd, pd->device->dma_device, 1723 offset, length, virt_addr, 1724 fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT); 1725 } 1726 1727 /* 1728 * True if the change in access flags can be done via UMR, only some access 1729 * flags can be updated. 1730 */ 1731 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev, 1732 unsigned int current_access_flags, 1733 unsigned int target_access_flags) 1734 { 1735 unsigned int diffs = current_access_flags ^ target_access_flags; 1736 1737 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | 1738 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING | 1739 IB_ACCESS_REMOTE_ATOMIC)) 1740 return false; 1741 return mlx5r_umr_can_reconfig(dev, current_access_flags, 1742 target_access_flags); 1743 } 1744 1745 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, 1746 struct ib_umem *new_umem, 1747 int new_access_flags, u64 iova, 1748 unsigned long *page_size) 1749 { 1750 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1751 1752 /* We only track the allocated sizes of MRs from the cache */ 1753 if (!mr->mmkey.cache_ent) 1754 return false; 1755 if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) 1756 return false; 1757 1758 *page_size = mlx5_umem_mkc_find_best_pgsz( 1759 dev, new_umem, iova, mr->mmkey.cache_ent->rb_key.access_mode); 1760 if (WARN_ON(!*page_size)) 1761 return false; 1762 return (mr->mmkey.cache_ent->rb_key.ndescs) >= 1763 ib_umem_num_dma_blocks(new_umem, *page_size); 1764 } 1765 1766 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd, 1767 int access_flags, int flags, struct ib_umem *new_umem, 1768 u64 iova, unsigned long page_size) 1769 { 1770 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1771 int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE; 1772 struct ib_umem *old_umem = mr->umem; 1773 int err; 1774 1775 /* 1776 * To keep everything simple the MR is revoked before we start to mess 1777 * with it. This ensure the change is atomic relative to any use of the 1778 * MR. 1779 */ 1780 err = mlx5r_umr_revoke_mr(mr); 1781 if (err) 1782 return err; 1783 1784 if (flags & IB_MR_REREG_PD) { 1785 mr->ibmr.pd = pd; 1786 upd_flags |= MLX5_IB_UPD_XLT_PD; 1787 } 1788 if (flags & IB_MR_REREG_ACCESS) { 1789 mr->access_flags = access_flags; 1790 upd_flags |= MLX5_IB_UPD_XLT_ACCESS; 1791 } 1792 1793 mr->ibmr.iova = iova; 1794 mr->ibmr.length = new_umem->length; 1795 mr->page_shift = order_base_2(page_size); 1796 mr->umem = new_umem; 1797 err = mlx5r_umr_update_mr_pas(mr, upd_flags); 1798 if (err) { 1799 /* 1800 * The MR is revoked at this point so there is no issue to free 1801 * new_umem. 
1802 */ 1803 mr->umem = old_umem; 1804 return err; 1805 } 1806 1807 atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages); 1808 ib_umem_release(old_umem); 1809 atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages); 1810 return 0; 1811 } 1812 1813 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, 1814 u64 length, u64 iova, int new_access_flags, 1815 struct ib_pd *new_pd, 1816 struct ib_udata *udata) 1817 { 1818 struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); 1819 struct mlx5_ib_mr *mr = to_mmr(ib_mr); 1820 int err; 1821 1822 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct) 1823 return ERR_PTR(-EOPNOTSUPP); 1824 1825 mlx5_ib_dbg( 1826 dev, 1827 "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1828 start, iova, length, new_access_flags); 1829 1830 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) 1831 return ERR_PTR(-EOPNOTSUPP); 1832 1833 if (!(flags & IB_MR_REREG_ACCESS)) 1834 new_access_flags = mr->access_flags; 1835 if (!(flags & IB_MR_REREG_PD)) 1836 new_pd = ib_mr->pd; 1837 1838 if (!(flags & IB_MR_REREG_TRANS)) { 1839 struct ib_umem *umem; 1840 1841 /* Fast path for PD/access change */ 1842 if (can_use_umr_rereg_access(dev, mr->access_flags, 1843 new_access_flags)) { 1844 err = mlx5r_umr_rereg_pd_access(mr, new_pd, 1845 new_access_flags); 1846 if (err) 1847 return ERR_PTR(err); 1848 return NULL; 1849 } 1850 /* DM or ODP MR's don't have a normal umem so we can't re-use it */ 1851 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) 1852 goto recreate; 1853 1854 /* 1855 * Only one active MR can refer to a umem at one time, revoke 1856 * the old MR before assigning the umem to the new one. 1857 */ 1858 err = mlx5r_umr_revoke_mr(mr); 1859 if (err) 1860 return ERR_PTR(err); 1861 umem = mr->umem; 1862 mr->umem = NULL; 1863 atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); 1864 1865 return create_real_mr(new_pd, umem, mr->ibmr.iova, 1866 new_access_flags); 1867 } 1868 1869 /* 1870 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does 1871 * but the logic around releasing the umem is different 1872 */ 1873 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) 1874 goto recreate; 1875 1876 if (!(new_access_flags & IB_ACCESS_ON_DEMAND) && 1877 can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) { 1878 struct ib_umem *new_umem; 1879 unsigned long page_size; 1880 1881 new_umem = ib_umem_get(&dev->ib_dev, start, length, 1882 new_access_flags); 1883 if (IS_ERR(new_umem)) 1884 return ERR_CAST(new_umem); 1885 1886 /* Fast path for PAS change */ 1887 if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova, 1888 &page_size)) { 1889 err = umr_rereg_pas(mr, new_pd, new_access_flags, flags, 1890 new_umem, iova, page_size); 1891 if (err) { 1892 ib_umem_release(new_umem); 1893 return ERR_PTR(err); 1894 } 1895 return NULL; 1896 } 1897 return create_real_mr(new_pd, new_umem, iova, new_access_flags); 1898 } 1899 1900 /* 1901 * Everything else has no state we can preserve, just create a new MR 1902 * from scratch 1903 */ 1904 recreate: 1905 return mlx5_ib_reg_user_mr(new_pd, start, length, iova, 1906 new_access_flags, udata); 1907 } 1908 1909 static int 1910 mlx5_alloc_priv_descs(struct ib_device *device, 1911 struct mlx5_ib_mr *mr, 1912 int ndescs, 1913 int desc_size) 1914 { 1915 struct mlx5_ib_dev *dev = to_mdev(device); 1916 struct device *ddev = &dev->mdev->pdev->dev; 1917 int size = ndescs * desc_size; 1918 int add_size; 1919 int ret; 1920 1921 
	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
	if (is_power_of_2(MLX5_UMR_ALIGN) && add_size) {
		int end = max_t(int, MLX5_UMR_ALIGN, roundup_pow_of_two(size));

		add_size = min_t(int, end - size, add_size);
	}

	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
	if (!mr->descs_alloc)
		return -ENOMEM;

	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);

	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, mr->desc_map)) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	kfree(mr->descs_alloc);

	return ret;
}

static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
	if (!mr->umem && !mr->data_direct &&
	    mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) {
		struct ib_device *device = mr->ibmr.device;
		int size = mr->max_descs * mr->desc_size;
		struct mlx5_ib_dev *dev = to_mdev(device);

		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
				 DMA_TO_DEVICE);
		kfree(mr->descs_alloc);
		mr->descs = NULL;
	}
}

static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
				    struct mlx5_ib_mr *mr)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int ret;

	if (mr->mmkey.cache_ent) {
		spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
		goto end;
	}

	mutex_lock(&cache->rb_lock);
	ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
	if (ent) {
		if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
			if (ent->disabled) {
				mutex_unlock(&cache->rb_lock);
				return -EOPNOTSUPP;
			}
			mr->mmkey.cache_ent = ent;
			spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
			mutex_unlock(&cache->rb_lock);
			goto end;
		}
	}

	ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
	mutex_unlock(&cache->rb_lock);
	if (IS_ERR(ent))
		return PTR_ERR(ent);

	mr->mmkey.cache_ent = ent;
	spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);

end:
	ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key);
	spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
	return ret;
}

static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
	int err;

	lockdep_assert_held(&dev->data_direct_lock);
	mr->revoked = true;
	err = mlx5r_umr_revoke_mr(mr);
	if (WARN_ON(err))
		return err;

	ib_umem_dmabuf_revoke(umem_dmabuf);
	return 0;
}

void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
{
	struct mlx5_ib_mr *mr, *next;

	lockdep_assert_held(&dev->data_direct_lock);

	list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) {
		list_del(&mr->dd_node);
		mlx5_ib_revoke_data_direct_mr(mr);
	}
}

static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
	bool is_odp = is_odp_mr(mr);
	bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
			      !to_ib_umem_dmabuf(mr->umem)->pinned;
	bool from_cache = !!ent;
	int ret = 0;

	if (is_odp)
		mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);

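	/*
	 * A non-pinned dmabuf umem is revoked under its dma-resv lock so the
	 * revoke cannot race with a dmabuf move/invalidate notification.
	 */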
	if (is_odp_dma_buf)
		dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL);

	if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
		ent = mr->mmkey.cache_ent;
		/* upon storing to a clean temp entry - schedule its cleanup */
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (from_cache)
			ent->in_use--;
		if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
			mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
					 secs_to_jiffies(30));
			ent->tmp_cleanup_scheduled = true;
		}
		spin_unlock_irq(&ent->mkeys_queue.lock);
		goto out;
	}

	if (ent) {
		spin_lock_irq(&ent->mkeys_queue.lock);
		ent->in_use--;
		mr->mmkey.cache_ent = NULL;
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}
	ret = destroy_mkey(dev, mr);
out:
	if (is_odp) {
		if (!ret)
			to_ib_umem_odp(mr->umem)->private = NULL;
		mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
	}

	if (is_odp_dma_buf) {
		if (!ret)
			to_ib_umem_dmabuf(mr->umem)->private = NULL;
		dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
	}

	return ret;
}

static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	int rc;

	/*
	 * Any async use of the MR must hold the refcount; once the refcount
	 * goes to zero no other thread, such as ODP page faults, prefetch or
	 * any UMR activity, can touch the mkey. Thus it is safe to destroy it.
	 */
	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    refcount_read(&mr->mmkey.usecount) != 0 &&
	    xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
		mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
		xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			   mr->sig, NULL, GFP_KERNEL);

		if (mr->mtt_mr) {
			rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->mtt_mr = NULL;
		}
		if (mr->klm_mr) {
			rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->klm_mr = NULL;
		}

		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_memory.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
				     mr->sig->psv_memory.psv_idx);
		if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
				     mr->sig->psv_wire.psv_idx);
		kfree(mr->sig);
		mr->sig = NULL;
	}

	/* Stop DMA */
	rc = mlx5_revoke_mr(mr);
	if (rc)
		return rc;

	if (mr->umem) {
		bool is_odp = is_odp_mr(mr);

		if (!is_odp)
			atomic_sub(ib_umem_num_pages(mr->umem),
				   &dev->mdev->priv.reg_pages);
		ib_umem_release(mr->umem);
		if (is_odp)
			mlx5_ib_free_odp_mr(mr);
	}

	if (!mr->mmkey.cache_ent)
		mlx5_free_priv_descs(mr);

	kfree(mr);
	return 0;
}

static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev,
					 struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr;
	int ret;

	ret = __mlx5_ib_dereg_mr(&mr->ibmr);
	if (ret)
		return ret;

	mutex_lock(&dev->data_direct_lock);
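	/*
	 * A crossed MR that was already revoked has also been removed from
	 * data_direct_mr_list by mlx5_ib_revoke_data_direct_mrs().
	 */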
	if (!dd_crossed_mr->revoked)
		list_del(&dd_crossed_mr->dd_node);

	ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr);
	mutex_unlock(&dev->data_direct_lock);
	return ret;
}

int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);

	if (mr->data_direct)
		return dereg_crossing_data_direct_mr(dev, mr);

	return __mlx5_ib_dereg_mr(ibmr);
}

static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
				   int access_mode, int page_shift)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	/* This is only used from the kernel, so setting the PD is OK. */
	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, log_page_size, page_shift);
	if (access_mode == MLX5_MKC_ACCESS_MODE_PA ||
	    access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
}

static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				  int ndescs, int desc_size, int page_shift,
				  int access_mode, u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int err;

	mr->access_mode = access_mode;
	mr->desc_size = desc_size;
	mr->max_descs = ndescs;

	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
	if (err)
		return err;

	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_free_descs;

	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	return 0;

err_free_descs:
	mlx5_free_priv_descs(mr);
	return err;
}

static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
				u32 max_num_sg, u32 max_num_meta_sg,
				int desc_size, int access_mode)
{
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
	int page_shift = 0;
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->ibmr.device = pd->device;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		page_shift = PAGE_SHIFT;

	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
				     access_mode, in, inlen);
	if (err)
		goto err_free_in;

	mr->umem = NULL;
	kfree(in);

	return mr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
				      inlen);
}

static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
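	/*
	 * SG_GAPS MRs use KLM descriptors, so the mapped segments do not
	 * have to be page aligned.
	 */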
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
}

static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				      int max_num_sg, int max_num_meta_sg,
				      u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 psv_index[2];
	void *mkc;
	int err;

	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
	if (!mr->sig)
		return -ENOMEM;

	/* create mem & wire PSVs */
	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
	if (err)
		goto err_free_sig;

	mr->sig->psv_memory.psv_idx = psv_index[0];
	mr->sig->psv_wire.psv_idx = psv_index[1];

	mr->sig->sig_status_checked = true;
	mr->sig->sig_err_exists = false;
	/* Next UMR, Arm SIGERR */
	++mr->sig->sigerr_count;
	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_klm),
					 MLX5_MKC_ACCESS_MODE_KLMS);
	if (IS_ERR(mr->klm_mr)) {
		err = PTR_ERR(mr->klm_mr);
		goto err_destroy_psv;
	}
	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_mtt),
					 MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr->mtt_mr)) {
		err = PTR_ERR(mr->mtt_mr);
		goto err_free_klm_mr;
	}

	/* Set bsf descriptors for mkey */
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, bsf_en, 1);
	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);

	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
	if (err)
		goto err_free_mtt_mr;

	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			      mr->sig, GFP_KERNEL));
	if (err)
		goto err_free_descs;
	return 0;

err_free_descs:
	destroy_mkey(dev, mr);
	mlx5_free_priv_descs(mr);
err_free_mtt_mr:
	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
	mr->mtt_mr = NULL;
err_free_klm_mr:
	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
	mr->klm_mr = NULL;
err_destroy_psv:
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
			     mr->sig->psv_memory.psv_idx);
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
			     mr->sig->psv_wire.psv_idx);
err_free_sig:
	kfree(mr->sig);

	return err;
}

static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
					enum ib_mr_type mr_type, u32 max_num_sg,
					u32 max_num_meta_sg)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg, 4);
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mr->ibmr.device = pd->device;
	mr->umem = NULL;

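	/*
	 * MEM_REG MRs use MTT descriptors; SG_GAPS and INTEGRITY MRs use
	 * KLM descriptors, the latter together with PSVs and BSF.
	 */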
	switch (mr_type) {
	case IB_MR_TYPE_MEM_REG:
		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_SG_GAPS:
		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_INTEGRITY:
		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
						 max_num_meta_sg, in, inlen);
		break;
	default:
		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
		err = -EINVAL;
	}

	if (err)
		goto err_free_in;

	kfree(in);

	return &mr->ibmr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
}

struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
					 u32 max_num_sg, u32 max_num_meta_sg)
{
	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
				  max_num_meta_sg);
}

int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mw *mw = to_mmw(ibmw);
	unsigned int ndescs;
	u32 *in = NULL;
	void *mkc;
	int err;
	struct mlx5_ib_alloc_mw req = {};
	struct {
		__u32	comp_mask;
		__u32	response_length;
	} resp = {};

	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
	if (err)
		return err;

	if (req.comp_mask || req.reserved1 || req.reserved2)
		return -EOPNOTSUPP;

	if (udata->inlen > sizeof(req) &&
	    !ib_is_udata_cleared(udata, sizeof(req),
				 udata->inlen - sizeof(req)))
		return -EOPNOTSUPP;

	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
	if (err)
		goto free;

	mw->mmkey.type = MLX5_MKEY_MW;
	ibmw->rkey = mw->mmkey.key;
	mw->mmkey.ndescs = ndescs;

	resp.response_length =
		min(offsetofend(typeof(resp), response_length), udata->outlen);
	if (resp.response_length) {
		err = ib_copy_to_udata(udata, &resp, resp.response_length);
		if (err)
			goto free_mkey;
	}

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
		if (err)
			goto free_mkey;
	}

	kfree(in);
	return 0;

free_mkey:
	mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
free:
	kfree(in);
	return err;
}

int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
	struct mlx5_ib_dev *dev = to_mdev(mw->device);
	struct mlx5_ib_mw *mmw = to_mmw(mw);

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
		/*
		 * pagefault_single_data_segment() may be accessing mmw
		 * if the user bound an ODP MR to this MW.
		 */
		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);

	return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
}

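/*
 * Report and clear any pending signature error that the HW recorded for an
 * integrity MR.
 */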
int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
	int ret = 0;

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");
		ret = -EINVAL;
		goto done;
	}

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
		if (!mmr->sig) {
			ret = -EINVAL;
			pr_err("signature status check requested on a non-signature enabled MR\n");
			goto done;
		}

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)
			goto done;

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));
		else {
			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;
		}

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
	}

done:
	return ret;
}

static int
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			int data_sg_nents, unsigned int *data_sg_offset,
			struct scatterlist *meta_sg, int meta_sg_nents,
			unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	unsigned int sg_offset = 0;
	int n = 0;

	mr->meta_length = 0;
	if (data_sg_nents == 1) {
		n++;
		mr->mmkey.ndescs = 1;
		if (data_sg_offset)
			sg_offset = *data_sg_offset;
		mr->data_length = sg_dma_len(data_sg) - sg_offset;
		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
		if (meta_sg_nents == 1) {
			n++;
			mr->meta_ndescs = 1;
			if (meta_sg_offset)
				sg_offset = *meta_sg_offset;
			else
				sg_offset = 0;
			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
		}
		ibmr->length = mr->data_length + mr->meta_length;
	}

	return n;
}

static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p,
		   struct scatterlist *meta_sgl,
		   unsigned short meta_sg_nents,
		   unsigned int *meta_sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i, j = 0;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg) - sg_offset;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	mr->mmkey.ndescs = i;
	mr->data_length = mr->ibmr.length;

	if (meta_sg_nents) {
		sg = meta_sgl;
		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
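		/* Append the metadata segments right after the data KLMs. */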
		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
			if (unlikely(i + j >= mr->max_descs))
				break;
			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
						     sg_offset);
			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
							 sg_offset);
			klms[i + j].key = cpu_to_be32(lkey);
			mr->ibmr.length += sg_dma_len(sg) - sg_offset;

			sg_offset = 0;
		}
		if (meta_sg_offset_p)
			*meta_sg_offset_p = sg_offset;

		mr->meta_ndescs = j;
		mr->meta_length = mr->ibmr.length - mr->data_length;
	}

	return i + j;
}

static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	pi_mr->ibmr.page_size = ibmr->page_size;
	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
			   mlx5_set_page);
	if (n != data_sg_nents)
		return n;

	pi_mr->data_iova = pi_mr->ibmr.iova;
	pi_mr->data_length = pi_mr->ibmr.length;
	pi_mr->ibmr.length = pi_mr->data_length;
	ibmr->length = pi_mr->data_length;

	if (meta_sg_nents) {
		u64 page_mask = ~((u64)ibmr->page_size - 1);
		u64 iova = pi_mr->data_iova;

		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
				    meta_sg_offset, mlx5_set_page_pi);

		pi_mr->meta_length = pi_mr->ibmr.length;
		/*
		 * The PI address for the HW equals the first data page
		 * address (page aligned) + the size of the data pages +
		 * the metadata offset within the first metadata page.
		 */
		pi_mr->pi_iova = (iova & page_mask) +
				 pi_mr->mmkey.ndescs * ibmr->page_size +
				 (pi_mr->ibmr.iova & ~page_mask);
		/*
		 * In order to use one MTT MR for data and metadata, we also
		 * register the gaps between the end of the data and the start
		 * of the metadata (the sig MR will verify that the HW accesses
		 * the right addresses). This mapping is safe because we use an
		 * internal mkey for the registration.
		 */
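		/* Cover the data, the gap and the metadata in one contiguous range. */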
		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
		pi_mr->ibmr.iova = iova;
		ibmr->length += pi_mr->meta_length;
	}

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}

static int
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
			       meta_sg, meta_sg_nents, meta_sg_offset);

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	/* This is a zero-based memory region */
	pi_mr->data_iova = 0;
	pi_mr->ibmr.iova = 0;
	pi_mr->pi_iova = pi_mr->data_length;
	ibmr->length = pi_mr->ibmr.length;

	return n;
}

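/*
 * Map the data and protection-information scatterlists onto an integrity MR.
 * PA, MTT and KLM layouts are tried in decreasing order of efficiency.
 */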
int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = NULL;
	int n;

	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);

	mr->mmkey.ndescs = 0;
	mr->data_length = 0;
	mr->data_iova = 0;
	mr->meta_ndescs = 0;
	mr->pi_iova = 0;
	/*
	 * As a performance optimization, if possible, there is no need to
	 * perform a UMR operation to register the data/metadata buffers.
	 * First try to map the sg lists to PA descriptors with the
	 * local_dma_lkey. Fall back to UMR only in case of a failure.
	 */
	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				    data_sg_offset, meta_sg, meta_sg_nents,
				    meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;
	/*
	 * As a performance optimization, if possible, there is no need to map
	 * the sg lists to KLM descriptors. First try to map the sg lists to
	 * MTT descriptors and fall back to KLM only in case of a failure.
	 * It is more efficient for the HW to work with MTT descriptors
	 * (especially under high load).
	 * Use KLM (indirect access) only if it is mandatory.
	 */
	pi_mr = mr->mtt_mr;
	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;

	pi_mr = mr->klm_mr;
	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (unlikely(n != data_sg_nents + meta_sg_nents))
		return -ENOMEM;

out:
	/* This is a zero-based memory region */
	ibmr->iova = 0;
	mr->pi_mr = pi_mr;
	if (pi_mr)
		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
	else
		ibmr->sig_attrs->meta_length = mr->meta_length;

	return 0;
}

int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int n;

	mr->mmkey.ndescs = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
				   mr->desc_size * mr->max_descs,
				   DMA_TO_DEVICE);

	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
				       NULL);
	else
		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
				   mlx5_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
				      mr->desc_size * mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}