/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 * Copyright (c) 2020, Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */


#include <linux/kref.h>
#include <linux/random.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <rdma/ib_umem_odp.h>
#include "dm.h"
#include "mlx5_ib.h"
#include "umr.h"
#include "data_direct.h"

enum {
	MAX_PENDING_REG_MR = 8,
};

#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
#define MLX5_UMR_ALIGN 2048

static void
create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned long page_size, bool populate,
				     int access_mode);
static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);

static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
					  struct ib_pd *pd)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);

	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, mkc, lr, 1);

	if (acc & IB_ACCESS_RELAXED_ORDERING) {
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
			MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);

		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
		    (MLX5_CAP_GEN(dev->mdev,
				  relaxed_ordering_read_pci_enabled) &&
		     pcie_relaxed_ordering_enabled(dev->mdev->pdev)))
			MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
	}

	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET64(mkc, mkc, start_addr, start_addr);
}

static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
{
	u8 key = atomic_inc_return(&dev->mkey_var);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, mkey_7_0, key);
	*mkey = key;
}
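/*
 * Create an mkey through the FW CREATE_MKEY command. The low byte of the key
 * (mkey_7_0) is taken from a per-device rolling counter so that a recycled
 * mkey index is handed out with a different key value.
 */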
static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
			       struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
{
	int ret;

	assign_mkey_variant(dev, &mkey->key, in);
	ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
	if (!ret)
		init_waitqueue_head(&mkey->wait);

	return ret;
}

static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
{
	struct mlx5_ib_dev *dev = async_create->ent->dev;
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);

	MLX5_SET(create_mkey_in, async_create->in, opcode,
		 MLX5_CMD_OP_CREATE_MKEY);
	assign_mkey_variant(dev, &async_create->mkey, async_create->in);
	return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
				async_create->out, outlen, create_mkey_callback,
				&async_create->cb_work);
}

static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);

static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));

	return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
}

static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
{
	if (status == -ENXIO) /* core driver is not available */
		return;

	mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
	if (status != -EREMOTEIO) /* driver specific failure */
		return;

	/* Failed in FW, print cmd out failure details */
	mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
}

static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey)
{
	unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE;
	struct mlx5_mkeys_page *page;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	if (ent->mkeys_queue.ci >=
	    ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) {
		page = kzalloc(sizeof(*page), GFP_ATOMIC);
		if (!page)
			return -ENOMEM;
		ent->mkeys_queue.num_pages++;
		list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
	} else {
		page = list_last_entry(&ent->mkeys_queue.pages_list,
				       struct mlx5_mkeys_page, list);
	}

	page->mkeys[tmp] = mkey;
	ent->mkeys_queue.ci++;
	return 0;
}

static int pop_mkey_locked(struct mlx5_cache_ent *ent)
{
	unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE;
	struct mlx5_mkeys_page *last_page;
	u32 mkey;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	last_page = list_last_entry(&ent->mkeys_queue.pages_list,
				    struct mlx5_mkeys_page, list);
	mkey = last_page->mkeys[tmp];
	last_page->mkeys[tmp] = 0;
	ent->mkeys_queue.ci--;
	if (ent->mkeys_queue.num_pages > 1 && !tmp) {
		list_del(&last_page->list);
		ent->mkeys_queue.num_pages--;
		kfree(last_page);
	}
	return mkey;
}
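/*
 * Completion handler for asynchronous mkey creation. On failure, set the
 * fill delay and arm the delay timer so the cache backs off; on success,
 * push the new mkey onto the entry's queue and re-evaluate the cache state.
 */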
static void create_mkey_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5r_async_create_mkey *mkey_out =
		container_of(context, struct mlx5r_async_create_mkey, cb_work);
	struct mlx5_cache_ent *ent = mkey_out->ent;
	struct mlx5_ib_dev *dev = ent->dev;
	unsigned long flags;

	if (status) {
		create_mkey_warn(dev, status, mkey_out->out);
		kfree(mkey_out);
		spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
		ent->pending--;
		WRITE_ONCE(dev->fill_delay, 1);
		spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
		mod_timer(&dev->delay_timer, jiffies + HZ);
		return;
	}

	mkey_out->mkey |= mlx5_idx_to_mkey(
		MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
	WRITE_ONCE(dev->cache.last_add, jiffies);

	spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
	push_mkey_locked(ent, mkey_out->mkey);
	ent->pending--;
	/* If we are doing fill_to_high_water then keep going. */
	queue_adjust_cache_locked(ent);
	spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
	kfree(mkey_out);
}

static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
{
	int ret = 0;

	switch (access_mode) {
	case MLX5_MKC_ACCESS_MODE_MTT:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
				   sizeof(struct mlx5_mtt));
		break;
	case MLX5_MKC_ACCESS_MODE_KSM:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
				   sizeof(struct mlx5_klm));
		break;
	default:
		WARN_ON(1);
	}
	return ret;
}

static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
{
	set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0,
				      ent->dev->umrc.pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2,
		 (ent->rb_key.access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);

	MLX5_SET(mkc, mkc, translations_octword_size,
		 get_mkc_octo_size(ent->rb_key.access_mode,
				   ent->rb_key.ndescs));
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
}

/* Asynchronously schedule new MRs to be populated in the cache. */
static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
{
	struct mlx5r_async_create_mkey *async_create;
	void *mkc;
	int err = 0;
	int i;

	for (i = 0; i < num; i++) {
		async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
				       GFP_KERNEL);
		if (!async_create)
			return -ENOMEM;
		mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
				   memory_key_mkey_entry);
		set_cache_mkc(ent, mkc);
		async_create->ent = ent;

		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->pending >= MAX_PENDING_REG_MR) {
			err = -EAGAIN;
			goto free_async_create;
		}
		ent->pending++;
		spin_unlock_irq(&ent->mkeys_queue.lock);

		err = mlx5_ib_create_mkey_cb(async_create);
		if (err) {
			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
			goto err_create_mkey;
		}
	}

	return 0;

err_create_mkey:
	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->pending--;
free_async_create:
	spin_unlock_irq(&ent->mkeys_queue.lock);
	kfree(async_create);
	return err;
}

/* Synchronously create a MR in the cache */
static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
{
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	void *mkc;
	u32 *in;
	int err;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_cache_mkc(ent, mkc);

	err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
	if (err)
		goto free_in;

	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
free_in:
	kfree(in);
	return err;
}
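/*
 * Destroy one cached mkey. The entry lock is dropped around the destroy
 * command since it sleeps, so callers must re-check the queue state after
 * this returns.
 */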
static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
{
	u32 mkey;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	if (!ent->mkeys_queue.ci)
		return;
	mkey = pop_mkey_locked(ent);
	spin_unlock_irq(&ent->mkeys_queue.lock);
	mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
	spin_lock_irq(&ent->mkeys_queue.lock);
}

static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
				bool limit_fill)
	__acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock)
{
	int err;

	lockdep_assert_held(&ent->mkeys_queue.lock);

	while (true) {
		if (limit_fill)
			target = ent->limit * 2;
		if (target == ent->pending + ent->mkeys_queue.ci)
			return 0;
		if (target > ent->pending + ent->mkeys_queue.ci) {
			u32 todo = target - (ent->pending + ent->mkeys_queue.ci);

			spin_unlock_irq(&ent->mkeys_queue.lock);
			err = add_keys(ent, todo);
			if (err == -EAGAIN)
				usleep_range(3000, 5000);
			spin_lock_irq(&ent->mkeys_queue.lock);
			if (err) {
				if (err != -EAGAIN)
					return err;
			} else
				return 0;
		} else {
			remove_cache_mr_locked(ent);
		}
	}
}

static ssize_t size_write(struct file *filp, const char __user *buf,
			  size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 target;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &target);
	if (err)
		return err;

	/*
	 * Target is the new value of total_mrs the user requests, however we
	 * cannot free MRs that are in use. Compute the target value for stored
	 * mkeys.
	 */
	spin_lock_irq(&ent->mkeys_queue.lock);
	if (target < ent->in_use) {
		err = -EINVAL;
		goto err_unlock;
	}
	target = target - ent->in_use;
	if (target < ent->limit || target > ent->limit * 2) {
		err = -EINVAL;
		goto err_unlock;
	}
	err = resize_available_mrs(ent, target, false);
	if (err)
		goto err_unlock;
	spin_unlock_irq(&ent->mkeys_queue.lock);

	return count;

err_unlock:
	spin_unlock_irq(&ent->mkeys_queue.lock);
	return err;
}

static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
			 loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%ld\n",
		       ent->mkeys_queue.ci + ent->in_use);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations size_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= size_write,
	.read	= size_read,
};

static ssize_t limit_write(struct file *filp, const char __user *buf,
			   size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 var;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &var);
	if (err)
		return err;

	/*
	 * Upon set we immediately fill the cache to high water mark implied by
	 * the limit.
	 */
	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->limit = var;
	err = resize_available_mrs(ent, 0, true);
	spin_unlock_irq(&ent->mkeys_queue.lock);
	if (err)
		return err;
	return count;
}

static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
			  loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations limit_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= limit_write,
	.read	= limit_read,
};

static bool someone_adding(struct mlx5_mkey_cache *cache)
{
	struct mlx5_cache_ent *ent;
	struct rb_node *node;
	bool ret;

	mutex_lock(&cache->rb_lock);
	for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		ret = ent->mkeys_queue.ci < ent->limit;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		if (ret) {
			mutex_unlock(&cache->rb_lock);
			return true;
		}
	}
	mutex_unlock(&cache->rb_lock);
	return false;
}

/*
 * Check if the bucket is outside the high/low water mark and schedule an async
 * update. The cache refill has hysteresis, once the low water mark is hit it is
 * refilled up to the high mark.
 */
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
{
	lockdep_assert_held(&ent->mkeys_queue.lock);

	if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
		return;
	if (ent->mkeys_queue.ci < ent->limit) {
		ent->fill_to_high_water = true;
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->fill_to_high_water &&
		   ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) {
		/*
		 * Once we start populating due to hitting a low water mark
		 * continue until we pass the high water mark.
		 */
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->mkeys_queue.ci == 2 * ent->limit) {
		ent->fill_to_high_water = false;
	} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
		/* Queue deletion of excess entries */
		ent->fill_to_high_water = false;
		if (ent->pending)
			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
					   msecs_to_jiffies(1000));
		else
			mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	}
}
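/*
 * Drain and destroy every mkey stored in an entry. Used for temporary
 * (non-persistent) entries and on cache teardown.
 */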
static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
{
	u32 mkey;

	spin_lock_irq(&ent->mkeys_queue.lock);
	while (ent->mkeys_queue.ci) {
		mkey = pop_mkey_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
		mlx5_core_destroy_mkey(dev->mdev, mkey);
		spin_lock_irq(&ent->mkeys_queue.lock);
	}
	ent->tmp_cleanup_scheduled = false;
	spin_unlock_irq(&ent->mkeys_queue.lock);
}

static void __cache_work_func(struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_dev *dev = ent->dev;
	struct mlx5_mkey_cache *cache = &dev->cache;
	int err;

	spin_lock_irq(&ent->mkeys_queue.lock);
	if (ent->disabled)
		goto out;

	if (ent->fill_to_high_water &&
	    ent->mkeys_queue.ci + ent->pending < 2 * ent->limit &&
	    !READ_ONCE(dev->fill_delay)) {
		spin_unlock_irq(&ent->mkeys_queue.lock);
		err = add_keys(ent, 1);
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->disabled)
			goto out;
		if (err) {
			/*
			 * EAGAIN only happens if there are pending MRs, so we
			 * will be rescheduled when storing them. The only
			 * failure path here is ENOMEM.
			 */
			if (err != -EAGAIN) {
				mlx5_ib_warn(
					dev,
					"add keys command failed, err %d\n",
					err);
				queue_delayed_work(cache->wq, &ent->dwork,
						   msecs_to_jiffies(1000));
			}
		}
	} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
		bool need_delay;

		/*
		 * The remove_cache_mr() logic is performed as a garbage
		 * collection task. Such a task is intended to be run when no
		 * other active processes are running.
		 *
		 * The need_resched() will return TRUE if there are user tasks
		 * to be activated in the near future.
		 *
		 * In such a case, we don't execute remove_cache_mr() and
		 * postpone the garbage collection work to try to run in the
		 * next cycle, in order to free CPU resources to other tasks.
		 */
		spin_unlock_irq(&ent->mkeys_queue.lock);
		need_delay = need_resched() || someone_adding(cache) ||
			     !time_after(jiffies,
					 READ_ONCE(cache->last_add) + 300 * HZ);
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->disabled)
			goto out;
		if (need_delay) {
			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
			goto out;
		}
		remove_cache_mr_locked(ent);
		queue_adjust_cache_locked(ent);
	}
out:
	spin_unlock_irq(&ent->mkeys_queue.lock);
}

static void delayed_cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
	/* temp entries are never filled, only cleaned */
	if (ent->is_tmp)
		clean_keys(ent->dev, ent);
	else
		__cache_work_func(ent);
}

static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
			     struct mlx5r_cache_rb_key key2)
{
	int res;

	res = key1.ats - key2.ats;
	if (res)
		return res;

	res = key1.access_mode - key2.access_mode;
	if (res)
		return res;

	res = key1.access_flags - key2.access_flags;
	if (res)
		return res;

	/*
	 * keep ndescs the last in the compare table since the find function
	 * searches for an exact match on all properties and only closest
	 * match in size.
	 */
	return key1.ndescs - key2.ndescs;
}

static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
				 struct mlx5_cache_ent *ent)
{
	struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
	struct mlx5_cache_ent *cur;
	int cmp;

	/* Figure out where to put new node */
	while (*new) {
		cur = rb_entry(*new, struct mlx5_cache_ent, node);
		parent = *new;
		cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
		if (cmp > 0)
			new = &((*new)->rb_left);
		if (cmp < 0)
			new = &((*new)->rb_right);
		if (cmp == 0)
			return -EEXIST;
	}

	/* Add new node and rebalance tree. */
	rb_link_node(&ent->node, parent, new);
	rb_insert_color(&ent->node, &cache->rb_root);

	return 0;
}

static struct mlx5_cache_ent *
mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
			   struct mlx5r_cache_rb_key rb_key)
{
	struct rb_node *node = dev->cache.rb_root.rb_node;
	struct mlx5_cache_ent *cur, *smallest = NULL;
	u64 ndescs_limit;
	int cmp;

	/*
	 * Find the smallest ent with order >= requested_order.
	 */
	while (node) {
		cur = rb_entry(node, struct mlx5_cache_ent, node);
		cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
		if (cmp > 0) {
			smallest = cur;
			node = node->rb_left;
		}
		if (cmp < 0)
			node = node->rb_right;
		if (cmp == 0)
			return cur;
	}

	/*
	 * Limit the usage of mkeys larger than twice the required size while
	 * also allowing the usage of smallest cache entry for small MRs.
	 */
	ndescs_limit = max_t(u64, rb_key.ndescs * 2,
			     MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);

	return (smallest &&
		smallest->rb_key.access_mode == rb_key.access_mode &&
		smallest->rb_key.access_flags == rb_key.access_flags &&
		smallest->rb_key.ats == rb_key.ats &&
		smallest->rb_key.ndescs <= ndescs_limit) ?
		       smallest :
		       NULL;
}
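/*
 * Allocate an MR backed by a cached mkey. If the entry is empty, fall back
 * to creating an mkey synchronously and count it as a cache miss.
 */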
static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
					       struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_mr *mr;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->in_use++;

	if (!ent->mkeys_queue.ci) {
		queue_adjust_cache_locked(ent);
		ent->miss++;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		err = create_cache_mkey(ent, &mr->mmkey.key);
		if (err) {
			spin_lock_irq(&ent->mkeys_queue.lock);
			ent->in_use--;
			spin_unlock_irq(&ent->mkeys_queue.lock);
			kfree(mr);
			return ERR_PTR(err);
		}
	} else {
		mr->mmkey.key = pop_mkey_locked(ent);
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}
	mr->mmkey.cache_ent = ent;
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.rb_key = ent->rb_key;
	mr->mmkey.cacheable = true;
	init_waitqueue_head(&mr->mmkey.wait);
	return mr;
}

static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
					 int access_flags)
{
	int ret = 0;

	if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
	    MLX5_CAP_GEN(dev->mdev, atomic) &&
	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
		ret |= IB_ACCESS_REMOTE_ATOMIC;

	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
		ret |= IB_ACCESS_RELAXED_ORDERING;

	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
	    (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
	     MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled)) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
		ret |= IB_ACCESS_RELAXED_ORDERING;

	return ret;
}

struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
				       int access_flags, int access_mode,
				       int ndescs)
{
	struct mlx5r_cache_rb_key rb_key = {
		.ndescs = ndescs,
		.access_mode = access_mode,
		.access_flags = get_unchangeable_access_flags(dev, access_flags)
	};
	struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);

	if (!ent)
		return ERR_PTR(-EOPNOTSUPP);

	return _mlx5_mr_cache_alloc(dev, ent);
}

static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
{
	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	debugfs_remove_recursive(dev->cache.fs_root);
	dev->cache.fs_root = NULL;
}

static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
					    struct mlx5_cache_ent *ent)
{
	int order = order_base_2(ent->rb_key.ndescs);
	struct dentry *dir;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
		order = MLX5_IMR_KSM_CACHE_ENTRY + 2;

	sprintf(ent->name, "%d", order);
	dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
	debugfs_create_file("size", 0600, dir, ent, &size_fops);
	debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
	debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci);
	debugfs_create_u32("miss", 0600, dir, &ent->miss);
}

static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
{
	struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
	struct mlx5_mkey_cache *cache = &dev->cache;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
}
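/* Timer callback: lift the back-off set after an async mkey creation failure. */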
static void delay_time_func(struct timer_list *t)
{
	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);

	WRITE_ONCE(dev->fill_delay, 0);
}

static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent)
{
	struct mlx5_mkeys_page *page;

	page = kzalloc(sizeof(*page), GFP_KERNEL);
	if (!page)
		return -ENOMEM;
	INIT_LIST_HEAD(&ent->mkeys_queue.pages_list);
	spin_lock_init(&ent->mkeys_queue.lock);
	list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
	ent->mkeys_queue.num_pages++;
	return 0;
}

static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent)
{
	struct mlx5_mkeys_page *page;

	WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1);
	page = list_last_entry(&ent->mkeys_queue.pages_list,
			       struct mlx5_mkeys_page, list);
	list_del(&page->list);
	kfree(page);
}

struct mlx5_cache_ent *
mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
			      struct mlx5r_cache_rb_key rb_key,
			      bool persistent_entry)
{
	struct mlx5_cache_ent *ent;
	int order;
	int ret;

	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
	if (!ent)
		return ERR_PTR(-ENOMEM);

	ret = mlx5r_mkeys_init(ent);
	if (ret)
		goto mkeys_err;
	ent->rb_key = rb_key;
	ent->dev = dev;
	ent->is_tmp = !persistent_entry;

	INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);

	ret = mlx5_cache_ent_insert(&dev->cache, ent);
	if (ret)
		goto ent_insert_err;

	if (persistent_entry) {
		if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
			order = MLX5_IMR_KSM_CACHE_ENTRY;
		else
			order = order_base_2(rb_key.ndescs) - 2;

		if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
		    mlx5r_umr_can_load_pas(dev, 0))
			ent->limit = dev->mdev->profile.mr_cache[order].limit;
		else
			ent->limit = 0;

		mlx5_mkey_cache_debugfs_add_ent(dev, ent);
	}

	return ent;
ent_insert_err:
	mlx5r_mkeys_uninit(ent);
mkeys_err:
	kfree(ent);
	return ERR_PTR(ret);
}

static void mlx5r_destroy_cache_entries(struct mlx5_ib_dev *dev)
{
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5_cache_ent *ent;
	struct rb_node *node;

	mutex_lock(&dev->cache.rb_lock);
	node = rb_first(root);
	while (node) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		node = rb_next(node);
		clean_keys(dev, ent);
		rb_erase(&ent->node, root);
		mlx5r_mkeys_uninit(ent);
		kfree(ent);
	}
	mutex_unlock(&dev->cache.rb_lock);
}

int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5r_cache_rb_key rb_key = {
		.access_mode = MLX5_MKC_ACCESS_MODE_MTT,
	};
	struct mlx5_cache_ent *ent;
	struct rb_node *node;
	int ret;
	int i;

	mutex_init(&dev->slow_path_mutex);
	mutex_init(&dev->cache.rb_lock);
	dev->cache.rb_root = RB_ROOT;
	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
	if (!cache->wq) {
		mlx5_ib_warn(dev, "failed to create work queue\n");
		return -ENOMEM;
	}

	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
	timer_setup(&dev->delay_timer, delay_time_func, 0);
	mlx5_mkey_cache_debugfs_init(dev);
	mutex_lock(&cache->rb_lock);
	for (i = 0; i <= mkey_cache_max_order(dev); i++) {
		rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
		ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
		if (IS_ERR(ent)) {
			ret = PTR_ERR(ent);
			goto err;
		}
	}

	ret = mlx5_odp_init_mkey_cache(dev);
	if (ret)
		goto err;

	mutex_unlock(&cache->rb_lock);
	for (node = rb_first(root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}

	return 0;

err:
	mutex_unlock(&cache->rb_lock);
	mlx5_mkey_cache_debugfs_cleanup(dev);
	mlx5r_destroy_cache_entries(dev);
	destroy_workqueue(cache->wq);
	mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
	return ret;
}

void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
{
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5_cache_ent *ent;
	struct rb_node *node;

	if (!dev->cache.wq)
		return;

	mutex_lock(&dev->cache.rb_lock);
	for (node = rb_first(root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		ent->disabled = true;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		cancel_delayed_work(&ent->dwork);
	}
	mutex_unlock(&dev->cache.rb_lock);

	/*
	 * After all entries are disabled and will not reschedule on WQ,
	 * flush it and all async commands.
	 */
	flush_workqueue(dev->cache.wq);

	mlx5_mkey_cache_debugfs_cleanup(dev);
	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);

	/* At this point all entries are disabled and have no concurrent work. */
	mlx5r_destroy_cache_entries(dev);

	destroy_workqueue(dev->cache.wq);
	del_timer_sync(&dev->delay_timer);
}

struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, length64, 1);
	set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
				      pd);
	MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

static int get_octo_len(u64 addr, u64 len, int page_shift)
{
	u64 page_size = 1ULL << page_shift;
	u64 offset;
	int npages;

	offset = addr & (page_size - 1);
	npages = ALIGN(len + offset, page_size) >> page_shift;
	return (npages + 1) / 2;
}

static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
{
	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		return MKEY_CACHE_LAST_STD_ENTRY;
	return MLX5_MAX_UMR_SHIFT;
}

static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			  u64 length, int access_flags, u64 iova)
{
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->ibmr.length = length;
	mr->ibmr.device = &dev->ib_dev;
	mr->ibmr.iova = iova;
	mr->access_flags = access_flags;
}

static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
						  u64 iova)
{
	/*
	 * The alignment of iova has already been checked upon entering
	 * UVERBS_METHOD_REG_DMABUF_MR
	 */
	umem->iova = iova;
	return PAGE_SIZE;
}
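/*
 * Register an MR using an mkey from the cache when a matching entry exists;
 * otherwise create an uncached mkey synchronously via reg_create(). In both
 * cases the PAS list is expected to be loaded later through UMR.
 */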
static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
					     struct ib_umem *umem, u64 iova,
					     int access_flags, int access_mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5r_cache_rb_key rb_key = {};
	struct mlx5_cache_ent *ent;
	struct mlx5_ib_mr *mr;
	unsigned long page_size;

	if (umem->is_dmabuf)
		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
	else
		page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
	if (WARN_ON(!page_size))
		return ERR_PTR(-EINVAL);

	rb_key.access_mode = access_mode;
	rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
	rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
	rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
	ent = mkey_cache_ent_from_rb_key(dev, rb_key);
	/*
	 * If the MR can't come from the cache then synchronously create an uncached
	 * one.
	 */
	if (!ent) {
		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode);
		mutex_unlock(&dev->slow_path_mutex);
		if (IS_ERR(mr))
			return mr;
		mr->mmkey.rb_key = rb_key;
		mr->mmkey.cacheable = true;
		return mr;
	}

	mr = _mlx5_mr_cache_alloc(dev, ent);
	if (IS_ERR(mr))
		return mr;

	mr->ibmr.pd = pd;
	mr->umem = umem;
	mr->page_shift = order_base_2(page_size);
	set_mr_fields(dev, mr, umem->length, access_flags, iova);

	return mr;
}

static struct ib_mr *
reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags,
			    u32 crossed_lkey)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
	struct mlx5_ib_mr *mr;
	void *mkc;
	int inlen;
	u32 *in;
	int err;

	if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey))
		return ERR_PTR(-EOPNOTSUPP);

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, crossing_target_vhca_id,
		 MLX5_CAP_GEN(dev->mdev, vhca_id));
	MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);

	/* for this crossing mkey IOVA should be 0 and len should be IOVA + len */
	set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
	MLX5_SET64(mkc, mkc, len, iova + length);

	MLX5_SET(mkc, mkc, free, 0);
	MLX5_SET(mkc, mkc, umr_en, 0);
	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_2;

	mr->mmkey.type = MLX5_MKEY_MR;
	set_mr_fields(dev, mr, length, access_flags, iova);
	mr->ibmr.pd = pd;
	kvfree(in);
	mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key);

	return &mr->ibmr;
err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}

/*
 * If ibmr is NULL it will be allocated by reg_create.
 * Else, the given ibmr will be used.
 */
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned long page_size, bool populate,
				     int access_mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr;
	__be64 *pas;
	void *mkc;
	int inlen;
	u32 *in;
	int err;
	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
		(access_mode == MLX5_MKC_ACCESS_MODE_MTT);
	bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);

	if (!page_size)
		return ERR_PTR(-EINVAL);
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->access_flags = access_flags;
	mr->page_shift = order_base_2(page_size);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	if (populate)
		inlen += sizeof(*pas) *
			 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}
	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
	if (populate) {
		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
			err = -EINVAL;
			goto err_2;
		}
		mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
	}

	/* The pg_access bit allows setting the access flags
	 * in the page list submitted with the command.
	 */
	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
				      populate ? pd : dev->umrc.pd);
	/* In case a data direct flow, overwrite the pdn field by its internal kernel PD */
	if (umem->is_dmabuf && ksm_mode)
		MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);

	MLX5_SET(mkc, mkc, free, !populate);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode);
	MLX5_SET(mkc, mkc, umr_en, 1);

	MLX5_SET64(mkc, mkc, len, umem->length);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	if (ksm_mode)
		MLX5_SET(mkc, mkc, translations_octword_size,
			 get_octo_len(iova, umem->length, mr->page_shift) * 2);
	else
		MLX5_SET(mkc, mkc, translations_octword_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
	if (mlx5_umem_needs_ats(dev, umem, access_flags))
		MLX5_SET(mkc, mkc, ma_translation_mode, 1);
	if (populate) {
		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	}

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err) {
		mlx5_ib_warn(dev, "create mkey failed\n");
		goto err_2;
	}
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
	mr->umem = umem;
	set_mr_fields(dev, mr, umem->length, access_flags, iova);
	kvfree(in);

	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);

	return mr;

err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}

static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
				       u64 length, int acc, int mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
	MLX5_SET64(mkc, mkc, len, length);
	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);

	set_mr_fields(dev, mr, length, acc, start_addr);

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

int mlx5_ib_advise_mr(struct ib_pd *pd,
		      enum ib_uverbs_advise_mr_advice advice,
		      u32 flags,
		      struct ib_sge *sg_list,
		      u32 num_sge,
		      struct uverbs_attr_bundle *attrs)
{
	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
		return -EOPNOTSUPP;

	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
					  sg_list, num_sge);
}

struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
				struct ib_dm_mr_attr *attr,
				struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dm *mdm = to_mdm(dm);
	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
	u64 start_addr = mdm->dev_addr + attr->offset;
	int mode;

	switch (mdm->type) {
	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
		start_addr -= pci_resource_start(dev->pdev, 0);
		break;
	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}

	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
				 attr->access_flags, mode);
}

static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
				    u64 iova, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	bool xlt_with_umr;
	int err;

	xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
	if (xlt_with_umr) {
		mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
					MLX5_MKC_ACCESS_MODE_MTT);
	} else {
		unsigned long page_size =
			mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);

		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size,
				true, MLX5_MKC_ACCESS_MODE_MTT);
		mutex_unlock(&dev->slow_path_mutex);
	}
	if (IS_ERR(mr)) {
		ib_umem_release(umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

	if (xlt_with_umr) {
		/*
		 * If the MR was created with reg_create then it will be
		 * configured properly but left disabled. It is safe to go ahead
		 * and configure it again via UMR while enabling it.
		 */
		err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
		if (err) {
			mlx5_ib_dereg_mr(&mr->ibmr, NULL);
			return ERR_PTR(err);
		}
	}
	return &mr->ibmr;
}

static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
					u64 iova, int access_flags,
					struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
	if (err)
		return ERR_PTR(err);
	if (!start && length == U64_MAX) {
		if (iova != 0)
			return ERR_PTR(-EINVAL);
		if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
			return ERR_PTR(-EINVAL);

		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
		if (IS_ERR(mr))
			return ERR_CAST(mr);
		return &mr->ibmr;
	}

	/* ODP requires xlt update via umr to work. */
	if (!mlx5r_umr_can_load_pas(dev, length))
		return ERR_PTR(-EINVAL);

	odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
			      &mlx5_mn_ops);
	if (IS_ERR(odp))
		return ERR_CAST(odp);

	mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
				MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr)) {
		ib_umem_release(&odp->umem);
		return ERR_CAST(mr);
	}
	xa_init(&mr->implicit_children);

	odp->private = mr;
	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
	if (err)
		goto err_dereg_mr;

	err = mlx5_ib_init_odp_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 iova, int access_flags,
				  struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem *umem;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		    start, iova, length, access_flags);

	err = mlx5r_umr_resource_init(dev);
	if (err)
		return ERR_PTR(err);

	if (access_flags & IB_ACCESS_ON_DEMAND)
		return create_user_odp_mr(pd, start, length, iova, access_flags,
					  udata);
	umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
	if (IS_ERR(umem))
		return ERR_CAST(umem);
	return create_real_mr(pd, umem, iova, access_flags);
}

static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
{
	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
	struct mlx5_ib_mr *mr = umem_dmabuf->private;

	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);

	if (!umem_dmabuf->sgt || !mr)
		return;

	mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
}

static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
	.allow_peer2peer = 1,
	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
};

static struct ib_mr *
reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
		   u64 offset, u64 length, u64 virt_addr,
		   int fd, int access_flags, int access_mode)
{
	bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	struct ib_umem_dmabuf *umem_dmabuf;
	int err;

	err = mlx5r_umr_resource_init(dev);
	if (err)
		return ERR_PTR(err);

	if (!pinned_mode)
		umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev,
						 offset, length, fd,
						 access_flags,
						 &mlx5_ib_dmabuf_attach_ops);
	else
		umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev,
				dma_device, offset, length,
				fd, access_flags);

	if (IS_ERR(umem_dmabuf)) {
		mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
			    PTR_ERR(umem_dmabuf));
		return ERR_CAST(umem_dmabuf);
	}

	mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
				access_flags, access_mode);
	if (IS_ERR(mr)) {
		ib_umem_release(&umem_dmabuf->umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
	umem_dmabuf->private = mr;
	if (!pinned_mode) {
		err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
		if (err)
			goto err_dereg_mr;
	} else {
		mr->data_direct = true;
	}

	err = mlx5_ib_init_dmabuf_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	__mlx5_ib_dereg_mr(&mr->ibmr);
	return ERR_PTR(err);
}

static struct ib_mr *
reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
				  u64 length, u64 virt_addr,
				  int fd, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_data_direct_dev *data_direct_dev;
	struct ib_mr *crossing_mr;
	struct ib_mr *crossed_mr;
	int ret = 0;

	/* As of HW behaviour the IOVA must be page aligned in KSM mode */
	if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
		return ERR_PTR(-EOPNOTSUPP);

	mutex_lock(&dev->data_direct_lock);
	data_direct_dev = dev->data_direct_dev;
	if (!data_direct_dev) {
		ret = -EINVAL;
		goto end;
	}

	/* The device's 'data direct mkey' was created without RO flags to
	 * simplify things and allow for a single mkey per device.
	 * Since RO is not a must, mask it out accordingly.
	 */
	access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
	crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
					offset, length, virt_addr, fd,
					access_flags, MLX5_MKC_ACCESS_MODE_KSM);
	if (IS_ERR(crossed_mr)) {
		ret = PTR_ERR(crossed_mr);
		goto end;
	}

	mutex_lock(&dev->slow_path_mutex);
	crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags,
						  crossed_mr->lkey);
	mutex_unlock(&dev->slow_path_mutex);
	if (IS_ERR(crossing_mr)) {
		__mlx5_ib_dereg_mr(crossed_mr);
		ret = PTR_ERR(crossing_mr);
		goto end;
	}

	list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
	to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
	to_mmr(crossing_mr)->data_direct = true;
end:
	mutex_unlock(&dev->data_direct_lock);
	return ret ? ERR_PTR(ret) : crossing_mr;
}

struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
					 u64 length, u64 virt_addr,
					 int fd, int access_flags,
					 struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int mlx5_access_flags = 0;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
		err = uverbs_get_flags32(&mlx5_access_flags, attrs,
					 MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
					 MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT);
		if (err)
			return ERR_PTR(err);
	}

	mlx5_ib_dbg(dev,
		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n",
		    offset, virt_addr, length, fd, access_flags, mlx5_access_flags);

	/* dmabuf requires xlt update via umr to work. */
	if (!mlx5r_umr_can_load_pas(dev, length))
		return ERR_PTR(-EINVAL);

	if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT)
		return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr,
							 fd, access_flags);

	return reg_user_mr_dmabuf(pd, pd->device->dma_device,
				  offset, length, virt_addr,
				  fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT);
}

/*
 * True if the change in access flags can be done via UMR, only some access
 * flags can be updated.
 */
static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
				     unsigned int current_access_flags,
				     unsigned int target_access_flags)
{
	unsigned int diffs = current_access_flags ^ target_access_flags;

	if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
		      IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING |
		      IB_ACCESS_REMOTE_ATOMIC))
		return false;
	return mlx5r_umr_can_reconfig(dev, current_access_flags,
				      target_access_flags);
}

static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
				  struct ib_umem *new_umem,
				  int new_access_flags, u64 iova,
				  unsigned long *page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);

	/* We only track the allocated sizes of MRs from the cache */
	if (!mr->mmkey.cache_ent)
		return false;
	if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
		return false;

	*page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova);
	if (WARN_ON(!*page_size))
		return false;
	return (mr->mmkey.cache_ent->rb_key.ndescs) >=
	       ib_umem_num_dma_blocks(new_umem, *page_size);
}

static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
			 int access_flags, int flags, struct ib_umem *new_umem,
			 u64 iova, unsigned long page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
	struct ib_umem *old_umem = mr->umem;
	int err;

	/*
	 * To keep everything simple the MR is revoked before we start to mess
	 * with it. This ensures the change is atomic relative to any use of
	 * the MR.
	 */
	err = mlx5r_umr_revoke_mr(mr);
	if (err)
		return err;

	if (flags & IB_MR_REREG_PD) {
		mr->ibmr.pd = pd;
		upd_flags |= MLX5_IB_UPD_XLT_PD;
	}
	if (flags & IB_MR_REREG_ACCESS) {
		mr->access_flags = access_flags;
		upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
	}

	mr->ibmr.iova = iova;
	mr->ibmr.length = new_umem->length;
	mr->page_shift = order_base_2(page_size);
	mr->umem = new_umem;
	err = mlx5r_umr_update_mr_pas(mr, upd_flags);
	if (err) {
		/*
		 * The MR is revoked at this point so there is no issue to free
		 * new_umem.
		 */
		mr->umem = old_umem;
		return err;
	}

	atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
	ib_umem_release(old_umem);
	atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
	return 0;
}

struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
				    u64 length, u64 iova, int new_access_flags,
				    struct ib_pd *new_pd,
				    struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct)
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(
		dev,
		"start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		start, iova, length, new_access_flags);

	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
		return ERR_PTR(-EOPNOTSUPP);

	if (!(flags & IB_MR_REREG_ACCESS))
		new_access_flags = mr->access_flags;
	if (!(flags & IB_MR_REREG_PD))
		new_pd = ib_mr->pd;

	if (!(flags & IB_MR_REREG_TRANS)) {
		struct ib_umem *umem;

		/* Fast path for PD/access change */
		if (can_use_umr_rereg_access(dev, mr->access_flags,
					     new_access_flags)) {
			err = mlx5r_umr_rereg_pd_access(mr, new_pd,
							new_access_flags);
			if (err)
				return ERR_PTR(err);
			return NULL;
		}
		/* DM or ODP MR's don't have a normal umem so we can't re-use it */
		if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
			goto recreate;

		/*
		 * Only one active MR can refer to a umem at one time, revoke
		 * the old MR before assigning the umem to the new one.
		 */
		err = mlx5r_umr_revoke_mr(mr);
		if (err)
			return ERR_PTR(err);
		umem = mr->umem;
		mr->umem = NULL;
		atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

		return create_real_mr(new_pd, umem, mr->ibmr.iova,
				      new_access_flags);
	}

	/*
	 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does
	 * but the logic around releasing the umem is different
	 */
	if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
		goto recreate;

	if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
	    can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
		struct ib_umem *new_umem;
		unsigned long page_size;

		new_umem = ib_umem_get(&dev->ib_dev, start, length,
				       new_access_flags);
		if (IS_ERR(new_umem))
			return ERR_CAST(new_umem);

		/* Fast path for PAS change */
		if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
					  &page_size)) {
			err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
					    new_umem, iova, page_size);
			if (err) {
				ib_umem_release(new_umem);
				return ERR_PTR(err);
			}
			return NULL;
		}
		return create_real_mr(new_pd, new_umem, iova, new_access_flags);
	}

	/*
	 * Everything else has no state we can preserve, just create a new MR
	 * from scratch
	 */
recreate:
	return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
				   new_access_flags, udata);
}

static int
mlx5_alloc_priv_descs(struct ib_device *device,
		      struct mlx5_ib_mr *mr,
		      int ndescs,
		      int desc_size)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	struct device *ddev = &dev->mdev->pdev->dev;
	int size = ndescs * desc_size;
	int add_size;
	int ret;

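	/*
	 * kmalloc() only guarantees ARCH_KMALLOC_MINALIGN alignment, so
	 * reserve extra room and align the descriptor array to MLX5_UMR_ALIGN
	 * by hand. The padding is capped so the allocation does not grow past
	 * the larger of MLX5_UMR_ALIGN and the next power of two of the
	 * descriptor size.
	 */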
	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
	if (is_power_of_2(MLX5_UMR_ALIGN) && add_size) {
		int end = max_t(int, MLX5_UMR_ALIGN, roundup_pow_of_two(size));

		add_size = min_t(int, end - size, add_size);
	}

	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
	if (!mr->descs_alloc)
		return -ENOMEM;

	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);

	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, mr->desc_map)) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	kfree(mr->descs_alloc);

	return ret;
}

static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
	if (!mr->umem && !mr->data_direct &&
	    mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) {
		struct ib_device *device = mr->ibmr.device;
		int size = mr->max_descs * mr->desc_size;
		struct mlx5_ib_dev *dev = to_mdev(device);

		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
				 DMA_TO_DEVICE);
		kfree(mr->descs_alloc);
		mr->descs = NULL;
	}
}

static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
				    struct mlx5_ib_mr *mr)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int ret;

	if (mr->mmkey.cache_ent) {
		spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
		goto end;
	}

	mutex_lock(&cache->rb_lock);
	ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
	if (ent) {
		if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
			if (ent->disabled) {
				mutex_unlock(&cache->rb_lock);
				return -EOPNOTSUPP;
			}
			mr->mmkey.cache_ent = ent;
			spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
			mutex_unlock(&cache->rb_lock);
			goto end;
		}
	}

	ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
	mutex_unlock(&cache->rb_lock);
	if (IS_ERR(ent))
		return PTR_ERR(ent);

	mr->mmkey.cache_ent = ent;
	spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);

end:
	ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key);
	spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
	return ret;
}

static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
	int err;

	lockdep_assert_held(&dev->data_direct_lock);
	mr->revoked = true;
	err = mlx5r_umr_revoke_mr(mr);
	if (WARN_ON(err))
		return err;

	ib_umem_dmabuf_revoke(umem_dmabuf);
	return 0;
}

void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
{
	struct mlx5_ib_mr *mr, *next;

	lockdep_assert_held(&dev->data_direct_lock);

	list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) {
		list_del(&mr->dd_node);
		mlx5_ib_revoke_data_direct_mr(mr);
	}
}

static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
	bool is_odp = is_odp_mr(mr);
	bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
			      !to_ib_umem_dmabuf(mr->umem)->pinned;
	bool from_cache = !!ent;
	int ret = 0;

	if (is_odp)
		mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);

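	/*
	 * Non-pinned dma-buf MRs take the dma-buf reservation lock across the
	 * revoke, mirroring the umem_mutex taken above for ODP MRs.
	 */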
	if (is_odp_dma_buf)
		dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL);

	if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
		ent = mr->mmkey.cache_ent;
		/* upon storing into a clean temporary entry, schedule its cleanup */
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (from_cache)
			ent->in_use--;
		if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
			mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
					 msecs_to_jiffies(30 * 1000));
			ent->tmp_cleanup_scheduled = true;
		}
		spin_unlock_irq(&ent->mkeys_queue.lock);
		goto out;
	}

	if (ent) {
		spin_lock_irq(&ent->mkeys_queue.lock);
		ent->in_use--;
		mr->mmkey.cache_ent = NULL;
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}
	ret = destroy_mkey(dev, mr);
out:
	if (is_odp) {
		if (!ret)
			to_ib_umem_odp(mr->umem)->private = NULL;
		mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
	}

	if (is_odp_dma_buf) {
		if (!ret)
			to_ib_umem_dmabuf(mr->umem)->private = NULL;
		dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
	}

	return ret;
}

static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	int rc;

	/*
	 * Any async use of the MR must hold the refcount; once the refcount
	 * goes to zero, no other thread (such as ODP page faults, prefetch, or
	 * any UMR activity) can touch the mkey, so it is safe to destroy it.
	 */
	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    refcount_read(&mr->mmkey.usecount) != 0 &&
	    xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
		mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
		xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			   mr->sig, NULL, GFP_KERNEL);

		if (mr->mtt_mr) {
			rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->mtt_mr = NULL;
		}
		if (mr->klm_mr) {
			rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->klm_mr = NULL;
		}

		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_memory.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
				     mr->sig->psv_memory.psv_idx);
		if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
				     mr->sig->psv_wire.psv_idx);
		kfree(mr->sig);
		mr->sig = NULL;
	}

	/* Stop DMA */
	rc = mlx5_revoke_mr(mr);
	if (rc)
		return rc;

	if (mr->umem) {
		bool is_odp = is_odp_mr(mr);

		if (!is_odp)
			atomic_sub(ib_umem_num_pages(mr->umem),
				   &dev->mdev->priv.reg_pages);
		ib_umem_release(mr->umem);
		if (is_odp)
			mlx5_ib_free_odp_mr(mr);
	}

	if (!mr->mmkey.cache_ent)
		mlx5_free_priv_descs(mr);

	kfree(mr);
	return 0;
}

static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev,
					 struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr;
	int ret;

	ret = __mlx5_ib_dereg_mr(&mr->ibmr);
	if (ret)
		return ret;

	mutex_lock(&dev->data_direct_lock);
	if (!dd_crossed_mr->revoked)
		list_del(&dd_crossed_mr->dd_node);

	ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr);
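	/*
	 * The crossed MR is deregistered under data_direct_lock, so this
	 * cannot race with mlx5_ib_revoke_data_direct_mrs() walking
	 * data_direct_mr_list.
	 */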
	mutex_unlock(&dev->data_direct_lock);
	return ret;
}

int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);

	if (mr->data_direct)
		return dereg_crossing_data_direct_mr(dev, mr);

	return __mlx5_ib_dereg_mr(ibmr);
}

static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
				   int access_mode, int page_shift)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	/* This is only used from the kernel, so setting the PD is OK. */
	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, log_page_size, page_shift);
	if (access_mode == MLX5_MKC_ACCESS_MODE_PA ||
	    access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
}

static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				  int ndescs, int desc_size, int page_shift,
				  int access_mode, u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int err;

	mr->access_mode = access_mode;
	mr->desc_size = desc_size;
	mr->max_descs = ndescs;

	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
	if (err)
		return err;

	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_free_descs;

	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	return 0;

err_free_descs:
	mlx5_free_priv_descs(mr);
	return err;
}

static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
				u32 max_num_sg, u32 max_num_meta_sg,
				int desc_size, int access_mode)
{
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
	int page_shift = 0;
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->ibmr.device = pd->device;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		page_shift = PAGE_SHIFT;

	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
				     access_mode, in, inlen);
	if (err)
		goto err_free_in;

	mr->umem = NULL;
	kfree(in);

	return mr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
				      inlen);
}

static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
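	/*
	 * SG_GAPS MRs use KLM descriptors (page_shift 0), which can describe
	 * scatterlist elements with arbitrary lengths and offsets.
	 */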
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
}

static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				      int max_num_sg, int max_num_meta_sg,
				      u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 psv_index[2];
	void *mkc;
	int err;

	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
	if (!mr->sig)
		return -ENOMEM;

	/* create mem & wire PSVs */
	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
	if (err)
		goto err_free_sig;

	mr->sig->psv_memory.psv_idx = psv_index[0];
	mr->sig->psv_wire.psv_idx = psv_index[1];

	mr->sig->sig_status_checked = true;
	mr->sig->sig_err_exists = false;
	/* Next UMR, Arm SIGERR */
	++mr->sig->sigerr_count;
	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_klm),
					 MLX5_MKC_ACCESS_MODE_KLMS);
	if (IS_ERR(mr->klm_mr)) {
		err = PTR_ERR(mr->klm_mr);
		goto err_destroy_psv;
	}
	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_mtt),
					 MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr->mtt_mr)) {
		err = PTR_ERR(mr->mtt_mr);
		goto err_free_klm_mr;
	}

	/* Set bsf descriptors for mkey */
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, bsf_en, 1);
	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);

	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
	if (err)
		goto err_free_mtt_mr;

	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			      mr->sig, GFP_KERNEL));
	if (err)
		goto err_free_descs;
	return 0;

err_free_descs:
	destroy_mkey(dev, mr);
	mlx5_free_priv_descs(mr);
err_free_mtt_mr:
	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
	mr->mtt_mr = NULL;
err_free_klm_mr:
	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
	mr->klm_mr = NULL;
err_destroy_psv:
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
			     mr->sig->psv_memory.psv_idx);
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
			     mr->sig->psv_wire.psv_idx);
err_free_sig:
	kfree(mr->sig);

	return err;
}

static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
					enum ib_mr_type mr_type, u32 max_num_sg,
					u32 max_num_meta_sg)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg, 4);
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mr->ibmr.device = pd->device;
	mr->umem = NULL;

	switch (mr_type) {
	case IB_MR_TYPE_MEM_REG:
		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_SG_GAPS:
		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_INTEGRITY:
		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
						 max_num_meta_sg, in,
						 inlen);
		break;
	default:
		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
		err = -EINVAL;
	}

	if (err)
		goto err_free_in;

	kfree(in);

	return &mr->ibmr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
}

struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
					 u32 max_num_sg, u32 max_num_meta_sg)
{
	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
				  max_num_meta_sg);
}

int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mw *mw = to_mmw(ibmw);
	unsigned int ndescs;
	u32 *in = NULL;
	void *mkc;
	int err;
	struct mlx5_ib_alloc_mw req = {};
	struct {
		__u32 comp_mask;
		__u32 response_length;
	} resp = {};

	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
	if (err)
		return err;

	if (req.comp_mask || req.reserved1 || req.reserved2)
		return -EOPNOTSUPP;

	if (udata->inlen > sizeof(req) &&
	    !ib_is_udata_cleared(udata, sizeof(req),
				 udata->inlen - sizeof(req)))
		return -EOPNOTSUPP;

	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
	if (err)
		goto free;

	mw->mmkey.type = MLX5_MKEY_MW;
	ibmw->rkey = mw->mmkey.key;
	mw->mmkey.ndescs = ndescs;

	resp.response_length =
		min(offsetofend(typeof(resp), response_length), udata->outlen);
	if (resp.response_length) {
		err = ib_copy_to_udata(udata, &resp, resp.response_length);
		if (err)
			goto free_mkey;
	}

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
		if (err)
			goto free_mkey;
	}

	kfree(in);
	return 0;

free_mkey:
	mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
free:
	kfree(in);
	return err;
}

int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
	struct mlx5_ib_dev *dev = to_mdev(mw->device);
	struct mlx5_ib_mw *mmw = to_mmw(mw);

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
		/*
		 * pagefault_single_data_segment() may be accessing mmw
		 * if the user bound an ODP MR to this MW.
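		 * Waiting for the mkey refcount to drop to zero guarantees
		 * that no pagefault handler still dereferences it by the time
		 * it is destroyed.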
		 */
		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);

	return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
}

int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
	int ret = 0;

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");
		ret = -EINVAL;
		goto done;
	}

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
		if (!mmr->sig) {
			ret = -EINVAL;
			pr_err("signature status check requested on a non-signature enabled MR\n");
			goto done;
		}

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)
			goto done;

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));
		else {
			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;
		}

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
	}

done:
	return ret;
}

static int
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			int data_sg_nents, unsigned int *data_sg_offset,
			struct scatterlist *meta_sg, int meta_sg_nents,
			unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	unsigned int sg_offset = 0;
	int n = 0;

	mr->meta_length = 0;
	if (data_sg_nents == 1) {
		n++;
		mr->mmkey.ndescs = 1;
		if (data_sg_offset)
			sg_offset = *data_sg_offset;
		mr->data_length = sg_dma_len(data_sg) - sg_offset;
		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
		if (meta_sg_nents == 1) {
			n++;
			mr->meta_ndescs = 1;
			if (meta_sg_offset)
				sg_offset = *meta_sg_offset;
			else
				sg_offset = 0;
			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
		}
		ibmr->length = mr->data_length + mr->meta_length;
	}

	return n;
}

static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p,
		   struct scatterlist *meta_sgl,
		   unsigned short meta_sg_nents,
		   unsigned int *meta_sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i, j = 0;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg) - sg_offset;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	mr->mmkey.ndescs = i;
	mr->data_length = mr->ibmr.length;

	if (meta_sg_nents) {
		sg = meta_sgl;
		sg_offset = meta_sg_offset_p ?
			    *meta_sg_offset_p : 0;
		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
			if (unlikely(i + j >= mr->max_descs))
				break;
			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
						     sg_offset);
			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
							 sg_offset);
			klms[i + j].key = cpu_to_be32(lkey);
			mr->ibmr.length += sg_dma_len(sg) - sg_offset;

			sg_offset = 0;
		}
		if (meta_sg_offset_p)
			*meta_sg_offset_p = sg_offset;

		mr->meta_ndescs = j;
		mr->meta_length = mr->ibmr.length - mr->data_length;
	}

	return i + j;
}

static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	pi_mr->ibmr.page_size = ibmr->page_size;
	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
			   mlx5_set_page);
	if (n != data_sg_nents)
		return n;

	pi_mr->data_iova = pi_mr->ibmr.iova;
	pi_mr->data_length = pi_mr->ibmr.length;
	pi_mr->ibmr.length = pi_mr->data_length;
	ibmr->length = pi_mr->data_length;

	if (meta_sg_nents) {
		u64 page_mask = ~((u64)ibmr->page_size - 1);
		u64 iova = pi_mr->data_iova;

		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
				    meta_sg_offset, mlx5_set_page_pi);

		pi_mr->meta_length = pi_mr->ibmr.length;
		/*
		 * PI address for the HW is the offset of the metadata address
		 * relative to the first data page address.
		 * It equals the first data page address plus the size of the
		 * data pages plus the metadata offset within the first
		 * metadata page.
		 */
		pi_mr->pi_iova = (iova & page_mask) +
				 pi_mr->mmkey.ndescs * ibmr->page_size +
				 (pi_mr->ibmr.iova & ~page_mask);
		/*
		 * In order to use one MTT MR for data and metadata, we also
		 * register the gaps between the end of the data and the start
		 * of the metadata (the sig MR will verify that the HW accesses
		 * the right addresses). This mapping is safe because we use an
		 * internal mkey for the registration.
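		 * For example (illustrative numbers): with a 4K page size,
		 * data starting at iova 0x1000 and covering two pages, and
		 * metadata starting at offset 0x100 within its first page,
		 * pi_iova = 0x1000 + 2 * 0x1000 + 0x100 = 0x3100.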
2746 */ 2747 pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova; 2748 pi_mr->ibmr.iova = iova; 2749 ibmr->length += pi_mr->meta_length; 2750 } 2751 2752 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map, 2753 pi_mr->desc_size * pi_mr->max_descs, 2754 DMA_TO_DEVICE); 2755 2756 return n; 2757 } 2758 2759 static int 2760 mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, 2761 int data_sg_nents, unsigned int *data_sg_offset, 2762 struct scatterlist *meta_sg, int meta_sg_nents, 2763 unsigned int *meta_sg_offset) 2764 { 2765 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2766 struct mlx5_ib_mr *pi_mr = mr->klm_mr; 2767 int n; 2768 2769 pi_mr->mmkey.ndescs = 0; 2770 pi_mr->meta_ndescs = 0; 2771 pi_mr->meta_length = 0; 2772 2773 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map, 2774 pi_mr->desc_size * pi_mr->max_descs, 2775 DMA_TO_DEVICE); 2776 2777 n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset, 2778 meta_sg, meta_sg_nents, meta_sg_offset); 2779 2780 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map, 2781 pi_mr->desc_size * pi_mr->max_descs, 2782 DMA_TO_DEVICE); 2783 2784 /* This is zero-based memory region */ 2785 pi_mr->data_iova = 0; 2786 pi_mr->ibmr.iova = 0; 2787 pi_mr->pi_iova = pi_mr->data_length; 2788 ibmr->length = pi_mr->ibmr.length; 2789 2790 return n; 2791 } 2792 2793 int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, 2794 int data_sg_nents, unsigned int *data_sg_offset, 2795 struct scatterlist *meta_sg, int meta_sg_nents, 2796 unsigned int *meta_sg_offset) 2797 { 2798 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2799 struct mlx5_ib_mr *pi_mr = NULL; 2800 int n; 2801 2802 WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY); 2803 2804 mr->mmkey.ndescs = 0; 2805 mr->data_length = 0; 2806 mr->data_iova = 0; 2807 mr->meta_ndescs = 0; 2808 mr->pi_iova = 0; 2809 /* 2810 * As a performance optimization, if possible, there is no need to 2811 * perform UMR operation to register the data/metadata buffers. 2812 * First try to map the sg lists to PA descriptors with local_dma_lkey. 2813 * Fallback to UMR only in case of a failure. 2814 */ 2815 n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents, 2816 data_sg_offset, meta_sg, meta_sg_nents, 2817 meta_sg_offset); 2818 if (n == data_sg_nents + meta_sg_nents) 2819 goto out; 2820 /* 2821 * As a performance optimization, if possible, there is no need to map 2822 * the sg lists to KLM descriptors. First try to map the sg lists to MTT 2823 * descriptors and fallback to KLM only in case of a failure. 2824 * It's more efficient for the HW to work with MTT descriptors 2825 * (especially in high load). 2826 * Use KLM (indirect access) only if it's mandatory. 
2827 */ 2828 pi_mr = mr->mtt_mr; 2829 n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents, 2830 data_sg_offset, meta_sg, meta_sg_nents, 2831 meta_sg_offset); 2832 if (n == data_sg_nents + meta_sg_nents) 2833 goto out; 2834 2835 pi_mr = mr->klm_mr; 2836 n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents, 2837 data_sg_offset, meta_sg, meta_sg_nents, 2838 meta_sg_offset); 2839 if (unlikely(n != data_sg_nents + meta_sg_nents)) 2840 return -ENOMEM; 2841 2842 out: 2843 /* This is zero-based memory region */ 2844 ibmr->iova = 0; 2845 mr->pi_mr = pi_mr; 2846 if (pi_mr) 2847 ibmr->sig_attrs->meta_length = pi_mr->meta_length; 2848 else 2849 ibmr->sig_attrs->meta_length = mr->meta_length; 2850 2851 return 0; 2852 } 2853 2854 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, 2855 unsigned int *sg_offset) 2856 { 2857 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2858 int n; 2859 2860 mr->mmkey.ndescs = 0; 2861 2862 ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map, 2863 mr->desc_size * mr->max_descs, 2864 DMA_TO_DEVICE); 2865 2866 if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) 2867 n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0, 2868 NULL); 2869 else 2870 n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, 2871 mlx5_set_page); 2872 2873 ib_dma_sync_single_for_device(ibmr->device, mr->desc_map, 2874 mr->desc_size * mr->max_descs, 2875 DMA_TO_DEVICE); 2876 2877 return n; 2878 } 2879