/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 * Copyright (c) 2020, Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */


#include <linux/kref.h>
#include <linux/random.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <rdma/ib_umem_odp.h>
#include "dm.h"
#include "mlx5_ib.h"
#include "umr.h"
#include "data_direct.h"

enum {
	MAX_PENDING_REG_MR = 8,
};

#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
#define MLX5_UMR_ALIGN 2048

static void
create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned long page_size, bool populate,
				     int access_mode);
static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);

static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
					  struct ib_pd *pd)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);

	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, mkc, lr, 1);

	if (acc & IB_ACCESS_RELAXED_ORDERING) {
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
			MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);

		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
		    (MLX5_CAP_GEN(dev->mdev,
				  relaxed_ordering_read_pci_enabled) &&
		     pcie_relaxed_ordering_enabled(dev->mdev->pdev)))
			MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
	}

	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET64(mkc, mkc, start_addr, start_addr);
}

static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
{
	u8 key = atomic_inc_return(&dev->mkey_var);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, mkey_7_0, key);
	*mkey = key;
}

static int
mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
		    struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
{
	int ret;

	assign_mkey_variant(dev, &mkey->key, in);
	ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
	if (!ret)
		init_waitqueue_head(&mkey->wait);

	return ret;
}

static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
{
	struct mlx5_ib_dev *dev = async_create->ent->dev;
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);

	MLX5_SET(create_mkey_in, async_create->in, opcode,
		 MLX5_CMD_OP_CREATE_MKEY);
	assign_mkey_variant(dev, &async_create->mkey, async_create->in);
	return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
				async_create->out, outlen, create_mkey_callback,
				&async_create->cb_work);
}

static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);

static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));

	return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
}

static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
{
	if (status == -ENXIO) /* core driver is not available */
		return;

	mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
	if (status != -EREMOTEIO) /* driver specific failure */
		return;

	/* Failed in FW, print cmd out failure details */
	mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
}

static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey)
{
	unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE;
	struct mlx5_mkeys_page *page;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	if (ent->mkeys_queue.ci >=
	    ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) {
		page = kzalloc(sizeof(*page), GFP_ATOMIC);
		if (!page)
			return -ENOMEM;
		ent->mkeys_queue.num_pages++;
		list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
	} else {
		page = list_last_entry(&ent->mkeys_queue.pages_list,
				       struct mlx5_mkeys_page, list);
	}

	page->mkeys[tmp] = mkey;
	ent->mkeys_queue.ci++;
	return 0;
}

static int pop_mkey_locked(struct mlx5_cache_ent *ent)
{
	unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE;
	struct mlx5_mkeys_page *last_page;
	u32 mkey;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	last_page = list_last_entry(&ent->mkeys_queue.pages_list,
				    struct mlx5_mkeys_page, list);
	mkey = last_page->mkeys[tmp];
	last_page->mkeys[tmp] = 0;
	ent->mkeys_queue.ci--;
	if (ent->mkeys_queue.num_pages > 1 && !tmp) {
		list_del(&last_page->list);
		ent->mkeys_queue.num_pages--;
		kfree(last_page);
	}
	return mkey;
}

static void create_mkey_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5r_async_create_mkey *mkey_out =
		container_of(context, struct mlx5r_async_create_mkey, cb_work);
	struct mlx5_cache_ent *ent = mkey_out->ent;
	struct mlx5_ib_dev *dev = ent->dev;
	unsigned long flags;

	if (status) {
		create_mkey_warn(dev, status, mkey_out->out);
		kfree(mkey_out);
		spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
		ent->pending--;
		WRITE_ONCE(dev->fill_delay, 1);
		spin_unlock_irqrestore(&ent->mkeys_queue.lock,
				       flags);
		mod_timer(&dev->delay_timer, jiffies + HZ);
		return;
	}

	mkey_out->mkey |= mlx5_idx_to_mkey(
		MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
	WRITE_ONCE(dev->cache.last_add, jiffies);

	spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
	push_mkey_locked(ent, mkey_out->mkey);
	ent->pending--;
	/* If we are doing fill_to_high_water then keep going. */
	queue_adjust_cache_locked(ent);
	spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
	kfree(mkey_out);
}

static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
{
	int ret = 0;

	switch (access_mode) {
	case MLX5_MKC_ACCESS_MODE_MTT:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
					   sizeof(struct mlx5_mtt));
		break;
	case MLX5_MKC_ACCESS_MODE_KSM:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
					   sizeof(struct mlx5_klm));
		break;
	default:
		WARN_ON(1);
	}
	return ret;
}

static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
{
	set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0,
				      ent->dev->umrc.pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2,
		 (ent->rb_key.access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);

	MLX5_SET(mkc, mkc, translations_octword_size,
		 get_mkc_octo_size(ent->rb_key.access_mode,
				   ent->rb_key.ndescs));
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
}

/* Asynchronously schedule new MRs to be populated in the cache. */
static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
{
	struct mlx5r_async_create_mkey *async_create;
	void *mkc;
	int err = 0;
	int i;

	for (i = 0; i < num; i++) {
		async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
				       GFP_KERNEL);
		if (!async_create)
			return -ENOMEM;
		mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
				   memory_key_mkey_entry);
		set_cache_mkc(ent, mkc);
		async_create->ent = ent;

		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->pending >= MAX_PENDING_REG_MR) {
			err = -EAGAIN;
			goto free_async_create;
		}
		ent->pending++;
		spin_unlock_irq(&ent->mkeys_queue.lock);

		err = mlx5_ib_create_mkey_cb(async_create);
		if (err) {
			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
			goto err_create_mkey;
		}
	}

	return 0;

err_create_mkey:
	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->pending--;
free_async_create:
	spin_unlock_irq(&ent->mkeys_queue.lock);
	kfree(async_create);
	return err;
}

/* Synchronously create a MR in the cache */
static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
{
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	void *mkc;
	u32 *in;
	int err;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_cache_mkc(ent, mkc);

	err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
	if (err)
		goto free_in;

	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
free_in:
	kfree(in);
	return err;
}

static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
{
	u32 mkey;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	if (!ent->mkeys_queue.ci)
		return;
	mkey = pop_mkey_locked(ent);
	spin_unlock_irq(&ent->mkeys_queue.lock);
	mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
	spin_lock_irq(&ent->mkeys_queue.lock);
}

static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
				bool limit_fill)
	__acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock)
{
	int err;

	lockdep_assert_held(&ent->mkeys_queue.lock);

	while (true) {
		if (limit_fill)
			target = ent->limit * 2;
		if (target == ent->pending + ent->mkeys_queue.ci)
			return 0;
		if (target > ent->pending + ent->mkeys_queue.ci) {
			u32 todo = target - (ent->pending + ent->mkeys_queue.ci);

			spin_unlock_irq(&ent->mkeys_queue.lock);
			err = add_keys(ent, todo);
			if (err == -EAGAIN)
				usleep_range(3000, 5000);
			spin_lock_irq(&ent->mkeys_queue.lock);
			if (err) {
				if (err != -EAGAIN)
					return err;
			} else
				return 0;
		} else {
			remove_cache_mr_locked(ent);
		}
	}
}

static ssize_t size_write(struct file *filp, const char __user *buf,
			  size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 target;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &target);
	if (err)
		return err;

	/*
	 * Target is the new value of total_mrs the user requests, however we
	 * cannot free MRs that are in use. Compute the target value for stored
	 * mkeys.
	 */
	spin_lock_irq(&ent->mkeys_queue.lock);
	if (target < ent->in_use) {
		err = -EINVAL;
		goto err_unlock;
	}
	target = target - ent->in_use;
	if (target < ent->limit || target > ent->limit * 2) {
		err = -EINVAL;
		goto err_unlock;
	}
	err = resize_available_mrs(ent, target, false);
	if (err)
		goto err_unlock;
	spin_unlock_irq(&ent->mkeys_queue.lock);

	return count;

err_unlock:
	spin_unlock_irq(&ent->mkeys_queue.lock);
	return err;
}

static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
			 loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%ld\n",
		       ent->mkeys_queue.ci + ent->in_use);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations size_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= size_write,
	.read	= size_read,
};

static ssize_t limit_write(struct file *filp, const char __user *buf,
			   size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 var;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &var);
	if (err)
		return err;

	/*
	 * Upon set we immediately fill the cache to high water mark implied by
	 * the limit.
	 */
	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->limit = var;
	err = resize_available_mrs(ent, 0, true);
	spin_unlock_irq(&ent->mkeys_queue.lock);
	if (err)
		return err;
	return count;
}

static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
			  loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations limit_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= limit_write,
	.read	= limit_read,
};

static bool someone_adding(struct mlx5_mkey_cache *cache)
{
	struct mlx5_cache_ent *ent;
	struct rb_node *node;
	bool ret;

	mutex_lock(&cache->rb_lock);
	for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		ret = ent->mkeys_queue.ci < ent->limit;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		if (ret) {
			mutex_unlock(&cache->rb_lock);
			return true;
		}
	}
	mutex_unlock(&cache->rb_lock);
	return false;
}

/*
 * Check if the bucket is outside the high/low water mark and schedule an async
 * update. The cache refill has hysteresis, once the low water mark is hit it is
 * refilled up to the high mark.
 */
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
{
	lockdep_assert_held(&ent->mkeys_queue.lock);

	if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
		return;
	if (ent->mkeys_queue.ci < ent->limit) {
		ent->fill_to_high_water = true;
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->fill_to_high_water &&
		   ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) {
		/*
		 * Once we start populating due to hitting a low water mark
		 * continue until we pass the high water mark.
		 */
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->mkeys_queue.ci == 2 * ent->limit) {
		ent->fill_to_high_water = false;
	} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
		/* Queue deletion of excess entries */
		ent->fill_to_high_water = false;
		if (ent->pending)
			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
					   secs_to_jiffies(1));
		else
			mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	}
}

static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
{
	u32 mkey;

	spin_lock_irq(&ent->mkeys_queue.lock);
	while (ent->mkeys_queue.ci) {
		mkey = pop_mkey_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
		mlx5_core_destroy_mkey(dev->mdev, mkey);
		spin_lock_irq(&ent->mkeys_queue.lock);
	}
	ent->tmp_cleanup_scheduled = false;
	spin_unlock_irq(&ent->mkeys_queue.lock);
}

static void __cache_work_func(struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_dev *dev = ent->dev;
	struct mlx5_mkey_cache *cache = &dev->cache;
	int err;

	spin_lock_irq(&ent->mkeys_queue.lock);
	if (ent->disabled)
		goto out;

	if (ent->fill_to_high_water &&
	    ent->mkeys_queue.ci + ent->pending < 2 * ent->limit &&
	    !READ_ONCE(dev->fill_delay)) {
		spin_unlock_irq(&ent->mkeys_queue.lock);
		err = add_keys(ent, 1);
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->disabled)
			goto out;
		if (err) {
			/*
			 * EAGAIN only happens if there are pending MRs, so we
			 * will be rescheduled when storing them. The only
			 * failure path here is ENOMEM.
			 */
			if (err != -EAGAIN) {
				mlx5_ib_warn(
					dev,
					"add keys command failed, err %d\n",
					err);
				queue_delayed_work(cache->wq, &ent->dwork,
						   secs_to_jiffies(1));
			}
		}
	} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
		bool need_delay;

		/*
		 * The remove_cache_mr() logic is performed as a garbage
		 * collection task. Such a task is intended to be run when no
		 * other active processes are running.
		 *
		 * need_resched() will return TRUE if there are user tasks
		 * to be activated in the near future.
		 *
		 * In such a case, we don't execute remove_cache_mr() and
		 * postpone the garbage collection work to the next cycle, in
		 * order to free CPU resources for other tasks.
		 */
		spin_unlock_irq(&ent->mkeys_queue.lock);
		need_delay = need_resched() || someone_adding(cache) ||
			     !time_after(jiffies,
					 READ_ONCE(cache->last_add) + 300 * HZ);
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->disabled)
			goto out;
		if (need_delay) {
			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
			goto out;
		}
		remove_cache_mr_locked(ent);
		queue_adjust_cache_locked(ent);
	}
out:
	spin_unlock_irq(&ent->mkeys_queue.lock);
}

static void delayed_cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
	/* temp entries are never filled, only cleaned */
	if (ent->is_tmp)
		clean_keys(ent->dev, ent);
	else
		__cache_work_func(ent);
}

static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
			     struct mlx5r_cache_rb_key key2)
{
	int res;

	res = key1.ats - key2.ats;
	if (res)
		return res;

	res = key1.access_mode - key2.access_mode;
	if (res)
		return res;

	res = key1.access_flags - key2.access_flags;
	if (res)
		return res;

	/*
	 * keep ndescs the last in the compare table since the find function
	 * searches for an exact match on all properties and only closest
	 * match in size.
	 */
	return key1.ndescs - key2.ndescs;
}

static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
				 struct mlx5_cache_ent *ent)
{
	struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
	struct mlx5_cache_ent *cur;
	int cmp;

	/* Figure out where to put new node */
	while (*new) {
		cur = rb_entry(*new, struct mlx5_cache_ent, node);
		parent = *new;
		cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
		if (cmp > 0)
			new = &((*new)->rb_left);
		if (cmp < 0)
			new = &((*new)->rb_right);
		if (cmp == 0)
			return -EEXIST;
	}

	/* Add new node and rebalance tree. */
	rb_link_node(&ent->node, parent, new);
	rb_insert_color(&ent->node, &cache->rb_root);

	return 0;
}

static struct mlx5_cache_ent *
mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
			   struct mlx5r_cache_rb_key rb_key)
{
	struct rb_node *node = dev->cache.rb_root.rb_node;
	struct mlx5_cache_ent *cur, *smallest = NULL;
	u64 ndescs_limit;
	int cmp;

	/*
	 * Find the smallest ent with order >= requested_order.
	 */
	while (node) {
		cur = rb_entry(node, struct mlx5_cache_ent, node);
		cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
		if (cmp > 0) {
			smallest = cur;
			node = node->rb_left;
		}
		if (cmp < 0)
			node = node->rb_right;
		if (cmp == 0)
			return cur;
	}

	/*
	 * Limit the usage of mkeys larger than twice the required size while
	 * also allowing the usage of smallest cache entry for small MRs.
	 */
	ndescs_limit = max_t(u64, rb_key.ndescs * 2,
			     MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);

	return (smallest &&
		smallest->rb_key.access_mode == rb_key.access_mode &&
		smallest->rb_key.access_flags == rb_key.access_flags &&
		smallest->rb_key.ats == rb_key.ats &&
		smallest->rb_key.ndescs <= ndescs_limit) ?
		       smallest :
		       NULL;
}

static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
					       struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_mr *mr;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->in_use++;

	if (!ent->mkeys_queue.ci) {
		queue_adjust_cache_locked(ent);
		ent->miss++;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		err = create_cache_mkey(ent, &mr->mmkey.key);
		if (err) {
			spin_lock_irq(&ent->mkeys_queue.lock);
			ent->in_use--;
			spin_unlock_irq(&ent->mkeys_queue.lock);
			kfree(mr);
			return ERR_PTR(err);
		}
	} else {
		mr->mmkey.key = pop_mkey_locked(ent);
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}
	mr->mmkey.cache_ent = ent;
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.rb_key = ent->rb_key;
	mr->mmkey.cacheable = true;
	init_waitqueue_head(&mr->mmkey.wait);
	return mr;
}

static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
					 int access_flags)
{
	int ret = 0;

	if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
	    MLX5_CAP_GEN(dev->mdev, atomic) &&
	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
		ret |= IB_ACCESS_REMOTE_ATOMIC;

	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
		ret |= IB_ACCESS_RELAXED_ORDERING;

	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
	    (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
	     MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled)) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
		ret |= IB_ACCESS_RELAXED_ORDERING;

	return ret;
}

struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
				       int access_flags, int access_mode,
				       int ndescs)
{
	struct mlx5r_cache_rb_key rb_key = {
		.ndescs = ndescs,
		.access_mode = access_mode,
		.access_flags = get_unchangeable_access_flags(dev, access_flags)
	};
	struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);

	if (!ent)
		return ERR_PTR(-EOPNOTSUPP);

	return _mlx5_mr_cache_alloc(dev, ent);
}

static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
{
	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	debugfs_remove_recursive(dev->cache.fs_root);
	dev->cache.fs_root = NULL;
}

static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
					    struct mlx5_cache_ent *ent)
{
	int order = order_base_2(ent->rb_key.ndescs);
	struct dentry *dir;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
		order = MLX5_IMR_KSM_CACHE_ENTRY + 2;

	sprintf(ent->name, "%d", order);
	dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
	debugfs_create_file("size", 0600, dir, ent, &size_fops);
	debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
	debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci);
	debugfs_create_u32("miss", 0600, dir, &ent->miss);
}

static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
{
	struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
	struct mlx5_mkey_cache *cache = &dev->cache;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	cache->fs_root
		= debugfs_create_dir("mr_cache", dbg_root);
}

static void delay_time_func(struct timer_list *t)
{
	struct mlx5_ib_dev *dev = timer_container_of(dev, t, delay_timer);

	WRITE_ONCE(dev->fill_delay, 0);
}

static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent)
{
	struct mlx5_mkeys_page *page;

	page = kzalloc(sizeof(*page), GFP_KERNEL);
	if (!page)
		return -ENOMEM;
	INIT_LIST_HEAD(&ent->mkeys_queue.pages_list);
	spin_lock_init(&ent->mkeys_queue.lock);
	list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
	ent->mkeys_queue.num_pages++;
	return 0;
}

static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent)
{
	struct mlx5_mkeys_page *page;

	WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1);
	page = list_last_entry(&ent->mkeys_queue.pages_list,
			       struct mlx5_mkeys_page, list);
	list_del(&page->list);
	kfree(page);
}

struct mlx5_cache_ent *
mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
			      struct mlx5r_cache_rb_key rb_key,
			      bool persistent_entry)
{
	struct mlx5_cache_ent *ent;
	int order;
	int ret;

	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
	if (!ent)
		return ERR_PTR(-ENOMEM);

	ret = mlx5r_mkeys_init(ent);
	if (ret)
		goto mkeys_err;
	ent->rb_key = rb_key;
	ent->dev = dev;
	ent->is_tmp = !persistent_entry;

	INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);

	ret = mlx5_cache_ent_insert(&dev->cache, ent);
	if (ret)
		goto ent_insert_err;

	if (persistent_entry) {
		if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
			order = MLX5_IMR_KSM_CACHE_ENTRY;
		else
			order = order_base_2(rb_key.ndescs) - 2;

		if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
		    mlx5r_umr_can_load_pas(dev, 0))
			ent->limit = dev->mdev->profile.mr_cache[order].limit;
		else
			ent->limit = 0;

		mlx5_mkey_cache_debugfs_add_ent(dev, ent);
	}

	return ent;
ent_insert_err:
	mlx5r_mkeys_uninit(ent);
mkeys_err:
	kfree(ent);
	return ERR_PTR(ret);
}

static void mlx5r_destroy_cache_entries(struct mlx5_ib_dev *dev)
{
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5_cache_ent *ent;
	struct rb_node *node;

	mutex_lock(&dev->cache.rb_lock);
	node = rb_first(root);
	while (node) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		node = rb_next(node);
		clean_keys(dev, ent);
		rb_erase(&ent->node, root);
		mlx5r_mkeys_uninit(ent);
		kfree(ent);
	}
	mutex_unlock(&dev->cache.rb_lock);
}

int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5r_cache_rb_key rb_key = {
		.access_mode = MLX5_MKC_ACCESS_MODE_MTT,
	};
	struct mlx5_cache_ent *ent;
	struct rb_node *node;
	int ret;
	int i;

	mutex_init(&dev->slow_path_mutex);
	mutex_init(&dev->cache.rb_lock);
	dev->cache.rb_root = RB_ROOT;
	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
	if (!cache->wq) {
		mlx5_ib_warn(dev, "failed to create work queue\n");
		return -ENOMEM;
	}

	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
	timer_setup(&dev->delay_timer, delay_time_func, 0);
	mlx5_mkey_cache_debugfs_init(dev);
	mutex_lock(&cache->rb_lock);
	for (i = 0; i <= mkey_cache_max_order(dev);
	     i++) {
		rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
		ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
		if (IS_ERR(ent)) {
			ret = PTR_ERR(ent);
			goto err;
		}
	}

	ret = mlx5_odp_init_mkey_cache(dev);
	if (ret)
		goto err;

	mutex_unlock(&cache->rb_lock);
	for (node = rb_first(root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}

	return 0;

err:
	mutex_unlock(&cache->rb_lock);
	mlx5_mkey_cache_debugfs_cleanup(dev);
	mlx5r_destroy_cache_entries(dev);
	destroy_workqueue(cache->wq);
	mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
	return ret;
}

void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
{
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5_cache_ent *ent;
	struct rb_node *node;

	if (!dev->cache.wq)
		return;

	mutex_lock(&dev->cache.rb_lock);
	for (node = rb_first(root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		ent->disabled = true;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		cancel_delayed_work(&ent->dwork);
	}
	mutex_unlock(&dev->cache.rb_lock);

	/*
	 * After all entries are disabled and will not reschedule on WQ,
	 * flush it and all async commands.
	 */
	flush_workqueue(dev->cache.wq);

	mlx5_mkey_cache_debugfs_cleanup(dev);
	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);

	/* At this point all entries are disabled and have no concurrent work. */
	mlx5r_destroy_cache_entries(dev);

	destroy_workqueue(dev->cache.wq);
	timer_delete_sync(&dev->delay_timer);
}

struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, length64, 1);
	set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
				      pd);
	MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

static int get_octo_len(u64 addr, u64 len, int page_shift)
{
	u64 page_size = 1ULL << page_shift;
	u64 offset;
	int npages;

	offset = addr & (page_size - 1);
	npages = ALIGN(len + offset, page_size) >> page_shift;
	return (npages + 1) / 2;
}

static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
{
	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		return MKEY_CACHE_LAST_STD_ENTRY;
	return MLX5_MAX_UMR_SHIFT;
}

static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			  u64 length, int access_flags, u64 iova)
{
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->ibmr.length = length;
	mr->ibmr.device = &dev->ib_dev;
	mr->ibmr.iova = iova;
	mr->access_flags = access_flags;
}

static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
						  u64 iova)
{
	/*
	 * The alignment of iova has already been checked upon entering
	 * UVERBS_METHOD_REG_DMABUF_MR
	 */
	umem->iova = iova;
	return PAGE_SIZE;
}

static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
					     struct ib_umem *umem, u64 iova,
					     int access_flags, int access_mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5r_cache_rb_key rb_key = {};
	struct mlx5_cache_ent *ent;
	struct mlx5_ib_mr *mr;
	unsigned long page_size;

	if (umem->is_dmabuf)
		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
	else
		page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova,
							 access_mode);
	if (WARN_ON(!page_size))
		return ERR_PTR(-EINVAL);

	rb_key.access_mode = access_mode;
	rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
	rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
	rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
	ent = mkey_cache_ent_from_rb_key(dev, rb_key);
	/*
	 * If the MR can't come from the cache then synchronously create an
	 * uncached one.
	 */
	if (!ent) {
		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size, false,
				access_mode);
		mutex_unlock(&dev->slow_path_mutex);
		if (IS_ERR(mr))
			return mr;
		mr->mmkey.rb_key = rb_key;
		mr->mmkey.cacheable = true;
		return mr;
	}

	mr = _mlx5_mr_cache_alloc(dev, ent);
	if (IS_ERR(mr))
		return mr;

	mr->ibmr.pd = pd;
	mr->umem = umem;
	mr->page_shift = order_base_2(page_size);
	set_mr_fields(dev, mr, umem->length, access_flags, iova);

	return mr;
}

static struct ib_mr *
reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length,
			    int access_flags, u32 crossed_lkey)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
	struct mlx5_ib_mr *mr;
	void *mkc;
	int inlen;
	u32 *in;
	int err;

	if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey))
		return ERR_PTR(-EOPNOTSUPP);

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, crossing_target_vhca_id,
		 MLX5_CAP_GEN(dev->mdev, vhca_id));
	MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);

	/* for this crossing mkey IOVA should be 0 and len should be IOVA + len */
	set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
	MLX5_SET64(mkc, mkc, len, iova + length);

	MLX5_SET(mkc, mkc, free, 0);
	MLX5_SET(mkc, mkc, umr_en, 0);
	err =
		mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_2;

	mr->mmkey.type = MLX5_MKEY_MR;
	set_mr_fields(dev, mr, length, access_flags, iova);
	mr->ibmr.pd = pd;
	kvfree(in);
	mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key);

	return &mr->ibmr;
err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}

/*
 * If ibmr is NULL it will be allocated by reg_create.
 * Else, the given ibmr will be used.
 */
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned long page_size, bool populate,
				     int access_mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr;
	__be64 *pas;
	void *mkc;
	int inlen;
	u32 *in;
	int err;
	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
		      (access_mode == MLX5_MKC_ACCESS_MODE_MTT);
	bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);

	if (!page_size)
		return ERR_PTR(-EINVAL);
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->access_flags = access_flags;
	mr->page_shift = order_base_2(page_size);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	if (populate)
		inlen += sizeof(*pas) *
			 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}
	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
	if (populate) {
		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
			err = -EINVAL;
			goto err_2;
		}
		mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
	}

	/* The pg_access bit allows setting the access flags
	 * in the page list submitted with the command.
	 */
	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
				      populate ?
				      pd : dev->umrc.pd);
	/* In case of a data direct flow, overwrite the pdn field with its internal kernel PD */
	if (umem->is_dmabuf && ksm_mode)
		MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);

	MLX5_SET(mkc, mkc, free, !populate);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode);
	MLX5_SET(mkc, mkc, umr_en, 1);

	MLX5_SET64(mkc, mkc, len, umem->length);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	if (ksm_mode)
		MLX5_SET(mkc, mkc, translations_octword_size,
			 get_octo_len(iova, umem->length, mr->page_shift) * 2);
	else
		MLX5_SET(mkc, mkc, translations_octword_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
	if (mlx5_umem_needs_ats(dev, umem, access_flags))
		MLX5_SET(mkc, mkc, ma_translation_mode, 1);
	if (populate) {
		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	}

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err) {
		mlx5_ib_warn(dev, "create mkey failed\n");
		goto err_2;
	}
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
	mr->umem = umem;
	set_mr_fields(dev, mr, umem->length, access_flags, iova);
	kvfree(in);

	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);

	return mr;

err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}

static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
				       u64 length, int acc, int mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
	MLX5_SET64(mkc, mkc, len, length);
	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);

	set_mr_fields(dev, mr, length, acc, start_addr);

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

int mlx5_ib_advise_mr(struct ib_pd *pd,
		      enum ib_uverbs_advise_mr_advice advice,
		      u32 flags,
		      struct ib_sge *sg_list,
		      u32 num_sge,
		      struct uverbs_attr_bundle *attrs)
{
	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
		return -EOPNOTSUPP;

	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
					  sg_list, num_sge);
}

struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
				struct ib_dm_mr_attr *attr,
				struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dm *mdm = to_mdm(dm);
	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
	u64 start_addr = mdm->dev_addr + attr->offset;
	int mode;

	switch (mdm->type) {
	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
		if
		   (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
		start_addr -= pci_resource_start(dev->pdev, 0);
		break;
	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}

	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
				 attr->access_flags, mode);
}

static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
				    u64 iova, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	bool xlt_with_umr;
	int err;

	xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
	if (xlt_with_umr) {
		mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
					MLX5_MKC_ACCESS_MODE_MTT);
	} else {
		unsigned long page_size = mlx5_umem_mkc_find_best_pgsz(
			dev, umem, iova, MLX5_MKC_ACCESS_MODE_MTT);

		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size,
				true, MLX5_MKC_ACCESS_MODE_MTT);
		mutex_unlock(&dev->slow_path_mutex);
	}
	if (IS_ERR(mr)) {
		ib_umem_release(umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

	if (xlt_with_umr) {
		/*
		 * If the MR was created with reg_create then it will be
		 * configured properly but left disabled. It is safe to go ahead
		 * and configure it again via UMR while enabling it.
		 */
		err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
		if (err) {
			mlx5_ib_dereg_mr(&mr->ibmr, NULL);
			return ERR_PTR(err);
		}
	}
	return &mr->ibmr;
}

static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
					u64 iova, int access_flags,
					struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
	if (err)
		return ERR_PTR(err);
	if (!start && length == U64_MAX) {
		if (iova != 0)
			return ERR_PTR(-EINVAL);
		if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
			return ERR_PTR(-EINVAL);

		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
		if (IS_ERR(mr))
			return ERR_CAST(mr);
		return &mr->ibmr;
	}

	/* ODP requires xlt update via umr to work.
*/ 1499 if (!mlx5r_umr_can_load_pas(dev, length)) 1500 return ERR_PTR(-EINVAL); 1501 1502 odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags, 1503 &mlx5_mn_ops); 1504 if (IS_ERR(odp)) 1505 return ERR_CAST(odp); 1506 1507 mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags, 1508 MLX5_MKC_ACCESS_MODE_MTT); 1509 if (IS_ERR(mr)) { 1510 ib_umem_release(&odp->umem); 1511 return ERR_CAST(mr); 1512 } 1513 xa_init(&mr->implicit_children); 1514 1515 odp->private = mr; 1516 err = mlx5r_store_odp_mkey(dev, &mr->mmkey); 1517 if (err) 1518 goto err_dereg_mr; 1519 1520 err = mlx5_ib_init_odp_mr(mr); 1521 if (err) 1522 goto err_dereg_mr; 1523 return &mr->ibmr; 1524 1525 err_dereg_mr: 1526 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1527 return ERR_PTR(err); 1528 } 1529 1530 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, 1531 u64 iova, int access_flags, 1532 struct ib_dmah *dmah, 1533 struct ib_udata *udata) 1534 { 1535 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1536 struct ib_umem *umem; 1537 int err; 1538 1539 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || dmah) 1540 return ERR_PTR(-EOPNOTSUPP); 1541 1542 mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1543 start, iova, length, access_flags); 1544 1545 err = mlx5r_umr_resource_init(dev); 1546 if (err) 1547 return ERR_PTR(err); 1548 1549 if (access_flags & IB_ACCESS_ON_DEMAND) 1550 return create_user_odp_mr(pd, start, length, iova, access_flags, 1551 udata); 1552 umem = ib_umem_get(&dev->ib_dev, start, length, access_flags); 1553 if (IS_ERR(umem)) 1554 return ERR_CAST(umem); 1555 return create_real_mr(pd, umem, iova, access_flags); 1556 } 1557 1558 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) 1559 { 1560 struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; 1561 struct mlx5_ib_mr *mr = umem_dmabuf->private; 1562 1563 dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); 1564 1565 if (!umem_dmabuf->sgt || !mr) 1566 return; 1567 1568 mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); 1569 ib_umem_dmabuf_unmap_pages(umem_dmabuf); 1570 } 1571 1572 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { 1573 .allow_peer2peer = 1, 1574 .move_notify = mlx5_ib_dmabuf_invalidate_cb, 1575 }; 1576 1577 static struct ib_mr * 1578 reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, 1579 u64 offset, u64 length, u64 virt_addr, 1580 int fd, int access_flags, int access_mode) 1581 { 1582 bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); 1583 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1584 struct mlx5_ib_mr *mr = NULL; 1585 struct ib_umem_dmabuf *umem_dmabuf; 1586 int err; 1587 1588 err = mlx5r_umr_resource_init(dev); 1589 if (err) 1590 return ERR_PTR(err); 1591 1592 if (!pinned_mode) 1593 umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, 1594 offset, length, fd, 1595 access_flags, 1596 &mlx5_ib_dmabuf_attach_ops); 1597 else 1598 umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev, 1599 dma_device, offset, length, 1600 fd, access_flags); 1601 1602 if (IS_ERR(umem_dmabuf)) { 1603 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", 1604 PTR_ERR(umem_dmabuf)); 1605 return ERR_CAST(umem_dmabuf); 1606 } 1607 1608 mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, 1609 access_flags, access_mode); 1610 if (IS_ERR(mr)) { 1611 ib_umem_release(&umem_dmabuf->umem); 1612 return ERR_CAST(mr); 1613 } 1614 1615 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); 1616 1617 atomic_add(ib_umem_num_pages(mr->umem), 
		   &dev->mdev->priv.reg_pages);
	umem_dmabuf->private = mr;
	if (!pinned_mode) {
		err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
		if (err)
			goto err_dereg_mr;
	} else {
		mr->data_direct = true;
	}

	err = mlx5_ib_init_dmabuf_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	__mlx5_ib_dereg_mr(&mr->ibmr);
	return ERR_PTR(err);
}

static struct ib_mr *
reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
				  u64 length, u64 virt_addr,
				  int fd, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_data_direct_dev *data_direct_dev;
	struct ib_mr *crossing_mr;
	struct ib_mr *crossed_mr;
	int ret = 0;

	/* As of HW behaviour the IOVA must be page aligned in KSM mode */
	if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
		return ERR_PTR(-EOPNOTSUPP);

	mutex_lock(&dev->data_direct_lock);
	data_direct_dev = dev->data_direct_dev;
	if (!data_direct_dev) {
		ret = -EINVAL;
		goto end;
	}

	/* The device's 'data direct mkey' was created without RO flags to
	 * simplify things and allow for a single mkey per device.
	 * Since RO is not a must, mask it out accordingly.
	 */
	access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
	crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
					offset, length, virt_addr, fd,
					access_flags, MLX5_MKC_ACCESS_MODE_KSM);
	if (IS_ERR(crossed_mr)) {
		ret = PTR_ERR(crossed_mr);
		goto end;
	}

	mutex_lock(&dev->slow_path_mutex);
	crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length,
						  access_flags,
						  crossed_mr->lkey);
	mutex_unlock(&dev->slow_path_mutex);
	if (IS_ERR(crossing_mr)) {
		__mlx5_ib_dereg_mr(crossed_mr);
		ret = PTR_ERR(crossing_mr);
		goto end;
	}

	list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
	to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
	to_mmr(crossing_mr)->data_direct = true;
end:
	mutex_unlock(&dev->data_direct_lock);
	return ret ? ERR_PTR(ret) : crossing_mr;
}

struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
					 u64 length, u64 virt_addr,
					 int fd, int access_flags,
					 struct ib_dmah *dmah,
					 struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int mlx5_access_flags = 0;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) || dmah)
		return ERR_PTR(-EOPNOTSUPP);

	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
		err = uverbs_get_flags32(&mlx5_access_flags, attrs,
					 MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
					 MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT);
		if (err)
			return ERR_PTR(err);
	}

	mlx5_ib_dbg(dev,
		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n",
		    offset, virt_addr, length, fd, access_flags, mlx5_access_flags);

	/* dmabuf requires xlt update via umr to work.
*/ 1717 if (!mlx5r_umr_can_load_pas(dev, length)) 1718 return ERR_PTR(-EINVAL); 1719 1720 if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT) 1721 return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr, 1722 fd, access_flags); 1723 1724 return reg_user_mr_dmabuf(pd, pd->device->dma_device, 1725 offset, length, virt_addr, 1726 fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT); 1727 } 1728 1729 /* 1730 * True if the change in access flags can be done via UMR, only some access 1731 * flags can be updated. 1732 */ 1733 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev, 1734 unsigned int current_access_flags, 1735 unsigned int target_access_flags) 1736 { 1737 unsigned int diffs = current_access_flags ^ target_access_flags; 1738 1739 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | 1740 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING | 1741 IB_ACCESS_REMOTE_ATOMIC)) 1742 return false; 1743 return mlx5r_umr_can_reconfig(dev, current_access_flags, 1744 target_access_flags); 1745 } 1746 1747 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, 1748 struct ib_umem *new_umem, 1749 int new_access_flags, u64 iova, 1750 unsigned long *page_size) 1751 { 1752 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1753 1754 /* We only track the allocated sizes of MRs from the cache */ 1755 if (!mr->mmkey.cache_ent) 1756 return false; 1757 if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) 1758 return false; 1759 1760 *page_size = mlx5_umem_mkc_find_best_pgsz( 1761 dev, new_umem, iova, mr->mmkey.cache_ent->rb_key.access_mode); 1762 if (WARN_ON(!*page_size)) 1763 return false; 1764 return (mr->mmkey.cache_ent->rb_key.ndescs) >= 1765 ib_umem_num_dma_blocks(new_umem, *page_size); 1766 } 1767 1768 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd, 1769 int access_flags, int flags, struct ib_umem *new_umem, 1770 u64 iova, unsigned long page_size) 1771 { 1772 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1773 int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE; 1774 struct ib_umem *old_umem = mr->umem; 1775 int err; 1776 1777 /* 1778 * To keep everything simple the MR is revoked before we start to mess 1779 * with it. This ensure the change is atomic relative to any use of the 1780 * MR. 1781 */ 1782 err = mlx5r_umr_revoke_mr(mr); 1783 if (err) 1784 return err; 1785 1786 if (flags & IB_MR_REREG_PD) { 1787 mr->ibmr.pd = pd; 1788 upd_flags |= MLX5_IB_UPD_XLT_PD; 1789 } 1790 if (flags & IB_MR_REREG_ACCESS) { 1791 mr->access_flags = access_flags; 1792 upd_flags |= MLX5_IB_UPD_XLT_ACCESS; 1793 } 1794 1795 mr->ibmr.iova = iova; 1796 mr->ibmr.length = new_umem->length; 1797 mr->page_shift = order_base_2(page_size); 1798 mr->umem = new_umem; 1799 err = mlx5r_umr_update_mr_pas(mr, upd_flags); 1800 if (err) { 1801 /* 1802 * The MR is revoked at this point so there is no issue to free 1803 * new_umem. 
		 */
		mr->umem = old_umem;
		return err;
	}

	atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
	ib_umem_release(old_umem);
	atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
	return 0;
}

struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
				    u64 length, u64 iova, int new_access_flags,
				    struct ib_pd *new_pd,
				    struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct)
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(
		dev,
		"start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		start, iova, length, new_access_flags);

	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
		return ERR_PTR(-EOPNOTSUPP);

	if (!(flags & IB_MR_REREG_ACCESS))
		new_access_flags = mr->access_flags;
	if (!(flags & IB_MR_REREG_PD))
		new_pd = ib_mr->pd;

	if (!(flags & IB_MR_REREG_TRANS)) {
		struct ib_umem *umem;

		/* Fast path for PD/access change */
		if (can_use_umr_rereg_access(dev, mr->access_flags,
					     new_access_flags)) {
			err = mlx5r_umr_rereg_pd_access(mr, new_pd,
							new_access_flags);
			if (err)
				return ERR_PTR(err);
			return NULL;
		}
		/* DM or ODP MRs don't have a normal umem so we can't re-use it */
		if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
			goto recreate;

		/*
		 * Only one active MR can refer to a umem at one time, revoke
		 * the old MR before assigning the umem to the new one.
		 */
		err = mlx5r_umr_revoke_mr(mr);
		if (err)
			return ERR_PTR(err);
		umem = mr->umem;
		mr->umem = NULL;
		atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

		return create_real_mr(new_pd, umem, mr->ibmr.iova,
				      new_access_flags);
	}

	/*
	 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does
	 * but the logic around releasing the umem is different
	 */
	if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
		goto recreate;

	if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
	    can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
		struct ib_umem *new_umem;
		unsigned long page_size;

		new_umem = ib_umem_get(&dev->ib_dev, start, length,
				       new_access_flags);
		if (IS_ERR(new_umem))
			return ERR_CAST(new_umem);

		/* Fast path for PAS change */
		if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
					  &page_size)) {
			err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
					    new_umem, iova, page_size);
			if (err) {
				ib_umem_release(new_umem);
				return ERR_PTR(err);
			}
			return NULL;
		}
		return create_real_mr(new_pd, new_umem, iova, new_access_flags);
	}

	/*
	 * Everything else has no state we can preserve, just create a new MR
	 * from scratch
	 */
recreate:
	return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
				   new_access_flags, NULL, udata);
}

static int
mlx5_alloc_priv_descs(struct ib_device *device,
		      struct mlx5_ib_mr *mr,
		      int ndescs,
		      int desc_size)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	struct device *ddev = &dev->mdev->pdev->dev;
	int size = ndescs * desc_size;
	int add_size;
	int ret;

	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
	if (is_power_of_2(MLX5_UMR_ALIGN) && add_size) {
		int end = max_t(int, MLX5_UMR_ALIGN, roundup_pow_of_two(size));

		add_size = min_t(int, end - size, add_size);
	}

	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
	if (!mr->descs_alloc)
		return -ENOMEM;

	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);

	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, mr->desc_map)) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	kfree(mr->descs_alloc);

	return ret;
}

static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
	if (!mr->umem && !mr->data_direct &&
	    mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) {
		struct ib_device *device = mr->ibmr.device;
		int size = mr->max_descs * mr->desc_size;
		struct mlx5_ib_dev *dev = to_mdev(device);

		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
				 DMA_TO_DEVICE);
		kfree(mr->descs_alloc);
		mr->descs = NULL;
	}
}

static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
				    struct mlx5_ib_mr *mr)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int ret;

	if (mr->mmkey.cache_ent) {
		spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
		goto end;
	}

	mutex_lock(&cache->rb_lock);
	ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
	if (ent) {
		if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
			if (ent->disabled) {
				mutex_unlock(&cache->rb_lock);
				return -EOPNOTSUPP;
			}
			mr->mmkey.cache_ent = ent;
			spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
			mutex_unlock(&cache->rb_lock);
			goto end;
		}
	}

	ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
	mutex_unlock(&cache->rb_lock);
	if (IS_ERR(ent))
		return PTR_ERR(ent);

	mr->mmkey.cache_ent = ent;
	spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);

end:
	ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key);
	spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
	return ret;
}

static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
	int err;

	lockdep_assert_held(&dev->data_direct_lock);
	mr->revoked = true;
	err = mlx5r_umr_revoke_mr(mr);
	if (WARN_ON(err))
		return err;

	ib_umem_dmabuf_revoke(umem_dmabuf);
	return 0;
}

void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
{
	struct mlx5_ib_mr *mr, *next;

	lockdep_assert_held(&dev->data_direct_lock);

	list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) {
		list_del(&mr->dd_node);
		mlx5_ib_revoke_data_direct_mr(mr);
	}
}

static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
	bool is_odp = is_odp_mr(mr);
	bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
			      !to_ib_umem_dmabuf(mr->umem)->pinned;
	bool from_cache = !!ent;
	int ret = 0;

	if (is_odp)
		mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);

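	/*
	 * For a non-pinned dma-buf MR the dma-buf reservation lock is taken
	 * here so that revoking the mkey below cannot race with the dma-buf
	 * move notifier; the umem_mutex taken above presumably plays the
	 * equivalent role for ODP MRs against the page-fault handlers.
	 */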
	if (is_odp_dma_buf)
		dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL);

	if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
		ent = mr->mmkey.cache_ent;
		/* Upon storing to a clean temp entry, schedule its cleanup */
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (from_cache)
			ent->in_use--;
		if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
			mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
					 secs_to_jiffies(30));
			ent->tmp_cleanup_scheduled = true;
		}
		spin_unlock_irq(&ent->mkeys_queue.lock);
		goto out;
	}

	if (ent) {
		spin_lock_irq(&ent->mkeys_queue.lock);
		ent->in_use--;
		mr->mmkey.cache_ent = NULL;
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}
	ret = destroy_mkey(dev, mr);
out:
	if (is_odp) {
		if (!ret)
			to_ib_umem_odp(mr->umem)->private = NULL;
		mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
	}

	if (is_odp_dma_buf) {
		if (!ret)
			to_ib_umem_dmabuf(mr->umem)->private = NULL;
		dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
	}

	return ret;
}

static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	int rc;

	/*
	 * Any async use of the MR must hold the refcount; once the refcount
	 * goes to zero, no other thread, such as ODP page faults, prefetch,
	 * or any UMR activity, can touch the mkey, so it is safe to destroy it.
	 */
	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    refcount_read(&mr->mmkey.usecount) != 0 &&
	    xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
		mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
		xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			   mr->sig, NULL, GFP_KERNEL);

		if (mr->mtt_mr) {
			rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->mtt_mr = NULL;
		}
		if (mr->klm_mr) {
			rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->klm_mr = NULL;
		}

		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_memory.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
				     mr->sig->psv_memory.psv_idx);
		if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
				     mr->sig->psv_wire.psv_idx);
		kfree(mr->sig);
		mr->sig = NULL;
	}

	/* Stop DMA */
	rc = mlx5_revoke_mr(mr);
	if (rc)
		return rc;

	if (mr->umem) {
		bool is_odp = is_odp_mr(mr);

		if (!is_odp)
			atomic_sub(ib_umem_num_pages(mr->umem),
				   &dev->mdev->priv.reg_pages);
		ib_umem_release(mr->umem);
		if (is_odp)
			mlx5_ib_free_odp_mr(mr);
	}

	if (!mr->mmkey.cache_ent)
		mlx5_free_priv_descs(mr);

	kfree(mr);
	return 0;
}

static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev,
					 struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr;
	int ret;

	ret = __mlx5_ib_dereg_mr(&mr->ibmr);
	if (ret)
		return ret;

	mutex_lock(&dev->data_direct_lock);
	if (!dd_crossed_mr->revoked)
		list_del(&dd_crossed_mr->dd_node);

	ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr);
	mutex_unlock(&dev->data_direct_lock);
	return ret;
}

int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);

	if (mr->data_direct)
		return dereg_crossing_data_direct_mr(dev, mr);

	return __mlx5_ib_dereg_mr(ibmr);
}

static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
				   int access_mode, int page_shift)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	/* This is only used from the kernel, so setting the PD is OK. */
	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, log_page_size, page_shift);
	if (access_mode == MLX5_MKC_ACCESS_MODE_PA ||
	    access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
}

static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				  int ndescs, int desc_size, int page_shift,
				  int access_mode, u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int err;

	mr->access_mode = access_mode;
	mr->desc_size = desc_size;
	mr->max_descs = ndescs;

	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
	if (err)
		return err;

	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_free_descs;

	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	return 0;

err_free_descs:
	mlx5_free_priv_descs(mr);
	return err;
}

static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
				u32 max_num_sg, u32 max_num_meta_sg,
				int desc_size, int access_mode)
{
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
	int page_shift = 0;
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->ibmr.device = pd->device;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		page_shift = PAGE_SHIFT;

	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
				     access_mode, in, inlen);
	if (err)
		goto err_free_in;

	mr->umem = NULL;
	kfree(in);

	return mr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
				      inlen);
}

static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
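	/*
	 * SG_GAPS MRs are built on KLM descriptors, which describe arbitrary
	 * byte ranges, so no page shift is set here (unlike the MTT-based
	 * path above).
	 */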
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
}

static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				      int max_num_sg, int max_num_meta_sg,
				      u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 psv_index[2];
	void *mkc;
	int err;

	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
	if (!mr->sig)
		return -ENOMEM;

	/* create mem & wire PSVs */
	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
	if (err)
		goto err_free_sig;

	mr->sig->psv_memory.psv_idx = psv_index[0];
	mr->sig->psv_wire.psv_idx = psv_index[1];

	mr->sig->sig_status_checked = true;
	mr->sig->sig_err_exists = false;
	/* Next UMR, Arm SIGERR */
	++mr->sig->sigerr_count;
	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_klm),
					 MLX5_MKC_ACCESS_MODE_KLMS);
	if (IS_ERR(mr->klm_mr)) {
		err = PTR_ERR(mr->klm_mr);
		goto err_destroy_psv;
	}
	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_mtt),
					 MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr->mtt_mr)) {
		err = PTR_ERR(mr->mtt_mr);
		goto err_free_klm_mr;
	}

	/* Set bsf descriptors for mkey */
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, bsf_en, 1);
	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);

	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
	if (err)
		goto err_free_mtt_mr;

	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			      mr->sig, GFP_KERNEL));
	if (err)
		goto err_free_descs;
	return 0;

err_free_descs:
	destroy_mkey(dev, mr);
	mlx5_free_priv_descs(mr);
err_free_mtt_mr:
	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
	mr->mtt_mr = NULL;
err_free_klm_mr:
	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
	mr->klm_mr = NULL;
err_destroy_psv:
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
			     mr->sig->psv_memory.psv_idx);
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
			     mr->sig->psv_wire.psv_idx);
err_free_sig:
	kfree(mr->sig);

	return err;
}

static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
					enum ib_mr_type mr_type, u32 max_num_sg,
					u32 max_num_meta_sg)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg, 4);
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mr->ibmr.device = pd->device;
	mr->umem = NULL;

	switch (mr_type) {
	case IB_MR_TYPE_MEM_REG:
		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_SG_GAPS:
		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_INTEGRITY:
		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
						 max_num_meta_sg, in, inlen);
		break;
	default:
		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
		err = -EINVAL;
	}

	if (err)
		goto err_free_in;

	kfree(in);

	return &mr->ibmr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
}

struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
					 u32 max_num_sg, u32 max_num_meta_sg)
{
	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
				  max_num_meta_sg);
}

int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mw *mw = to_mmw(ibmw);
	unsigned int ndescs;
	u32 *in = NULL;
	void *mkc;
	int err;
	struct mlx5_ib_alloc_mw req = {};
	struct {
		__u32	comp_mask;
		__u32	response_length;
	} resp = {};

	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
	if (err)
		return err;

	if (req.comp_mask || req.reserved1 || req.reserved2)
		return -EOPNOTSUPP;

	if (udata->inlen > sizeof(req) &&
	    !ib_is_udata_cleared(udata, sizeof(req),
				 udata->inlen - sizeof(req)))
		return -EOPNOTSUPP;

	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
	if (err)
		goto free;

	mw->mmkey.type = MLX5_MKEY_MW;
	ibmw->rkey = mw->mmkey.key;
	mw->mmkey.ndescs = ndescs;

	resp.response_length =
		min(offsetofend(typeof(resp), response_length), udata->outlen);
	if (resp.response_length) {
		err = ib_copy_to_udata(udata, &resp, resp.response_length);
		if (err)
			goto free_mkey;
	}

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
		if (err)
			goto free_mkey;
	}

	kfree(in);
	return 0;

free_mkey:
	mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
free:
	kfree(in);
	return err;
}

int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
	struct mlx5_ib_dev *dev = to_mdev(mw->device);
	struct mlx5_ib_mw *mmw = to_mmw(mw);

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
		/*
		 * pagefault_single_data_segment() may be accessing mmw
		 * if the user bound an ODP MR to this MW.
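		 * mlx5r_deref_wait_odp_mkey() below waits for any such
		 * in-flight user to drop its reference before the MW mkey
		 * is destroyed.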
		 */
		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);

	return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
}

int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
	int ret = 0;

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");
		ret = -EINVAL;
		goto done;
	}

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
		if (!mmr->sig) {
			ret = -EINVAL;
			pr_err("signature status check requested on a non-signature enabled MR\n");
			goto done;
		}

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)
			goto done;

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));
		else {
			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;
		}

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
	}

done:
	return ret;
}

static int
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			int data_sg_nents, unsigned int *data_sg_offset,
			struct scatterlist *meta_sg, int meta_sg_nents,
			unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	unsigned int sg_offset = 0;
	int n = 0;

	mr->meta_length = 0;
	if (data_sg_nents == 1) {
		n++;
		mr->mmkey.ndescs = 1;
		if (data_sg_offset)
			sg_offset = *data_sg_offset;
		mr->data_length = sg_dma_len(data_sg) - sg_offset;
		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
		if (meta_sg_nents == 1) {
			n++;
			mr->meta_ndescs = 1;
			if (meta_sg_offset)
				sg_offset = *meta_sg_offset;
			else
				sg_offset = 0;
			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
		}
		ibmr->length = mr->data_length + mr->meta_length;
	}

	return n;
}

static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p,
		   struct scatterlist *meta_sgl,
		   unsigned short meta_sg_nents,
		   unsigned int *meta_sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i, j = 0;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg) - sg_offset;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	mr->mmkey.ndescs = i;
	mr->data_length = mr->ibmr.length;

	if (meta_sg_nents) {
		sg = meta_sgl;
		sg_offset = meta_sg_offset_p ?
			    *meta_sg_offset_p : 0;
		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
			if (unlikely(i + j >= mr->max_descs))
				break;
			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
						     sg_offset);
			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
							 sg_offset);
			klms[i + j].key = cpu_to_be32(lkey);
			mr->ibmr.length += sg_dma_len(sg) - sg_offset;

			sg_offset = 0;
		}
		if (meta_sg_offset_p)
			*meta_sg_offset_p = sg_offset;

		mr->meta_ndescs = j;
		mr->meta_length = mr->ibmr.length - mr->data_length;
	}

	return i + j;
}

static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	pi_mr->ibmr.page_size = ibmr->page_size;
	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
			   mlx5_set_page);
	if (n != data_sg_nents)
		return n;

	pi_mr->data_iova = pi_mr->ibmr.iova;
	pi_mr->data_length = pi_mr->ibmr.length;
	pi_mr->ibmr.length = pi_mr->data_length;
	ibmr->length = pi_mr->data_length;

	if (meta_sg_nents) {
		u64 page_mask = ~((u64)ibmr->page_size - 1);
		u64 iova = pi_mr->data_iova;

		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
				    meta_sg_offset, mlx5_set_page_pi);

		pi_mr->meta_length = pi_mr->ibmr.length;
		/*
		 * The PI address for the HW is the offset of the metadata
		 * address relative to the first data page address.
		 * It equals the first data page address + the size of the
		 * data pages + the metadata offset at the first metadata page.
		 */
		pi_mr->pi_iova = (iova & page_mask) +
				 pi_mr->mmkey.ndescs * ibmr->page_size +
				 (pi_mr->ibmr.iova & ~page_mask);
		/*
		 * In order to use one MTT MR for data and metadata, we also
		 * register the gaps between the end of the data and the start
		 * of the metadata (the sig MR will verify that the HW accesses
		 * the right addresses). This mapping is safe because we use an
		 * internal mkey for the registration.
		 */
		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
		pi_mr->ibmr.iova = iova;
		ibmr->length += pi_mr->meta_length;
	}

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}

static int
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
			       meta_sg, meta_sg_nents, meta_sg_offset);

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	/* This is a zero-based memory region */
	pi_mr->data_iova = 0;
	pi_mr->ibmr.iova = 0;
	pi_mr->pi_iova = pi_mr->data_length;
	ibmr->length = pi_mr->ibmr.length;

	return n;
}

int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = NULL;
	int n;

	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);

	mr->mmkey.ndescs = 0;
	mr->data_length = 0;
	mr->data_iova = 0;
	mr->meta_ndescs = 0;
	mr->pi_iova = 0;
	/*
	 * As a performance optimization, if possible, there is no need to
	 * perform a UMR operation to register the data/metadata buffers.
	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
	 * Fall back to UMR only in case of a failure.
	 */
	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				    data_sg_offset, meta_sg, meta_sg_nents,
				    meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;
	/*
	 * As a performance optimization, if possible, there is no need to map
	 * the sg lists to KLM descriptors. First try to map the sg lists to
	 * MTT descriptors and fall back to KLM only in case of a failure.
	 * It's more efficient for the HW to work with MTT descriptors
	 * (especially under high load).
	 * Use KLM (indirect access) only if it's mandatory.
	 */
	pi_mr = mr->mtt_mr;
	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;

	pi_mr = mr->klm_mr;
	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (unlikely(n != data_sg_nents + meta_sg_nents))
		return -ENOMEM;

out:
	/* This is a zero-based memory region */
	ibmr->iova = 0;
	mr->pi_mr = pi_mr;
	if (pi_mr)
		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
	else
		ibmr->sig_attrs->meta_length = mr->meta_length;

	return 0;
}

int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int n;

	mr->mmkey.ndescs = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
				   mr->desc_size * mr->max_descs,
				   DMA_TO_DEVICE);

	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
				       NULL);
	else
		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
				   mlx5_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
				      mr->desc_size * mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}