/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 * Copyright (c) 2020, Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */


#include <linux/kref.h>
#include <linux/random.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <rdma/ib_umem_odp.h>
#include "dm.h"
#include "mlx5_ib.h"
#include "umr.h"
#include "data_direct.h"

enum {
	MAX_PENDING_REG_MR = 8,
};

#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
#define MLX5_UMR_ALIGN 2048

static void
create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned int page_size, bool populate,
				     int access_mode);
static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);

static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
					  struct ib_pd *pd)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);

	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, mkc, lr, 1);

	if (acc & IB_ACCESS_RELAXED_ORDERING) {
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
			MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);

		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
		    (MLX5_CAP_GEN(dev->mdev,
				  relaxed_ordering_read_pci_enabled) &&
		     pcie_relaxed_ordering_enabled(dev->mdev->pdev)))
			MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
	}

	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET64(mkc, mkc, start_addr, start_addr);
}

static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
{
	u8 key = atomic_inc_return(&dev->mkey_var);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, mkey_7_0, key);
	*mkey = key;
}

static int
mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
		    struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
{
	int ret;

	assign_mkey_variant(dev, &mkey->key, in);
	ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
	if (!ret)
		init_waitqueue_head(&mkey->wait);

	return ret;
}

static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
{
	struct mlx5_ib_dev *dev = async_create->ent->dev;
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);

	MLX5_SET(create_mkey_in, async_create->in, opcode,
		 MLX5_CMD_OP_CREATE_MKEY);
	assign_mkey_variant(dev, &async_create->mkey, async_create->in);
	return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
				async_create->out, outlen, create_mkey_callback,
				&async_create->cb_work);
}

static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);

static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));

	return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
}

static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
{
	if (status == -ENXIO) /* core driver is not available */
		return;

	mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
	if (status != -EREMOTEIO) /* driver specific failure */
		return;

	/* Failed in FW, print cmd out failure details */
	mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
}

static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey)
{
	unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE;
	struct mlx5_mkeys_page *page;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	if (ent->mkeys_queue.ci >=
	    ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) {
		page = kzalloc(sizeof(*page), GFP_ATOMIC);
		if (!page)
			return -ENOMEM;
		ent->mkeys_queue.num_pages++;
		list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
	} else {
		page = list_last_entry(&ent->mkeys_queue.pages_list,
				       struct mlx5_mkeys_page, list);
	}

	page->mkeys[tmp] = mkey;
	ent->mkeys_queue.ci++;
	return 0;
}

static int pop_mkey_locked(struct mlx5_cache_ent *ent)
{
	unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE;
	struct mlx5_mkeys_page *last_page;
	u32 mkey;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	last_page = list_last_entry(&ent->mkeys_queue.pages_list,
				    struct mlx5_mkeys_page, list);
	mkey = last_page->mkeys[tmp];
	last_page->mkeys[tmp] = 0;
	ent->mkeys_queue.ci--;
	if (ent->mkeys_queue.num_pages > 1 && !tmp) {
		list_del(&last_page->list);
		ent->mkeys_queue.num_pages--;
		kfree(last_page);
	}
	return mkey;
}

static void create_mkey_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5r_async_create_mkey *mkey_out =
		container_of(context, struct mlx5r_async_create_mkey, cb_work);
	struct mlx5_cache_ent *ent = mkey_out->ent;
	struct mlx5_ib_dev *dev = ent->dev;
	unsigned long flags;

	if (status) {
		create_mkey_warn(dev, status, mkey_out->out);
		kfree(mkey_out);
		spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
		ent->pending--;
		WRITE_ONCE(dev->fill_delay, 1);
		spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
		mod_timer(&dev->delay_timer, jiffies + HZ);
		return;
	}

	mkey_out->mkey |= mlx5_idx_to_mkey(
		MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
	WRITE_ONCE(dev->cache.last_add, jiffies);

	spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
	push_mkey_locked(ent, mkey_out->mkey);
	ent->pending--;
	/* If we are doing fill_to_high_water then keep going. */
	queue_adjust_cache_locked(ent);
	spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
	kfree(mkey_out);
}

static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
{
	int ret = 0;

	switch (access_mode) {
	case MLX5_MKC_ACCESS_MODE_MTT:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
					   sizeof(struct mlx5_mtt));
		break;
	case MLX5_MKC_ACCESS_MODE_KSM:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
					   sizeof(struct mlx5_klm));
		break;
	default:
		WARN_ON(1);
	}
	return ret;
}

static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
{
	set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0,
				      ent->dev->umrc.pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2,
		 (ent->rb_key.access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);

	MLX5_SET(mkc, mkc, translations_octword_size,
		 get_mkc_octo_size(ent->rb_key.access_mode,
				   ent->rb_key.ndescs));
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
}

/* Asynchronously schedule new MRs to be populated in the cache. */
static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
{
	struct mlx5r_async_create_mkey *async_create;
	void *mkc;
	int err = 0;
	int i;

	for (i = 0; i < num; i++) {
		async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
				       GFP_KERNEL);
		if (!async_create)
			return -ENOMEM;
		mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
				   memory_key_mkey_entry);
		set_cache_mkc(ent, mkc);
		async_create->ent = ent;

		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->pending >= MAX_PENDING_REG_MR) {
			err = -EAGAIN;
			goto free_async_create;
		}
		ent->pending++;
		spin_unlock_irq(&ent->mkeys_queue.lock);

		err = mlx5_ib_create_mkey_cb(async_create);
		if (err) {
			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
			goto err_create_mkey;
		}
	}

	return 0;

err_create_mkey:
	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->pending--;
free_async_create:
	spin_unlock_irq(&ent->mkeys_queue.lock);
	kfree(async_create);
	return err;
}

/* Synchronously create a MR in the cache */
static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
{
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	void *mkc;
	u32 *in;
	int err;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_cache_mkc(ent, mkc);

	err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
	if (err)
		goto free_in;

	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
free_in:
	kfree(in);
	return err;
}

static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
{
	u32 mkey;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	if (!ent->mkeys_queue.ci)
		return;
	mkey = pop_mkey_locked(ent);
	spin_unlock_irq(&ent->mkeys_queue.lock);
	mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
	spin_lock_irq(&ent->mkeys_queue.lock);
}

static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
				bool limit_fill)
	__acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock)
{
	int err;

	lockdep_assert_held(&ent->mkeys_queue.lock);

	while (true) {
		if (limit_fill)
			target = ent->limit * 2;
		if (target == ent->pending + ent->mkeys_queue.ci)
			return 0;
		if (target > ent->pending + ent->mkeys_queue.ci) {
			u32 todo = target - (ent->pending + ent->mkeys_queue.ci);

			spin_unlock_irq(&ent->mkeys_queue.lock);
			err = add_keys(ent, todo);
			if (err == -EAGAIN)
				usleep_range(3000, 5000);
			spin_lock_irq(&ent->mkeys_queue.lock);
			if (err) {
				if (err != -EAGAIN)
					return err;
			} else
				return 0;
		} else {
			remove_cache_mr_locked(ent);
		}
	}
}

static ssize_t size_write(struct file *filp, const char __user *buf,
			  size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 target;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &target);
	if (err)
		return err;

	/*
	 * Target is the new value of total_mrs the user requests, however we
	 * cannot free MRs that are in use. Compute the target value for stored
	 * mkeys.
	 */
	spin_lock_irq(&ent->mkeys_queue.lock);
	if (target < ent->in_use) {
		err = -EINVAL;
		goto err_unlock;
	}
	target = target - ent->in_use;
	if (target < ent->limit || target > ent->limit * 2) {
		err = -EINVAL;
		goto err_unlock;
	}
	err = resize_available_mrs(ent, target, false);
	if (err)
		goto err_unlock;
	spin_unlock_irq(&ent->mkeys_queue.lock);

	return count;

err_unlock:
	spin_unlock_irq(&ent->mkeys_queue.lock);
	return err;
}

static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
			 loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%ld\n",
		       ent->mkeys_queue.ci + ent->in_use);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations size_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= size_write,
	.read	= size_read,
};

static ssize_t limit_write(struct file *filp, const char __user *buf,
			   size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 var;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &var);
	if (err)
		return err;

	/*
	 * Upon set we immediately fill the cache to high water mark implied by
	 * the limit.
	 */
	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->limit = var;
	err = resize_available_mrs(ent, 0, true);
	spin_unlock_irq(&ent->mkeys_queue.lock);
	if (err)
		return err;
	return count;
}

static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
			  loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations limit_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= limit_write,
	.read	= limit_read,
};

static bool someone_adding(struct mlx5_mkey_cache *cache)
{
	struct mlx5_cache_ent *ent;
	struct rb_node *node;
	bool ret;

	mutex_lock(&cache->rb_lock);
	for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		ret = ent->mkeys_queue.ci < ent->limit;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		if (ret) {
			mutex_unlock(&cache->rb_lock);
			return true;
		}
	}
	mutex_unlock(&cache->rb_lock);
	return false;
}

/*
 * Check if the bucket is outside the high/low water mark and schedule an async
 * update. The cache refill has hysteresis, once the low water mark is hit it is
 * refilled up to the high mark.
 */
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
{
	lockdep_assert_held(&ent->mkeys_queue.lock);

	if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
		return;
	if (ent->mkeys_queue.ci < ent->limit) {
		ent->fill_to_high_water = true;
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->fill_to_high_water &&
		   ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) {
		/*
		 * Once we start populating due to hitting a low water mark
		 * continue until we pass the high water mark.
		 */
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->mkeys_queue.ci == 2 * ent->limit) {
		ent->fill_to_high_water = false;
	} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
		/* Queue deletion of excess entries */
		ent->fill_to_high_water = false;
		if (ent->pending)
			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
					   msecs_to_jiffies(1000));
		else
			mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	}
}

static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
{
	u32 mkey;

	spin_lock_irq(&ent->mkeys_queue.lock);
	while (ent->mkeys_queue.ci) {
		mkey = pop_mkey_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
		mlx5_core_destroy_mkey(dev->mdev, mkey);
		spin_lock_irq(&ent->mkeys_queue.lock);
	}
	ent->tmp_cleanup_scheduled = false;
	spin_unlock_irq(&ent->mkeys_queue.lock);
}

static void __cache_work_func(struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_dev *dev = ent->dev;
	struct mlx5_mkey_cache *cache = &dev->cache;
	int err;

	spin_lock_irq(&ent->mkeys_queue.lock);
	if (ent->disabled)
		goto out;

	if (ent->fill_to_high_water &&
	    ent->mkeys_queue.ci + ent->pending < 2 * ent->limit &&
	    !READ_ONCE(dev->fill_delay)) {
		spin_unlock_irq(&ent->mkeys_queue.lock);
		err = add_keys(ent, 1);
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->disabled)
			goto out;
		if (err) {
			/*
			 * EAGAIN only happens if there are pending MRs, so we
			 * will be rescheduled when storing them. The only
			 * failure path here is ENOMEM.
			 */
			if (err != -EAGAIN) {
				mlx5_ib_warn(
					dev,
					"add keys command failed, err %d\n",
					err);
				queue_delayed_work(cache->wq, &ent->dwork,
						   msecs_to_jiffies(1000));
			}
		}
	} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
		bool need_delay;

		/*
		 * The remove_cache_mr() logic is performed as a garbage
		 * collection task. Such a task is intended to be run when no
		 * other active processes are running.
		 *
		 * The need_resched() will return TRUE if there are user tasks
		 * to be activated in the near future.
		 *
		 * In such a case, we don't execute remove_cache_mr() and
		 * postpone the garbage collection work to try to run in the
		 * next cycle, in order to free CPU resources to other tasks.
		 */
		spin_unlock_irq(&ent->mkeys_queue.lock);
		need_delay = need_resched() || someone_adding(cache) ||
			     !time_after(jiffies,
					 READ_ONCE(cache->last_add) + 300 * HZ);
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->disabled)
			goto out;
		if (need_delay) {
			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
			goto out;
		}
		remove_cache_mr_locked(ent);
		queue_adjust_cache_locked(ent);
	}
out:
	spin_unlock_irq(&ent->mkeys_queue.lock);
}

static void delayed_cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
	/* temp entries are never filled, only cleaned */
	if (ent->is_tmp)
		clean_keys(ent->dev, ent);
	else
		__cache_work_func(ent);
}

static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
			     struct mlx5r_cache_rb_key key2)
{
	int res;

	res = key1.ats - key2.ats;
	if (res)
		return res;

	res = key1.access_mode - key2.access_mode;
	if (res)
		return res;

	res = key1.access_flags - key2.access_flags;
	if (res)
		return res;

	/*
	 * keep ndescs the last in the compare table since the find function
	 * searches for an exact match on all properties and only closest
	 * match in size.
	 */
	return key1.ndescs - key2.ndescs;
}

static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
				 struct mlx5_cache_ent *ent)
{
	struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
	struct mlx5_cache_ent *cur;
	int cmp;

	/* Figure out where to put new node */
	while (*new) {
		cur = rb_entry(*new, struct mlx5_cache_ent, node);
		parent = *new;
		cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
		if (cmp > 0)
			new = &((*new)->rb_left);
		if (cmp < 0)
			new = &((*new)->rb_right);
		if (cmp == 0)
			return -EEXIST;
	}

	/* Add new node and rebalance tree. */
	rb_link_node(&ent->node, parent, new);
	rb_insert_color(&ent->node, &cache->rb_root);

	return 0;
}

static struct mlx5_cache_ent *
mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
			   struct mlx5r_cache_rb_key rb_key)
{
	struct rb_node *node = dev->cache.rb_root.rb_node;
	struct mlx5_cache_ent *cur, *smallest = NULL;
	u64 ndescs_limit;
	int cmp;

	/*
	 * Find the smallest ent with order >= requested_order.
	 */
	while (node) {
		cur = rb_entry(node, struct mlx5_cache_ent, node);
		cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
		if (cmp > 0) {
			smallest = cur;
			node = node->rb_left;
		}
		if (cmp < 0)
			node = node->rb_right;
		if (cmp == 0)
			return cur;
	}

	/*
	 * Limit the usage of mkeys larger than twice the required size while
	 * also allowing the usage of smallest cache entry for small MRs.
	 */
	ndescs_limit = max_t(u64, rb_key.ndescs * 2,
			     MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);

	return (smallest &&
		smallest->rb_key.access_mode == rb_key.access_mode &&
		smallest->rb_key.access_flags == rb_key.access_flags &&
		smallest->rb_key.ats == rb_key.ats &&
		smallest->rb_key.ndescs <= ndescs_limit) ?
		       smallest :
		       NULL;
}

static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
					       struct mlx5_cache_ent *ent,
					       int access_flags)
{
	struct mlx5_ib_mr *mr;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->in_use++;

	if (!ent->mkeys_queue.ci) {
		queue_adjust_cache_locked(ent);
		ent->miss++;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		err = create_cache_mkey(ent, &mr->mmkey.key);
		if (err) {
			spin_lock_irq(&ent->mkeys_queue.lock);
			ent->in_use--;
			spin_unlock_irq(&ent->mkeys_queue.lock);
			kfree(mr);
			return ERR_PTR(err);
		}
	} else {
		mr->mmkey.key = pop_mkey_locked(ent);
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}
	mr->mmkey.cache_ent = ent;
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.rb_key = ent->rb_key;
	mr->mmkey.cacheable = true;
	init_waitqueue_head(&mr->mmkey.wait);
	return mr;
}

static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
					 int access_flags)
{
	int ret = 0;

	if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
	    MLX5_CAP_GEN(dev->mdev, atomic) &&
	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
		ret |= IB_ACCESS_REMOTE_ATOMIC;

	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
		ret |= IB_ACCESS_RELAXED_ORDERING;

	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
	    (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
	     MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled)) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
		ret |= IB_ACCESS_RELAXED_ORDERING;

	return ret;
}

struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
				       int access_flags, int access_mode,
				       int ndescs)
{
	struct mlx5r_cache_rb_key rb_key = {
		.ndescs = ndescs,
		.access_mode = access_mode,
		.access_flags = get_unchangeable_access_flags(dev, access_flags)
	};
	struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);

	if (!ent)
		return ERR_PTR(-EOPNOTSUPP);

	return _mlx5_mr_cache_alloc(dev, ent, access_flags);
}

static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
{
	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	debugfs_remove_recursive(dev->cache.fs_root);
	dev->cache.fs_root = NULL;
}

static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
					    struct mlx5_cache_ent *ent)
{
	int order = order_base_2(ent->rb_key.ndescs);
	struct dentry *dir;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
		order = MLX5_IMR_KSM_CACHE_ENTRY + 2;

	sprintf(ent->name, "%d", order);
	dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
	debugfs_create_file("size", 0600, dir, ent, &size_fops);
	debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
	debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci);
	debugfs_create_u32("miss", 0600, dir, &ent->miss);
}

static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
{
	struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
	struct mlx5_mkey_cache *cache = &dev->cache;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
}

static void delay_time_func(struct timer_list *t)
{
	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);

	WRITE_ONCE(dev->fill_delay, 0);
}

static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent)
{
	struct mlx5_mkeys_page *page;

	page = kzalloc(sizeof(*page), GFP_KERNEL);
	if (!page)
		return -ENOMEM;
	INIT_LIST_HEAD(&ent->mkeys_queue.pages_list);
	spin_lock_init(&ent->mkeys_queue.lock);
	list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
	ent->mkeys_queue.num_pages++;
	return 0;
}

static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent)
{
	struct mlx5_mkeys_page *page;

	WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1);
	page = list_last_entry(&ent->mkeys_queue.pages_list,
			       struct mlx5_mkeys_page, list);
	list_del(&page->list);
	kfree(page);
}

struct mlx5_cache_ent *
mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
			      struct mlx5r_cache_rb_key rb_key,
			      bool persistent_entry)
{
	struct mlx5_cache_ent *ent;
	int order;
	int ret;

	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
	if (!ent)
		return ERR_PTR(-ENOMEM);

	ret = mlx5r_mkeys_init(ent);
	if (ret)
		goto mkeys_err;
	ent->rb_key = rb_key;
	ent->dev = dev;
	ent->is_tmp = !persistent_entry;

	INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);

	ret = mlx5_cache_ent_insert(&dev->cache, ent);
	if (ret)
		goto ent_insert_err;

	if (persistent_entry) {
		if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
			order = MLX5_IMR_KSM_CACHE_ENTRY;
		else
			order = order_base_2(rb_key.ndescs) - 2;

		if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
		    mlx5r_umr_can_load_pas(dev, 0))
			ent->limit = dev->mdev->profile.mr_cache[order].limit;
		else
			ent->limit = 0;

		mlx5_mkey_cache_debugfs_add_ent(dev, ent);
	}

	return ent;
ent_insert_err:
	mlx5r_mkeys_uninit(ent);
mkeys_err:
	kfree(ent);
	return ERR_PTR(ret);
}

int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5r_cache_rb_key rb_key = {
		.access_mode = MLX5_MKC_ACCESS_MODE_MTT,
	};
	struct mlx5_cache_ent *ent;
	struct rb_node *node;
	int ret;
	int i;

	mutex_init(&dev->slow_path_mutex);
	mutex_init(&dev->cache.rb_lock);
	dev->cache.rb_root = RB_ROOT;
	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
	if (!cache->wq) {
		mlx5_ib_warn(dev, "failed to create work queue\n");
		return -ENOMEM;
	}

	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
	timer_setup(&dev->delay_timer, delay_time_func, 0);
	mlx5_mkey_cache_debugfs_init(dev);
	mutex_lock(&cache->rb_lock);
	for (i = 0; i <= mkey_cache_max_order(dev); i++) {
		rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
		ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
		if (IS_ERR(ent)) {
			ret = PTR_ERR(ent);
			goto err;
		}
	}

	ret = mlx5_odp_init_mkey_cache(dev);
	if (ret)
		goto err;

	mutex_unlock(&cache->rb_lock);
	for (node = rb_first(root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
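		/* Kick the background work so each persistent bucket starts
		 * filling toward its configured limit.
		 */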
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}

	return 0;

err:
	mutex_unlock(&cache->rb_lock);
	mlx5_mkey_cache_debugfs_cleanup(dev);
	mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
	return ret;
}

void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
{
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5_cache_ent *ent;
	struct rb_node *node;

	if (!dev->cache.wq)
		return;

	mutex_lock(&dev->cache.rb_lock);
	for (node = rb_first(root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		ent->disabled = true;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		cancel_delayed_work(&ent->dwork);
	}
	mutex_unlock(&dev->cache.rb_lock);

	/*
	 * After all entries are disabled and will not reschedule on WQ,
	 * flush it and all async commands.
	 */
	flush_workqueue(dev->cache.wq);

	mlx5_mkey_cache_debugfs_cleanup(dev);
	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);

	/* At this point all entries are disabled and have no concurrent work. */
	mutex_lock(&dev->cache.rb_lock);
	node = rb_first(root);
	while (node) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		node = rb_next(node);
		clean_keys(dev, ent);
		rb_erase(&ent->node, root);
		mlx5r_mkeys_uninit(ent);
		kfree(ent);
	}
	mutex_unlock(&dev->cache.rb_lock);

	destroy_workqueue(dev->cache.wq);
	del_timer_sync(&dev->delay_timer);
}

struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, length64, 1);
	set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
				      pd);
	MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

static int get_octo_len(u64 addr, u64 len, int page_shift)
{
	u64 page_size = 1ULL << page_shift;
	u64 offset;
	int npages;

	offset = addr & (page_size - 1);
	npages = ALIGN(len + offset, page_size) >> page_shift;
	return (npages + 1) / 2;
}

static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
{
	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		return MKEY_CACHE_LAST_STD_ENTRY;
	return MLX5_MAX_UMR_SHIFT;
}

static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			  u64 length, int access_flags, u64 iova)
{
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->ibmr.length = length;
	mr->ibmr.device = &dev->ib_dev;
	mr->ibmr.iova = iova;
	mr->access_flags = access_flags;
}

static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
						  u64 iova)
{
	/*
	 * The alignment of iova has already been checked upon entering
	 * UVERBS_METHOD_REG_DMABUF_MR
	 */
	umem->iova = iova;
	return PAGE_SIZE;
}

static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
					     struct ib_umem *umem, u64 iova,
					     int access_flags, int access_mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5r_cache_rb_key rb_key = {};
	struct mlx5_cache_ent *ent;
	struct mlx5_ib_mr *mr;
	unsigned int page_size;

	if (umem->is_dmabuf)
		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
	else
		page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
	if (WARN_ON(!page_size))
		return ERR_PTR(-EINVAL);

	rb_key.access_mode = access_mode;
	rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
	rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
	rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
	ent = mkey_cache_ent_from_rb_key(dev, rb_key);
	/*
	 * If the MR can't come from the cache then synchronously create an
	 * uncached one.
	 */
	if (!ent) {
		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size, false,
				access_mode);
		mutex_unlock(&dev->slow_path_mutex);
		if (IS_ERR(mr))
			return mr;
		mr->mmkey.rb_key = rb_key;
		mr->mmkey.cacheable = true;
		return mr;
	}

	mr = _mlx5_mr_cache_alloc(dev, ent, access_flags);
	if (IS_ERR(mr))
		return mr;

	mr->ibmr.pd = pd;
	mr->umem = umem;
	mr->page_shift = order_base_2(page_size);
	set_mr_fields(dev, mr, umem->length, access_flags, iova);

	return mr;
}

static struct ib_mr *
reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags,
			    u32 crossed_lkey)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
	struct mlx5_ib_mr *mr;
	void *mkc;
	int inlen;
	u32 *in;
	int err;

	if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey))
		return ERR_PTR(-EOPNOTSUPP);

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, crossing_target_vhca_id,
		 MLX5_CAP_GEN(dev->mdev, vhca_id));
	MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);

	/* for this crossing mkey IOVA should be 0 and len should be IOVA + len */
	set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
	MLX5_SET64(mkc, mkc, len, iova + length);

	MLX5_SET(mkc, mkc, free, 0);
	MLX5_SET(mkc, mkc, umr_en, 0);
	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_2;

	mr->mmkey.type = MLX5_MKEY_MR;
	set_mr_fields(dev, mr, length, access_flags, iova);
	mr->ibmr.pd = pd;
	kvfree(in);
	mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key);

	return &mr->ibmr;
err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}

/*
 * If ibmr is NULL it will be allocated by reg_create.
 * Else, the given ibmr will be used.
 */
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned int page_size, bool populate,
				     int access_mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr;
	__be64 *pas;
	void *mkc;
	int inlen;
	u32 *in;
	int err;
	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
		(access_mode == MLX5_MKC_ACCESS_MODE_MTT);
	bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);

	if (!page_size)
		return ERR_PTR(-EINVAL);
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->access_flags = access_flags;
	mr->page_shift = order_base_2(page_size);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	if (populate)
		inlen += sizeof(*pas) *
			roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}
	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
	if (populate) {
		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
			err = -EINVAL;
			goto err_2;
		}
		mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
	}

	/* The pg_access bit allows setting the access flags
	 * in the page list submitted with the command.
	 */
	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
				      populate ? pd : dev->umrc.pd);
	/* In case of a data direct flow, overwrite the pdn field with its
	 * internal kernel PD.
	 */
	if (umem->is_dmabuf && ksm_mode)
		MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);

	MLX5_SET(mkc, mkc, free, !populate);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode);
	MLX5_SET(mkc, mkc, umr_en, 1);

	MLX5_SET64(mkc, mkc, len, umem->length);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	if (ksm_mode)
		MLX5_SET(mkc, mkc, translations_octword_size,
			 get_octo_len(iova, umem->length, mr->page_shift) * 2);
	else
		MLX5_SET(mkc, mkc, translations_octword_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
	if (mlx5_umem_needs_ats(dev, umem, access_flags))
		MLX5_SET(mkc, mkc, ma_translation_mode, 1);
	if (populate) {
		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	}

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err) {
		mlx5_ib_warn(dev, "create mkey failed\n");
		goto err_2;
	}
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
	mr->umem = umem;
	set_mr_fields(dev, mr, umem->length, access_flags, iova);
	kvfree(in);

	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);

	return mr;

err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}

static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
				       u64 length, int acc, int mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
	MLX5_SET64(mkc, mkc, len, length);
	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);

	set_mr_fields(dev, mr, length, acc, start_addr);

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

int mlx5_ib_advise_mr(struct ib_pd *pd,
		      enum ib_uverbs_advise_mr_advice advice,
		      u32 flags,
		      struct ib_sge *sg_list,
		      u32 num_sge,
		      struct uverbs_attr_bundle *attrs)
{
	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
		return -EOPNOTSUPP;

	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
					  sg_list, num_sge);
}

struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
				struct ib_dm_mr_attr *attr,
				struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dm *mdm = to_mdm(dm);
	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
	u64 start_addr = mdm->dev_addr + attr->offset;
	int mode;

	switch (mdm->type) {
	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
		start_addr -= pci_resource_start(dev->pdev, 0);
		break;
	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}

	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
				 attr->access_flags, mode);
}

static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
				    u64 iova, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	bool xlt_with_umr;
	int err;

	xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
	if (xlt_with_umr) {
		mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
					MLX5_MKC_ACCESS_MODE_MTT);
	} else {
		unsigned int page_size =
			mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);

		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size,
				true, MLX5_MKC_ACCESS_MODE_MTT);
		mutex_unlock(&dev->slow_path_mutex);
	}
	if (IS_ERR(mr)) {
		ib_umem_release(umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

	if (xlt_with_umr) {
		/*
		 * If the MR was created with reg_create then it will be
		 * configured properly but left disabled. It is safe to go ahead
		 * and configure it again via UMR while enabling it.
		 */
		err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
		if (err) {
			mlx5_ib_dereg_mr(&mr->ibmr, NULL);
			return ERR_PTR(err);
		}
	}
	return &mr->ibmr;
}

static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
					u64 iova, int access_flags,
					struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
	if (err)
		return ERR_PTR(err);
	if (!start && length == U64_MAX) {
		if (iova != 0)
			return ERR_PTR(-EINVAL);
		if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
			return ERR_PTR(-EINVAL);

		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
		if (IS_ERR(mr))
			return ERR_CAST(mr);
		return &mr->ibmr;
	}

	/* ODP requires xlt update via umr to work. */
	if (!mlx5r_umr_can_load_pas(dev, length))
		return ERR_PTR(-EINVAL);

	odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
			      &mlx5_mn_ops);
	if (IS_ERR(odp))
		return ERR_CAST(odp);

	mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
				MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr)) {
		ib_umem_release(&odp->umem);
		return ERR_CAST(mr);
	}
	xa_init(&mr->implicit_children);

	odp->private = mr;
	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
	if (err)
		goto err_dereg_mr;

	err = mlx5_ib_init_odp_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 iova, int access_flags,
				  struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem *umem;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		    start, iova, length, access_flags);

	err = mlx5r_umr_resource_init(dev);
	if (err)
		return ERR_PTR(err);

	if (access_flags & IB_ACCESS_ON_DEMAND)
		return create_user_odp_mr(pd, start, length, iova, access_flags,
					  udata);
	umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
	if (IS_ERR(umem))
		return ERR_CAST(umem);
	return create_real_mr(pd, umem, iova, access_flags);
}

static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
{
	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
	struct mlx5_ib_mr *mr = umem_dmabuf->private;

	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);

	if (!umem_dmabuf->sgt || !mr)
		return;

	mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
}

static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
	.allow_peer2peer = 1,
	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
};

static struct ib_mr *
reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
		   u64 offset, u64 length, u64 virt_addr,
		   int fd, int access_flags, int access_mode)
{
	bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	struct ib_umem_dmabuf *umem_dmabuf;
	int err;

	err = mlx5r_umr_resource_init(dev);
	if (err)
		return ERR_PTR(err);

	if (!pinned_mode)
		umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev,
						 offset, length, fd,
						 access_flags,
						 &mlx5_ib_dmabuf_attach_ops);
	else
		umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev,
				dma_device, offset, length,
				fd, access_flags);

	if (IS_ERR(umem_dmabuf)) {
		mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
			    PTR_ERR(umem_dmabuf));
		return ERR_CAST(umem_dmabuf);
	}

	mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
				access_flags, access_mode);
	if (IS_ERR(mr)) {
		ib_umem_release(&umem_dmabuf->umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
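	/* Stash the MR in the umem so mlx5_ib_dmabuf_invalidate_cb() can look
	 * it up from the dma-buf move_notify callback.
	 */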
	umem_dmabuf->private = mr;
	if (!pinned_mode) {
		err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
		if (err)
			goto err_dereg_mr;
	} else {
		mr->data_direct = true;
	}

	err = mlx5_ib_init_dmabuf_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	__mlx5_ib_dereg_mr(&mr->ibmr);
	return ERR_PTR(err);
}

static struct ib_mr *
reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
				  u64 length, u64 virt_addr,
				  int fd, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_data_direct_dev *data_direct_dev;
	struct ib_mr *crossing_mr;
	struct ib_mr *crossed_mr;
	int ret = 0;

	/* As of HW behaviour the IOVA must be page aligned in KSM mode */
	if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
		return ERR_PTR(-EOPNOTSUPP);

	mutex_lock(&dev->data_direct_lock);
	data_direct_dev = dev->data_direct_dev;
	if (!data_direct_dev) {
		ret = -EINVAL;
		goto end;
	}

	/* The device's 'data direct mkey' was created without RO flags to
	 * simplify things and allow for a single mkey per device.
	 * Since RO is not a must, mask it out accordingly.
	 */
	access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
	crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
					offset, length, virt_addr, fd,
					access_flags, MLX5_MKC_ACCESS_MODE_KSM);
	if (IS_ERR(crossed_mr)) {
		ret = PTR_ERR(crossed_mr);
		goto end;
	}

	mutex_lock(&dev->slow_path_mutex);
	crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags,
						  crossed_mr->lkey);
	mutex_unlock(&dev->slow_path_mutex);
	if (IS_ERR(crossing_mr)) {
		__mlx5_ib_dereg_mr(crossed_mr);
		ret = PTR_ERR(crossing_mr);
		goto end;
	}

	list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
	to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
	to_mmr(crossing_mr)->data_direct = true;
end:
	mutex_unlock(&dev->data_direct_lock);
	return ret ? ERR_PTR(ret) : crossing_mr;
}

struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
					 u64 length, u64 virt_addr,
					 int fd, int access_flags,
					 struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int mlx5_access_flags = 0;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
		err = uverbs_get_flags32(&mlx5_access_flags, attrs,
					 MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
					 MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT);
		if (err)
			return ERR_PTR(err);
	}

	mlx5_ib_dbg(dev,
		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n",
		    offset, virt_addr, length, fd, access_flags, mlx5_access_flags);

	/* dmabuf requires xlt update via umr to work. */
	if (!mlx5r_umr_can_load_pas(dev, length))
		return ERR_PTR(-EINVAL);

	if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT)
		return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr,
							 fd, access_flags);

	return reg_user_mr_dmabuf(pd, pd->device->dma_device,
				  offset, length, virt_addr,
				  fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT);
}

/*
 * True if the change in access flags can be done via UMR, only some access
 * flags can be updated.
 */
static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
				     unsigned int current_access_flags,
				     unsigned int target_access_flags)
{
	unsigned int diffs = current_access_flags ^ target_access_flags;

	if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
		      IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING |
		      IB_ACCESS_REMOTE_ATOMIC))
		return false;
	return mlx5r_umr_can_reconfig(dev, current_access_flags,
				      target_access_flags);
}

static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
				  struct ib_umem *new_umem,
				  int new_access_flags, u64 iova,
				  unsigned long *page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);

	/* We only track the allocated sizes of MRs from the cache */
	if (!mr->mmkey.cache_ent)
		return false;
	if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
		return false;

	*page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova);
	if (WARN_ON(!*page_size))
		return false;
	return (mr->mmkey.cache_ent->rb_key.ndescs) >=
	       ib_umem_num_dma_blocks(new_umem, *page_size);
}

static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
			 int access_flags, int flags, struct ib_umem *new_umem,
			 u64 iova, unsigned long page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
	struct ib_umem *old_umem = mr->umem;
	int err;

	/*
	 * To keep everything simple the MR is revoked before we start to mess
	 * with it. This ensures the change is atomic relative to any use of
	 * the MR.
	 */
	err = mlx5r_umr_revoke_mr(mr);
	if (err)
		return err;

	if (flags & IB_MR_REREG_PD) {
		mr->ibmr.pd = pd;
		upd_flags |= MLX5_IB_UPD_XLT_PD;
	}
	if (flags & IB_MR_REREG_ACCESS) {
		mr->access_flags = access_flags;
		upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
	}

	mr->ibmr.iova = iova;
	mr->ibmr.length = new_umem->length;
	mr->page_shift = order_base_2(page_size);
	mr->umem = new_umem;
	err = mlx5r_umr_update_mr_pas(mr, upd_flags);
	if (err) {
		/*
		 * The MR is revoked at this point so there is no issue to free
		 * new_umem.
		 */
		mr->umem = old_umem;
		return err;
	}

	atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
	ib_umem_release(old_umem);
	atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
	return 0;
}

struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
				    u64 length, u64 iova, int new_access_flags,
				    struct ib_pd *new_pd,
				    struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct)
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(
		dev,
		"start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		start, iova, length, new_access_flags);

	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
		return ERR_PTR(-EOPNOTSUPP);

	if (!(flags & IB_MR_REREG_ACCESS))
		new_access_flags = mr->access_flags;
	if (!(flags & IB_MR_REREG_PD))
		new_pd = ib_mr->pd;

	if (!(flags & IB_MR_REREG_TRANS)) {
		struct ib_umem *umem;

		/* Fast path for PD/access change */
		if (can_use_umr_rereg_access(dev, mr->access_flags,
					     new_access_flags)) {
			err = mlx5r_umr_rereg_pd_access(mr, new_pd,
							new_access_flags);
			if (err)
				return ERR_PTR(err);
			return NULL;
		}
		/* DM or ODP MR's don't have a normal umem so we can't re-use it */
		if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
			goto recreate;

		/*
		 * Only one active MR can refer to a umem at one time, revoke
		 * the old MR before assigning the umem to the new one.
		 */
		err = mlx5r_umr_revoke_mr(mr);
		if (err)
			return ERR_PTR(err);
		umem = mr->umem;
		mr->umem = NULL;
		atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

		return create_real_mr(new_pd, umem, mr->ibmr.iova,
				      new_access_flags);
	}

	/*
	 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does
	 * but the logic around releasing the umem is different
	 */
	if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
		goto recreate;

	if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
	    can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
		struct ib_umem *new_umem;
		unsigned long page_size;

		new_umem = ib_umem_get(&dev->ib_dev, start, length,
				       new_access_flags);
		if (IS_ERR(new_umem))
			return ERR_CAST(new_umem);

		/* Fast path for PAS change */
		if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
					  &page_size)) {
			err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
					    new_umem, iova, page_size);
			if (err) {
				ib_umem_release(new_umem);
				return ERR_PTR(err);
			}
			return NULL;
		}
		return create_real_mr(new_pd, new_umem, iova, new_access_flags);
	}

	/*
	 * Everything else has no state we can preserve, just create a new MR
	 * from scratch
	 */
recreate:
	return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
				   new_access_flags, udata);
}

static int
mlx5_alloc_priv_descs(struct ib_device *device,
		      struct mlx5_ib_mr *mr,
		      int ndescs,
		      int desc_size)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	struct device *ddev = &dev->mdev->pdev->dev;
	int size = ndescs * desc_size;
	int add_size;
	int ret;

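	/* Reserve extra room so the descriptor array can be aligned up to
	 * MLX5_UMR_ALIGN below, but cap the padding so the allocation does
	 * not spill past the next power-of-two size.
	 */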
static int
mlx5_alloc_priv_descs(struct ib_device *device,
		      struct mlx5_ib_mr *mr,
		      int ndescs,
		      int desc_size)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	struct device *ddev = &dev->mdev->pdev->dev;
	int size = ndescs * desc_size;
	int add_size;
	int ret;

	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
	if (is_power_of_2(MLX5_UMR_ALIGN) && add_size) {
		int end = max_t(int, MLX5_UMR_ALIGN, roundup_pow_of_two(size));

		add_size = min_t(int, end - size, add_size);
	}

	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
	if (!mr->descs_alloc)
		return -ENOMEM;

	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);

	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, mr->desc_map)) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	kfree(mr->descs_alloc);

	return ret;
}

static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
	if (!mr->umem && !mr->data_direct &&
	    mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) {
		struct ib_device *device = mr->ibmr.device;
		int size = mr->max_descs * mr->desc_size;
		struct mlx5_ib_dev *dev = to_mdev(device);

		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
				 DMA_TO_DEVICE);
		kfree(mr->descs_alloc);
		mr->descs = NULL;
	}
}

static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
				    struct mlx5_ib_mr *mr)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int ret;

	if (mr->mmkey.cache_ent) {
		spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
		mr->mmkey.cache_ent->in_use--;
		goto end;
	}

	mutex_lock(&cache->rb_lock);
	ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
	if (ent) {
		if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
			if (ent->disabled) {
				mutex_unlock(&cache->rb_lock);
				return -EOPNOTSUPP;
			}
			mr->mmkey.cache_ent = ent;
			spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
			mutex_unlock(&cache->rb_lock);
			goto end;
		}
	}

	ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
	mutex_unlock(&cache->rb_lock);
	if (IS_ERR(ent))
		return PTR_ERR(ent);

	mr->mmkey.cache_ent = ent;
	spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);

end:
	ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key);
	spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
	return ret;
}

static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
	int err;

	lockdep_assert_held(&dev->data_direct_lock);
	mr->revoked = true;
	err = mlx5r_umr_revoke_mr(mr);
	if (WARN_ON(err))
		return err;

	ib_umem_dmabuf_revoke(umem_dmabuf);
	return 0;
}

void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
{
	struct mlx5_ib_mr *mr, *next;

	lockdep_assert_held(&dev->data_direct_lock);

	list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) {
		list_del(&mr->dd_node);
		mlx5_ib_revoke_data_direct_mr(mr);
	}
}

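/*
 * Note: mlx5_revoke_mr() stops all HW access to the mkey. For ODP MRs the
 * umem_mutex is held across the revoke, and for non-pinned dmabuf MRs the
 * dma-buf reservation lock is taken, so invalidation callbacks cannot race
 * with the teardown. Cacheable mkeys that are successfully revoked are pushed
 * back into the mkey cache instead of being destroyed; a temporary cache
 * entry additionally gets a delayed cleanup scheduled (30 seconds in this
 * code).
 */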
static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
	bool is_odp = is_odp_mr(mr);
	bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
			      !to_ib_umem_dmabuf(mr->umem)->pinned;
	int ret = 0;

	if (is_odp)
		mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);

	if (is_odp_dma_buf)
		dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL);

	if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
		ent = mr->mmkey.cache_ent;
		/* upon storing to a clean temp entry - schedule its cleanup */
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
			mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
					 msecs_to_jiffies(30 * 1000));
			ent->tmp_cleanup_scheduled = true;
		}
		spin_unlock_irq(&ent->mkeys_queue.lock);
		goto out;
	}

	if (ent) {
		spin_lock_irq(&ent->mkeys_queue.lock);
		ent->in_use--;
		mr->mmkey.cache_ent = NULL;
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}
	ret = destroy_mkey(dev, mr);
out:
	if (is_odp) {
		if (!ret)
			to_ib_umem_odp(mr->umem)->private = NULL;
		mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
	}

	if (is_odp_dma_buf) {
		if (!ret)
			to_ib_umem_dmabuf(mr->umem)->private = NULL;
		dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
	}

	return ret;
}

static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	int rc;

	/*
	 * Any async use of the mr must hold the refcount, once the refcount
	 * goes to zero no other thread, such as ODP page faults, prefetch, any
	 * UMR activity, etc can touch the mkey. Thus it is safe to destroy it.
	 */
	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    refcount_read(&mr->mmkey.usecount) != 0 &&
	    xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
		mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
		xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			   mr->sig, NULL, GFP_KERNEL);

		if (mr->mtt_mr) {
			rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->mtt_mr = NULL;
		}
		if (mr->klm_mr) {
			rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->klm_mr = NULL;
		}

		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_memory.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
				     mr->sig->psv_memory.psv_idx);
		if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
				     mr->sig->psv_wire.psv_idx);
		kfree(mr->sig);
		mr->sig = NULL;
	}

	/* Stop DMA */
	rc = mlx5_revoke_mr(mr);
	if (rc)
		return rc;

	if (mr->umem) {
		bool is_odp = is_odp_mr(mr);

		if (!is_odp)
			atomic_sub(ib_umem_num_pages(mr->umem),
				   &dev->mdev->priv.reg_pages);
		ib_umem_release(mr->umem);
		if (is_odp)
			mlx5_ib_free_odp_mr(mr);
	}

	if (!mr->mmkey.cache_ent)
		mlx5_free_priv_descs(mr);

	kfree(mr);
	return 0;
}

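/*
 * Note: a data-direct ("crossing") MR is torn down in two steps: first the
 * user-facing MR is destroyed, then, under data_direct_lock, the crossed MR
 * is unlinked from data_direct_mr_list (unless it was already revoked) and
 * destroyed as well.
 */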
static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev,
					 struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr;
	int ret;

	ret = __mlx5_ib_dereg_mr(&mr->ibmr);
	if (ret)
		return ret;

	mutex_lock(&dev->data_direct_lock);
	if (!dd_crossed_mr->revoked)
		list_del(&dd_crossed_mr->dd_node);

	ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr);
	mutex_unlock(&dev->data_direct_lock);
	return ret;
}

int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);

	if (mr->data_direct)
		return dereg_crossing_data_direct_mr(dev, mr);

	return __mlx5_ib_dereg_mr(ibmr);
}

static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
				   int access_mode, int page_shift)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	/* This is only used from the kernel, so setting the PD is OK. */
	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, log_page_size, page_shift);
	if (access_mode == MLX5_MKC_ACCESS_MODE_PA ||
	    access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
}

static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				  int ndescs, int desc_size, int page_shift,
				  int access_mode, u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int err;

	mr->access_mode = access_mode;
	mr->desc_size = desc_size;
	mr->max_descs = ndescs;

	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
	if (err)
		return err;

	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_free_descs;

	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	return 0;

err_free_descs:
	mlx5_free_priv_descs(mr);
	return err;
}

static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
				u32 max_num_sg, u32 max_num_meta_sg,
				int desc_size, int access_mode)
{
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
	int page_shift = 0;
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->ibmr.device = pd->device;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		page_shift = PAGE_SHIFT;

	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
				     access_mode, in, inlen);
	if (err)
		goto err_free_in;

	mr->umem = NULL;
	kfree(in);

	return mr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
				      inlen);
}

static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
}

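/*
 * Note: an integrity mkey is a composite object: a signature context
 * (mr->sig) with a memory PSV and a wire PSV, plus two internal PI MRs (a KLM
 * one and an MTT one) that are used to map the data/metadata scatter lists.
 * The parent mkey itself only carries 4 KLM descriptors and has BSF enabled.
 */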
static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				      int max_num_sg, int max_num_meta_sg,
				      u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 psv_index[2];
	void *mkc;
	int err;

	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
	if (!mr->sig)
		return -ENOMEM;

	/* create mem & wire PSVs */
	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
	if (err)
		goto err_free_sig;

	mr->sig->psv_memory.psv_idx = psv_index[0];
	mr->sig->psv_wire.psv_idx = psv_index[1];

	mr->sig->sig_status_checked = true;
	mr->sig->sig_err_exists = false;
	/* Next UMR, Arm SIGERR */
	++mr->sig->sigerr_count;
	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_klm),
					 MLX5_MKC_ACCESS_MODE_KLMS);
	if (IS_ERR(mr->klm_mr)) {
		err = PTR_ERR(mr->klm_mr);
		goto err_destroy_psv;
	}
	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_mtt),
					 MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr->mtt_mr)) {
		err = PTR_ERR(mr->mtt_mr);
		goto err_free_klm_mr;
	}

	/* Set bsf descriptors for mkey */
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, bsf_en, 1);
	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);

	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
	if (err)
		goto err_free_mtt_mr;

	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			      mr->sig, GFP_KERNEL));
	if (err)
		goto err_free_descs;
	return 0;

err_free_descs:
	destroy_mkey(dev, mr);
	mlx5_free_priv_descs(mr);
err_free_mtt_mr:
	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
	mr->mtt_mr = NULL;
err_free_klm_mr:
	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
	mr->klm_mr = NULL;
err_destroy_psv:
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
			     mr->sig->psv_memory.psv_idx);
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
			     mr->sig->psv_wire.psv_idx);
err_free_sig:
	kfree(mr->sig);

	return err;
}

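/*
 * Note: __mlx5_ib_alloc_mr() backs the ib_alloc_mr()/ib_alloc_mr_integrity()
 * verbs. IB_MR_TYPE_MEM_REG uses MTT descriptors, IB_MR_TYPE_SG_GAPS uses
 * KLMs, and IB_MR_TYPE_INTEGRITY builds the composite described above. A
 * minimal ULP-side sketch (illustrative only, error handling omitted):
 *
 *	struct ib_mr *mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, sg_nents);
 *	...
 *	ib_dereg_mr(mr);
 */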
static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
					enum ib_mr_type mr_type, u32 max_num_sg,
					u32 max_num_meta_sg)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg, 4);
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mr->ibmr.device = pd->device;
	mr->umem = NULL;

	switch (mr_type) {
	case IB_MR_TYPE_MEM_REG:
		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_SG_GAPS:
		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_INTEGRITY:
		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
						 max_num_meta_sg, in, inlen);
		break;
	default:
		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
		err = -EINVAL;
	}

	if (err)
		goto err_free_in;

	kfree(in);

	return &mr->ibmr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
}

struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
					 u32 max_num_sg, u32 max_num_meta_sg)
{
	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
				  max_num_meta_sg);
}

int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mw *mw = to_mmw(ibmw);
	unsigned int ndescs;
	u32 *in = NULL;
	void *mkc;
	int err;
	struct mlx5_ib_alloc_mw req = {};
	struct {
		__u32 comp_mask;
		__u32 response_length;
	} resp = {};

	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
	if (err)
		return err;

	if (req.comp_mask || req.reserved1 || req.reserved2)
		return -EOPNOTSUPP;

	if (udata->inlen > sizeof(req) &&
	    !ib_is_udata_cleared(udata, sizeof(req),
				 udata->inlen - sizeof(req)))
		return -EOPNOTSUPP;

	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
	if (err)
		goto free;

	mw->mmkey.type = MLX5_MKEY_MW;
	ibmw->rkey = mw->mmkey.key;
	mw->mmkey.ndescs = ndescs;

	resp.response_length =
		min(offsetofend(typeof(resp), response_length), udata->outlen);
	if (resp.response_length) {
		err = ib_copy_to_udata(udata, &resp, resp.response_length);
		if (err)
			goto free_mkey;
	}

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
		if (err)
			goto free_mkey;
	}

	kfree(in);
	return 0;

free_mkey:
	mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
free:
	kfree(in);
	return err;
}

int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
	struct mlx5_ib_dev *dev = to_mdev(mw->device);
	struct mlx5_ib_mw *mmw = to_mmw(mw);

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
		/*
		 * pagefault_single_data_segment() may be accessing mmw
		 * if the user bound an ODP MR to this MW.
		 */
		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);

	return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
}

int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
	int ret = 0;

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");
		ret = -EINVAL;
		goto done;
	}

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
		if (!mmr->sig) {
			ret = -EINVAL;
			pr_err("signature status check requested on a non-signature enabled MR\n");
			goto done;
		}

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)
			goto done;

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));
		else {
			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;
		}

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
	}

done:
	return ret;
}

static int
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			int data_sg_nents, unsigned int *data_sg_offset,
			struct scatterlist *meta_sg, int meta_sg_nents,
			unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	unsigned int sg_offset = 0;
	int n = 0;

	mr->meta_length = 0;
	if (data_sg_nents == 1) {
		n++;
		mr->mmkey.ndescs = 1;
		if (data_sg_offset)
			sg_offset = *data_sg_offset;
		mr->data_length = sg_dma_len(data_sg) - sg_offset;
		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
		if (meta_sg_nents == 1) {
			n++;
			mr->meta_ndescs = 1;
			if (meta_sg_offset)
				sg_offset = *meta_sg_offset;
			else
				sg_offset = 0;
			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
		}
		ibmr->length = mr->data_length + mr->meta_length;
	}

	return n;
}

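/*
 * Note: mlx5_ib_sg_to_klms() fills the MR's descriptor array with one KLM per
 * sg entry (data entries first, then metadata entries), where each KLM is a
 * {va, byte count, lkey} triple referencing the PD's local_dma_lkey. The
 * return value is the number of KLMs written; a value smaller than
 * sg_nents + meta_sg_nents means the descriptor array was too small.
 */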
static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p,
		   struct scatterlist *meta_sgl,
		   unsigned short meta_sg_nents,
		   unsigned int *meta_sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i, j = 0;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg) - sg_offset;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	mr->mmkey.ndescs = i;
	mr->data_length = mr->ibmr.length;

	if (meta_sg_nents) {
		sg = meta_sgl;
		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
			if (unlikely(i + j >= mr->max_descs))
				break;
			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
						     sg_offset);
			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
							 sg_offset);
			klms[i + j].key = cpu_to_be32(lkey);
			mr->ibmr.length += sg_dma_len(sg) - sg_offset;

			sg_offset = 0;
		}
		if (meta_sg_offset_p)
			*meta_sg_offset_p = sg_offset;

		mr->meta_ndescs = j;
		mr->meta_length = mr->ibmr.length - mr->data_length;
	}

	return i + j;
}

static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

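/*
 * Note: in the MTT PI path the data pages and the metadata pages are mapped
 * by one internal MTT MR, and pi_iova is the metadata offset inside that
 * mapping. Worked example with made-up numbers: with 4 KiB pages, a data
 * buffer at iova 0x10000 covering 3 pages and metadata at 0x20100 gives
 * pi_iova = (0x10000 & ~0xfff) + 3 * 0x1000 + (0x20100 & 0xfff)
 *         = 0x10000 + 0x3000 + 0x100 = 0x13100.
 */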
static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	pi_mr->ibmr.page_size = ibmr->page_size;
	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
			   mlx5_set_page);
	if (n != data_sg_nents)
		return n;

	pi_mr->data_iova = pi_mr->ibmr.iova;
	pi_mr->data_length = pi_mr->ibmr.length;
	pi_mr->ibmr.length = pi_mr->data_length;
	ibmr->length = pi_mr->data_length;

	if (meta_sg_nents) {
		u64 page_mask = ~((u64)ibmr->page_size - 1);
		u64 iova = pi_mr->data_iova;

		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
				    meta_sg_offset, mlx5_set_page_pi);

		pi_mr->meta_length = pi_mr->ibmr.length;
		/*
		 * PI address for the HW is the offset of the metadata address
		 * relative to the first data page address.
		 * It equals the first data page address + size of data pages +
		 * metadata offset at the first metadata page.
		 */
		pi_mr->pi_iova = (iova & page_mask) +
				 pi_mr->mmkey.ndescs * ibmr->page_size +
				 (pi_mr->ibmr.iova & ~page_mask);
		/*
		 * In order to use one MTT MR for data and metadata, we also
		 * register the gaps between the end of the data and the start
		 * of the metadata (the sig MR will verify that the HW accesses
		 * the right addresses). This mapping is safe because we use an
		 * internal mkey for the registration.
		 */
		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
		pi_mr->ibmr.iova = iova;
		ibmr->length += pi_mr->meta_length;
	}

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}

static int
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
			       meta_sg, meta_sg_nents, meta_sg_offset);

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	/* This is a zero-based memory region */
	pi_mr->data_iova = 0;
	pi_mr->ibmr.iova = 0;
	pi_mr->pi_iova = pi_mr->data_length;
	ibmr->length = pi_mr->ibmr.length;

	return n;
}

int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = NULL;
	int n;

	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);

	mr->mmkey.ndescs = 0;
	mr->data_length = 0;
	mr->data_iova = 0;
	mr->meta_ndescs = 0;
	mr->pi_iova = 0;
	/*
	 * As a performance optimization, if possible, there is no need to
	 * perform a UMR operation to register the data/metadata buffers.
	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
	 * Fall back to UMR only in case of a failure.
	 */
	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				    data_sg_offset, meta_sg, meta_sg_nents,
				    meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;
	/*
	 * As a performance optimization, if possible, there is no need to map
	 * the sg lists to KLM descriptors. First try to map the sg lists to MTT
	 * descriptors and fall back to KLM only in case of a failure.
	 * It's more efficient for the HW to work with MTT descriptors
	 * (especially in high load).
	 * Use KLM (indirect access) only if it's mandatory.
	 */
	pi_mr = mr->mtt_mr;
	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;

	pi_mr = mr->klm_mr;
	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (unlikely(n != data_sg_nents + meta_sg_nents))
		return -ENOMEM;

out:
	/* This is a zero-based memory region */
	ibmr->iova = 0;
	mr->pi_mr = pi_mr;
	if (pi_mr)
		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
	else
		ibmr->sig_attrs->meta_length = mr->meta_length;

	return 0;
}

int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int n;

	mr->mmkey.ndescs = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
				   mr->desc_size * mr->max_descs,
				   DMA_TO_DEVICE);

	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
				       NULL);
	else
		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
				   mlx5_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
				      mr->desc_size * mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}
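/*
 * Note: mlx5_ib_map_mr_sg() above implements the generic ib_map_mr_sg() verb
 * for fast-registration MRs. A typical ULP flow, sketched with core verbs
 * only (illustrative, no error handling):
 *
 *	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, nents);
 *	nmapped = ib_map_mr_sg(mr, sgl, nents, NULL, PAGE_SIZE);
 *	(post an IB_WR_REG_MR work request referencing mr, then use
 *	 mr->lkey / mr->rkey in subsequent RDMA operations)
 *	ib_dereg_mr(mr);
 */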