/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 * Copyright (c) 2020, Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */


#include <linux/kref.h>
#include <linux/random.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <rdma/ib_umem_odp.h>
#include "dm.h"
#include "mlx5_ib.h"
#include "umr.h"
#include "data_direct.h"

enum {
	MAX_PENDING_REG_MR = 8,
};

#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
#define MLX5_UMR_ALIGN 2048

static void
create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned int page_size, bool populate,
				     int access_mode);
static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);

static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
					  struct ib_pd *pd)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);

	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, mkc, lr, 1);

	if (acc & IB_ACCESS_RELAXED_ORDERING) {
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
			MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);

		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
		    (MLX5_CAP_GEN(dev->mdev,
				  relaxed_ordering_read_pci_enabled) &&
		     pcie_relaxed_ordering_enabled(dev->mdev->pdev)))
			MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
	}

	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET64(mkc, mkc, start_addr, start_addr);
}

static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
{
	u8 key = atomic_inc_return(&dev->mkey_var);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, mkey_7_0, key);
	*mkey = key;
}
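
/*
 * Wrapper around mlx5_core_create_mkey() that first folds a per-device
 * variant byte into bits 7:0 of the key (see assign_mkey_variant() above)
 * and initializes the mkey wait queue on success.
 */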
static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
			       struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
{
	int ret;

	assign_mkey_variant(dev, &mkey->key, in);
	ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
	if (!ret)
		init_waitqueue_head(&mkey->wait);

	return ret;
}

static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
{
	struct mlx5_ib_dev *dev = async_create->ent->dev;
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);

	MLX5_SET(create_mkey_in, async_create->in, opcode,
		 MLX5_CMD_OP_CREATE_MKEY);
	assign_mkey_variant(dev, &async_create->mkey, async_create->in);
	return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
				async_create->out, outlen, create_mkey_callback,
				&async_create->cb_work);
}

static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);

static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));

	return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
}

static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
{
	if (status == -ENXIO) /* core driver is not available */
		return;

	mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
	if (status != -EREMOTEIO) /* driver specific failure */
		return;

	/* Failed in FW, print cmd out failure details */
	mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
}

static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey)
{
	unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE;
	struct mlx5_mkeys_page *page;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	if (ent->mkeys_queue.ci >=
	    ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) {
		page = kzalloc(sizeof(*page), GFP_ATOMIC);
		if (!page)
			return -ENOMEM;
		ent->mkeys_queue.num_pages++;
		list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
	} else {
		page = list_last_entry(&ent->mkeys_queue.pages_list,
				       struct mlx5_mkeys_page, list);
	}

	page->mkeys[tmp] = mkey;
	ent->mkeys_queue.ci++;
	return 0;
}

static int pop_mkey_locked(struct mlx5_cache_ent *ent)
{
	unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE;
	struct mlx5_mkeys_page *last_page;
	u32 mkey;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	last_page = list_last_entry(&ent->mkeys_queue.pages_list,
				    struct mlx5_mkeys_page, list);
	mkey = last_page->mkeys[tmp];
	last_page->mkeys[tmp] = 0;
	ent->mkeys_queue.ci--;
	if (ent->mkeys_queue.num_pages > 1 && !tmp) {
		list_del(&last_page->list);
		ent->mkeys_queue.num_pages--;
		kfree(last_page);
	}
	return mkey;
}
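
/*
 * Completion handler for the asynchronous CREATE_MKEY commands issued by
 * add_keys(). On success the new mkey is pushed onto the entry's queue and
 * the cache fill state is re-evaluated; on failure further filling is
 * throttled via dev->fill_delay and the delay timer.
 */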
static void create_mkey_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5r_async_create_mkey *mkey_out =
		container_of(context, struct mlx5r_async_create_mkey, cb_work);
	struct mlx5_cache_ent *ent = mkey_out->ent;
	struct mlx5_ib_dev *dev = ent->dev;
	unsigned long flags;

	if (status) {
		create_mkey_warn(dev, status, mkey_out->out);
		kfree(mkey_out);
		spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
		ent->pending--;
		WRITE_ONCE(dev->fill_delay, 1);
		spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
		mod_timer(&dev->delay_timer, jiffies + HZ);
		return;
	}

	mkey_out->mkey |= mlx5_idx_to_mkey(
		MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
	WRITE_ONCE(dev->cache.last_add, jiffies);

	spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
	push_mkey_locked(ent, mkey_out->mkey);
	ent->pending--;
	/* If we are doing fill_to_high_water then keep going. */
	queue_adjust_cache_locked(ent);
	spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
	kfree(mkey_out);
}

static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
{
	int ret = 0;

	switch (access_mode) {
	case MLX5_MKC_ACCESS_MODE_MTT:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
						   sizeof(struct mlx5_mtt));
		break;
	case MLX5_MKC_ACCESS_MODE_KSM:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
						   sizeof(struct mlx5_klm));
		break;
	default:
		WARN_ON(1);
	}
	return ret;
}

static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
{
	set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0,
				      ent->dev->umrc.pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2,
		 (ent->rb_key.access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);

	MLX5_SET(mkc, mkc, translations_octword_size,
		 get_mkc_octo_size(ent->rb_key.access_mode,
				   ent->rb_key.ndescs));
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
}

/* Asynchronously schedule new MRs to be populated in the cache. */
static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
{
	struct mlx5r_async_create_mkey *async_create;
	void *mkc;
	int err = 0;
	int i;

	for (i = 0; i < num; i++) {
		async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
				       GFP_KERNEL);
		if (!async_create)
			return -ENOMEM;
		mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
				   memory_key_mkey_entry);
		set_cache_mkc(ent, mkc);
		async_create->ent = ent;

		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->pending >= MAX_PENDING_REG_MR) {
			err = -EAGAIN;
			goto free_async_create;
		}
		ent->pending++;
		spin_unlock_irq(&ent->mkeys_queue.lock);

		err = mlx5_ib_create_mkey_cb(async_create);
		if (err) {
			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
			goto err_create_mkey;
		}
	}

	return 0;

err_create_mkey:
	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->pending--;
free_async_create:
	spin_unlock_irq(&ent->mkeys_queue.lock);
	kfree(async_create);
	return err;
}

/* Synchronously create an MR in the cache */
static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
{
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	void *mkc;
	u32 *in;
	int err;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_cache_mkc(ent, mkc);

	err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
	if (err)
		goto free_in;

	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
free_in:
	kfree(in);
	return err;
}
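
/*
 * Drop one cached mkey. The queue lock is released around the destroy
 * command, which may sleep, so callers must re-check the entry state once
 * the lock is re-acquired.
 */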
static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
{
	u32 mkey;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	if (!ent->mkeys_queue.ci)
		return;
	mkey = pop_mkey_locked(ent);
	spin_unlock_irq(&ent->mkeys_queue.lock);
	mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
	spin_lock_irq(&ent->mkeys_queue.lock);
}

static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
				bool limit_fill)
	__acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock)
{
	int err;

	lockdep_assert_held(&ent->mkeys_queue.lock);

	while (true) {
		if (limit_fill)
			target = ent->limit * 2;
		if (target == ent->pending + ent->mkeys_queue.ci)
			return 0;
		if (target > ent->pending + ent->mkeys_queue.ci) {
			u32 todo = target - (ent->pending + ent->mkeys_queue.ci);

			spin_unlock_irq(&ent->mkeys_queue.lock);
			err = add_keys(ent, todo);
			if (err == -EAGAIN)
				usleep_range(3000, 5000);
			spin_lock_irq(&ent->mkeys_queue.lock);
			if (err) {
				if (err != -EAGAIN)
					return err;
			} else
				return 0;
		} else {
			remove_cache_mr_locked(ent);
		}
	}
}

static ssize_t size_write(struct file *filp, const char __user *buf,
			  size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 target;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &target);
	if (err)
		return err;

	/*
	 * Target is the new value of total_mrs the user requests, however we
	 * cannot free MRs that are in use. Compute the target value for stored
	 * mkeys.
	 */
	spin_lock_irq(&ent->mkeys_queue.lock);
	if (target < ent->in_use) {
		err = -EINVAL;
		goto err_unlock;
	}
	target = target - ent->in_use;
	if (target < ent->limit || target > ent->limit * 2) {
		err = -EINVAL;
		goto err_unlock;
	}
	err = resize_available_mrs(ent, target, false);
	if (err)
		goto err_unlock;
	spin_unlock_irq(&ent->mkeys_queue.lock);

	return count;

err_unlock:
	spin_unlock_irq(&ent->mkeys_queue.lock);
	return err;
}

static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
			 loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%ld\n",
		       ent->mkeys_queue.ci + ent->in_use);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations size_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= size_write,
	.read	= size_read,
};

static ssize_t limit_write(struct file *filp, const char __user *buf,
			   size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 var;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &var);
	if (err)
		return err;

	/*
	 * Upon set we immediately fill the cache to high water mark implied by
	 * the limit.
	 */
	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->limit = var;
	err = resize_available_mrs(ent, 0, true);
	spin_unlock_irq(&ent->mkeys_queue.lock);
	if (err)
		return err;
	return count;
}

static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
			  loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations limit_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= limit_write,
	.read	= limit_read,
};

static bool someone_adding(struct mlx5_mkey_cache *cache)
{
	struct mlx5_cache_ent *ent;
	struct rb_node *node;
	bool ret;

	mutex_lock(&cache->rb_lock);
	for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		ret = ent->mkeys_queue.ci < ent->limit;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		if (ret) {
			mutex_unlock(&cache->rb_lock);
			return true;
		}
	}
	mutex_unlock(&cache->rb_lock);
	return false;
}

/*
 * Check if the bucket is outside the high/low water mark and schedule an async
 * update. The cache refill has hysteresis, once the low water mark is hit it is
 * refilled up to the high mark.
 */
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
{
	lockdep_assert_held(&ent->mkeys_queue.lock);

	if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
		return;
	if (ent->mkeys_queue.ci < ent->limit) {
		ent->fill_to_high_water = true;
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->fill_to_high_water &&
		   ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) {
		/*
		 * Once we start populating due to hitting a low water mark
		 * continue until we pass the high water mark.
		 */
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->mkeys_queue.ci == 2 * ent->limit) {
		ent->fill_to_high_water = false;
	} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
		/* Queue deletion of excess entries */
		ent->fill_to_high_water = false;
		if (ent->pending)
			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
					   msecs_to_jiffies(1000));
		else
			mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	}
}

static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
{
	u32 mkey;

	spin_lock_irq(&ent->mkeys_queue.lock);
	while (ent->mkeys_queue.ci) {
		mkey = pop_mkey_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
		mlx5_core_destroy_mkey(dev->mdev, mkey);
		spin_lock_irq(&ent->mkeys_queue.lock);
	}
	ent->tmp_cleanup_scheduled = false;
	spin_unlock_irq(&ent->mkeys_queue.lock);
}

static void __cache_work_func(struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_dev *dev = ent->dev;
	struct mlx5_mkey_cache *cache = &dev->cache;
	int err;

	spin_lock_irq(&ent->mkeys_queue.lock);
	if (ent->disabled)
		goto out;

	if (ent->fill_to_high_water &&
	    ent->mkeys_queue.ci + ent->pending < 2 * ent->limit &&
	    !READ_ONCE(dev->fill_delay)) {
		spin_unlock_irq(&ent->mkeys_queue.lock);
		err = add_keys(ent, 1);
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->disabled)
			goto out;
		if (err) {
			/*
			 * EAGAIN only happens if there are pending MRs, so we
			 * will be rescheduled when storing them. The only
			 * failure path here is ENOMEM.
			 */
			if (err != -EAGAIN) {
				mlx5_ib_warn(
					dev,
					"add keys command failed, err %d\n",
					err);
				queue_delayed_work(cache->wq, &ent->dwork,
						   msecs_to_jiffies(1000));
			}
		}
	} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
		bool need_delay;

		/*
		 * The remove_cache_mr() logic is performed as a garbage
		 * collection task. Such a task is intended to be run when no
		 * other active processes are running.
		 *
		 * The need_resched() will return TRUE if there are user tasks
		 * to be activated in the near future.
		 *
		 * In such a case, we don't execute remove_cache_mr() and
		 * postpone the garbage collection work to the next cycle, in
		 * order to free CPU resources to other tasks.
		 */
		spin_unlock_irq(&ent->mkeys_queue.lock);
		need_delay = need_resched() || someone_adding(cache) ||
			     !time_after(jiffies,
					 READ_ONCE(cache->last_add) + 300 * HZ);
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->disabled)
			goto out;
		if (need_delay) {
			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
			goto out;
		}
		remove_cache_mr_locked(ent);
		queue_adjust_cache_locked(ent);
	}
out:
	spin_unlock_irq(&ent->mkeys_queue.lock);
}

static void delayed_cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
	/* temp entries are never filled, only cleaned */
	if (ent->is_tmp)
		clean_keys(ent->dev, ent);
	else
		__cache_work_func(ent);
}

static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
			     struct mlx5r_cache_rb_key key2)
{
	int res;

	res = key1.ats - key2.ats;
	if (res)
		return res;

	res = key1.access_mode - key2.access_mode;
	if (res)
		return res;

	res = key1.access_flags - key2.access_flags;
	if (res)
		return res;

	/*
	 * keep ndescs the last in the compare table since the find function
	 * searches for an exact match on all properties and only closest
	 * match in size.
	 */
	return key1.ndescs - key2.ndescs;
}

static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
				 struct mlx5_cache_ent *ent)
{
	struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
	struct mlx5_cache_ent *cur;
	int cmp;

	/* Figure out where to put new node */
	while (*new) {
		cur = rb_entry(*new, struct mlx5_cache_ent, node);
		parent = *new;
		cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
		if (cmp > 0)
			new = &((*new)->rb_left);
		if (cmp < 0)
			new = &((*new)->rb_right);
		if (cmp == 0)
			return -EEXIST;
	}

	/* Add new node and rebalance tree. */
	rb_link_node(&ent->node, parent, new);
	rb_insert_color(&ent->node, &cache->rb_root);

	return 0;
}

static struct mlx5_cache_ent *
mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
			   struct mlx5r_cache_rb_key rb_key)
{
	struct rb_node *node = dev->cache.rb_root.rb_node;
	struct mlx5_cache_ent *cur, *smallest = NULL;
	u64 ndescs_limit;
	int cmp;

	/*
	 * Find the smallest ent with order >= requested_order.
	 */
	while (node) {
		cur = rb_entry(node, struct mlx5_cache_ent, node);
		cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
		if (cmp > 0) {
			smallest = cur;
			node = node->rb_left;
		}
		if (cmp < 0)
			node = node->rb_right;
		if (cmp == 0)
			return cur;
	}

	/*
	 * Limit the usage of mkeys larger than twice the required size while
	 * also allowing the usage of smallest cache entry for small MRs.
	 */
	ndescs_limit = max_t(u64, rb_key.ndescs * 2,
			     MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);

	return (smallest &&
		smallest->rb_key.access_mode == rb_key.access_mode &&
		smallest->rb_key.access_flags == rb_key.access_flags &&
		smallest->rb_key.ats == rb_key.ats &&
		smallest->rb_key.ndescs <= ndescs_limit) ?
		       smallest :
		       NULL;
}
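
/*
 * Take one mkey from the given cache entry. If the entry is empty the miss
 * counter is bumped and an mkey with the same parameters is created
 * synchronously, so the caller always gets a usable key or an error.
 */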
static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
					       struct mlx5_cache_ent *ent,
					       int access_flags)
{
	struct mlx5_ib_mr *mr;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->in_use++;

	if (!ent->mkeys_queue.ci) {
		queue_adjust_cache_locked(ent);
		ent->miss++;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		err = create_cache_mkey(ent, &mr->mmkey.key);
		if (err) {
			spin_lock_irq(&ent->mkeys_queue.lock);
			ent->in_use--;
			spin_unlock_irq(&ent->mkeys_queue.lock);
			kfree(mr);
			return ERR_PTR(err);
		}
	} else {
		mr->mmkey.key = pop_mkey_locked(ent);
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}
	mr->mmkey.cache_ent = ent;
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.rb_key = ent->rb_key;
	mr->mmkey.cacheable = true;
	init_waitqueue_head(&mr->mmkey.wait);
	return mr;
}

static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
					 int access_flags)
{
	int ret = 0;

	if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
	    MLX5_CAP_GEN(dev->mdev, atomic) &&
	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
		ret |= IB_ACCESS_REMOTE_ATOMIC;

	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
		ret |= IB_ACCESS_RELAXED_ORDERING;

	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
	    (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
	     MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled)) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
		ret |= IB_ACCESS_RELAXED_ORDERING;

	return ret;
}

struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
				       int access_flags, int access_mode,
				       int ndescs)
{
	struct mlx5r_cache_rb_key rb_key = {
		.ndescs = ndescs,
		.access_mode = access_mode,
		.access_flags = get_unchangeable_access_flags(dev, access_flags)
	};
	struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);

	if (!ent)
		return ERR_PTR(-EOPNOTSUPP);

	return _mlx5_mr_cache_alloc(dev, ent, access_flags);
}

static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
{
	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	debugfs_remove_recursive(dev->cache.fs_root);
	dev->cache.fs_root = NULL;
}

static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
					    struct mlx5_cache_ent *ent)
{
	int order = order_base_2(ent->rb_key.ndescs);
	struct dentry *dir;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
		order = MLX5_IMR_KSM_CACHE_ENTRY + 2;

	sprintf(ent->name, "%d", order);
	dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
	debugfs_create_file("size", 0600, dir, ent, &size_fops);
	debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
	debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci);
	debugfs_create_u32("miss", 0600, dir, &ent->miss);
}
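
/*
 * Each cache entry gets a debugfs directory (named after its order) holding
 * "size", "limit", "cur" and "miss" files; the directories live under the
 * "mr_cache" root created below.
 */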
static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
{
	struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
	struct mlx5_mkey_cache *cache = &dev->cache;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
}

static void delay_time_func(struct timer_list *t)
{
	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);

	WRITE_ONCE(dev->fill_delay, 0);
}

static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent)
{
	struct mlx5_mkeys_page *page;

	page = kzalloc(sizeof(*page), GFP_KERNEL);
	if (!page)
		return -ENOMEM;
	INIT_LIST_HEAD(&ent->mkeys_queue.pages_list);
	spin_lock_init(&ent->mkeys_queue.lock);
	list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
	ent->mkeys_queue.num_pages++;
	return 0;
}

static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent)
{
	struct mlx5_mkeys_page *page;

	WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1);
	page = list_last_entry(&ent->mkeys_queue.pages_list,
			       struct mlx5_mkeys_page, list);
	list_del(&page->list);
	kfree(page);
}

struct mlx5_cache_ent *
mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
			      struct mlx5r_cache_rb_key rb_key,
			      bool persistent_entry)
{
	struct mlx5_cache_ent *ent;
	int order;
	int ret;

	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
	if (!ent)
		return ERR_PTR(-ENOMEM);

	ret = mlx5r_mkeys_init(ent);
	if (ret)
		goto mkeys_err;
	ent->rb_key = rb_key;
	ent->dev = dev;
	ent->is_tmp = !persistent_entry;

	INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);

	ret = mlx5_cache_ent_insert(&dev->cache, ent);
	if (ret)
		goto ent_insert_err;

	if (persistent_entry) {
		if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
			order = MLX5_IMR_KSM_CACHE_ENTRY;
		else
			order = order_base_2(rb_key.ndescs) - 2;

		if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
		    mlx5r_umr_can_load_pas(dev, 0))
			ent->limit = dev->mdev->profile.mr_cache[order].limit;
		else
			ent->limit = 0;

		mlx5_mkey_cache_debugfs_add_ent(dev, ent);
	}

	return ent;
ent_insert_err:
	mlx5r_mkeys_uninit(ent);
mkeys_err:
	kfree(ent);
	return ERR_PTR(ret);
}

int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5r_cache_rb_key rb_key = {
		.access_mode = MLX5_MKC_ACCESS_MODE_MTT,
	};
	struct mlx5_cache_ent *ent;
	struct rb_node *node;
	int ret;
	int i;

	mutex_init(&dev->slow_path_mutex);
	mutex_init(&dev->cache.rb_lock);
	dev->cache.rb_root = RB_ROOT;
	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
	if (!cache->wq) {
		mlx5_ib_warn(dev, "failed to create work queue\n");
		return -ENOMEM;
	}

	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
	timer_setup(&dev->delay_timer, delay_time_func, 0);
	mlx5_mkey_cache_debugfs_init(dev);
	mutex_lock(&cache->rb_lock);
	for (i = 0; i <= mkey_cache_max_order(dev); i++) {
		rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
		ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
		if (IS_ERR(ent)) {
			ret = PTR_ERR(ent);
			goto err;
		}
	}

	ret = mlx5_odp_init_mkey_cache(dev);
	if (ret)
		goto err;

	mutex_unlock(&cache->rb_lock);
	for (node = rb_first(root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}

	return 0;

err:
	mutex_unlock(&cache->rb_lock);
	mlx5_mkey_cache_debugfs_cleanup(dev);
	mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
	return ret;
}

void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
{
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5_cache_ent *ent;
	struct rb_node *node;

	if (!dev->cache.wq)
		return;

	mutex_lock(&dev->cache.rb_lock);
	for (node = rb_first(root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		ent->disabled = true;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		cancel_delayed_work(&ent->dwork);
	}
	mutex_unlock(&dev->cache.rb_lock);

	/*
	 * After all entries are disabled and will not reschedule on WQ,
	 * flush it and all async commands.
	 */
	flush_workqueue(dev->cache.wq);

	mlx5_mkey_cache_debugfs_cleanup(dev);
	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);

	/* At this point all entries are disabled and have no concurrent work. */
	mutex_lock(&dev->cache.rb_lock);
	node = rb_first(root);
	while (node) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		node = rb_next(node);
		clean_keys(dev, ent);
		rb_erase(&ent->node, root);
		mlx5r_mkeys_uninit(ent);
		kfree(ent);
	}
	mutex_unlock(&dev->cache.rb_lock);

	destroy_workqueue(dev->cache.wq);
	del_timer_sync(&dev->delay_timer);
}

struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, length64, 1);
	set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
				      pd);
	MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

static int get_octo_len(u64 addr, u64 len, int page_shift)
{
	u64 page_size = 1ULL << page_shift;
	u64 offset;
	int npages;

	offset = addr & (page_size - 1);
	npages = ALIGN(len + offset, page_size) >> page_shift;
	return (npages + 1) / 2;
}

static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
{
	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		return MKEY_CACHE_LAST_STD_ENTRY;
	return MLX5_MAX_UMR_SHIFT;
}

static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			  u64 length, int access_flags, u64 iova)
{
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->ibmr.length = length;
	mr->ibmr.device = &dev->ib_dev;
	mr->ibmr.iova = iova;
	mr->access_flags = access_flags;
}

static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
						  u64 iova)
{
	/*
	 * The alignment of iova has already been checked upon entering
	 * UVERBS_METHOD_REG_DMABUF_MR
	 */
	umem->iova = iova;
	return PAGE_SIZE;
}

static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
					     struct ib_umem *umem, u64 iova,
					     int access_flags, int access_mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5r_cache_rb_key rb_key = {};
	struct mlx5_cache_ent *ent;
	struct mlx5_ib_mr *mr;
	unsigned int page_size;

	if (umem->is_dmabuf)
		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
	else
		page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
	if (WARN_ON(!page_size))
		return ERR_PTR(-EINVAL);

	rb_key.access_mode = access_mode;
	rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
	rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
	rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
	ent = mkey_cache_ent_from_rb_key(dev, rb_key);
	/*
	 * If the MR can't come from the cache then synchronously create an uncached
	 * one.
	 */
	if (!ent) {
		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode);
		mutex_unlock(&dev->slow_path_mutex);
		if (IS_ERR(mr))
			return mr;
		mr->mmkey.rb_key = rb_key;
		mr->mmkey.cacheable = true;
		return mr;
	}

	mr = _mlx5_mr_cache_alloc(dev, ent, access_flags);
	if (IS_ERR(mr))
		return mr;

	mr->ibmr.pd = pd;
	mr->umem = umem;
	mr->page_shift = order_base_2(page_size);
	set_mr_fields(dev, mr, umem->length, access_flags, iova);

	return mr;
}

static struct ib_mr *
reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags,
			    u32 crossed_lkey)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
	struct mlx5_ib_mr *mr;
	void *mkc;
	int inlen;
	u32 *in;
	int err;

	if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey))
		return ERR_PTR(-EOPNOTSUPP);

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, crossing_target_vhca_id,
		 MLX5_CAP_GEN(dev->mdev, vhca_id));
	MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);

	/* for this crossing mkey IOVA should be 0 and len should be IOVA + len */
	set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
	MLX5_SET64(mkc, mkc, len, iova + length);

	MLX5_SET(mkc, mkc, free, 0);
	MLX5_SET(mkc, mkc, umr_en, 0);
	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_2;

	mr->mmkey.type = MLX5_MKEY_MR;
	set_mr_fields(dev, mr, length, access_flags, iova);
	mr->ibmr.pd = pd;
	kvfree(in);
	mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key);

	return &mr->ibmr;
err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}
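
/*
 * reg_create() below is the non-UMR path: it builds the full CREATE_MKEY
 * command, optionally inlining the page list when populate is true, and is
 * used whenever the MR cannot be built from a cached mkey.
 */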
/*
 * If ibmr is NULL it will be allocated by reg_create.
 * Else, the given ibmr will be used.
 */
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned int page_size, bool populate,
				     int access_mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr;
	__be64 *pas;
	void *mkc;
	int inlen;
	u32 *in;
	int err;
	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
		(access_mode == MLX5_MKC_ACCESS_MODE_MTT);
	bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);

	if (!page_size)
		return ERR_PTR(-EINVAL);
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->access_flags = access_flags;
	mr->page_shift = order_base_2(page_size);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	if (populate)
		inlen += sizeof(*pas) *
			 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}
	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
	if (populate) {
		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
			err = -EINVAL;
			goto err_2;
		}
		mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
	}

	/* The pg_access bit allows setting the access flags
	 * in the page list submitted with the command.
	 */
	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
				      populate ? pd : dev->umrc.pd);
	/* In case of a data direct flow, overwrite the pdn field with the internal kernel PD */
	if (umem->is_dmabuf && ksm_mode)
		MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);

	MLX5_SET(mkc, mkc, free, !populate);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode);
	MLX5_SET(mkc, mkc, umr_en, 1);

	MLX5_SET64(mkc, mkc, len, umem->length);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	if (ksm_mode)
		MLX5_SET(mkc, mkc, translations_octword_size,
			 get_octo_len(iova, umem->length, mr->page_shift) * 2);
	else
		MLX5_SET(mkc, mkc, translations_octword_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
	if (mlx5_umem_needs_ats(dev, umem, access_flags))
		MLX5_SET(mkc, mkc, ma_translation_mode, 1);
	if (populate) {
		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	}

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err) {
		mlx5_ib_warn(dev, "create mkey failed\n");
		goto err_2;
	}
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
	mr->umem = umem;
	set_mr_fields(dev, mr, umem->length, access_flags, iova);
	kvfree(in);

	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);

	return mr;

err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}

static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
				       u64 length, int acc, int mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
	MLX5_SET64(mkc, mkc, len, length);
	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);

	set_mr_fields(dev, mr, length, acc, start_addr);

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

int mlx5_ib_advise_mr(struct ib_pd *pd,
		      enum ib_uverbs_advise_mr_advice advice,
		      u32 flags,
		      struct ib_sge *sg_list,
		      u32 num_sge,
		      struct uverbs_attr_bundle *attrs)
{
	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
		return -EOPNOTSUPP;

	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
					  sg_list, num_sge);
}

struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
				struct ib_dm_mr_attr *attr,
				struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dm *mdm = to_mdm(dm);
	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
	u64 start_addr = mdm->dev_addr + attr->offset;
	int mode;

	switch (mdm->type) {
	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
		start_addr -= pci_resource_start(dev->pdev, 0);
		break;
	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}

	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
				 attr->access_flags, mode);
}

static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
				    u64 iova, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	bool xlt_with_umr;
	int err;

	xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
	if (xlt_with_umr) {
		mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
					MLX5_MKC_ACCESS_MODE_MTT);
	} else {
		unsigned int page_size =
			mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);

		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size,
				true, MLX5_MKC_ACCESS_MODE_MTT);
		mutex_unlock(&dev->slow_path_mutex);
	}
	if (IS_ERR(mr)) {
		ib_umem_release(umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

	if (xlt_with_umr) {
		/*
		 * If the MR was created with reg_create then it will be
		 * configured properly but left disabled. It is safe to go ahead
		 * and configure it again via UMR while enabling it.
		 */
		err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
		if (err) {
			mlx5_ib_dereg_mr(&mr->ibmr, NULL);
			return ERR_PTR(err);
		}
	}
	return &mr->ibmr;
}

static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
					u64 iova, int access_flags,
					struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
	if (err)
		return ERR_PTR(err);
	if (!start && length == U64_MAX) {
		if (iova != 0)
			return ERR_PTR(-EINVAL);
		if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
			return ERR_PTR(-EINVAL);

		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
		if (IS_ERR(mr))
			return ERR_CAST(mr);
		return &mr->ibmr;
	}

	/* ODP requires xlt update via umr to work. */
	if (!mlx5r_umr_can_load_pas(dev, length))
		return ERR_PTR(-EINVAL);

	odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
			      &mlx5_mn_ops);
	if (IS_ERR(odp))
		return ERR_CAST(odp);

	mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
				MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr)) {
		ib_umem_release(&odp->umem);
		return ERR_CAST(mr);
	}
	xa_init(&mr->implicit_children);

	odp->private = mr;
	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
	if (err)
		goto err_dereg_mr;

	err = mlx5_ib_init_odp_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 iova, int access_flags,
				  struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem *umem;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		    start, iova, length, access_flags);

	err = mlx5r_umr_resource_init(dev);
	if (err)
		return ERR_PTR(err);

	if (access_flags & IB_ACCESS_ON_DEMAND)
		return create_user_odp_mr(pd, start, length, iova, access_flags,
					  udata);
	umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
	if (IS_ERR(umem))
		return ERR_CAST(umem);
	return create_real_mr(pd, umem, iova, access_flags);
}

static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
{
	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
	struct mlx5_ib_mr *mr = umem_dmabuf->private;

	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);

	if (!umem_dmabuf->sgt)
		return;

	mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
}

static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
	.allow_peer2peer = 1,
	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
};

static struct ib_mr *
reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
		   u64 offset, u64 length, u64 virt_addr,
		   int fd, int access_flags, int access_mode)
{
	bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	struct ib_umem_dmabuf *umem_dmabuf;
	int err;

	err = mlx5r_umr_resource_init(dev);
	if (err)
		return ERR_PTR(err);

	if (!pinned_mode)
		umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev,
						 offset, length, fd,
						 access_flags,
						 &mlx5_ib_dmabuf_attach_ops);
	else
		umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev,
				dma_device, offset, length,
				fd, access_flags);

	if (IS_ERR(umem_dmabuf)) {
		mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
			    PTR_ERR(umem_dmabuf));
		return ERR_CAST(umem_dmabuf);
	}

	mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
				access_flags, access_mode);
	if (IS_ERR(mr)) {
		ib_umem_release(&umem_dmabuf->umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
	umem_dmabuf->private = mr;
	if (!pinned_mode) {
		err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
		if (err)
			goto err_dereg_mr;
	} else {
		mr->data_direct = true;
	}

	err = mlx5_ib_init_dmabuf_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	__mlx5_ib_dereg_mr(&mr->ibmr);
	return ERR_PTR(err);
}

static struct ib_mr *
reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
				  u64 length, u64 virt_addr,
				  int fd, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_data_direct_dev *data_direct_dev;
	struct ib_mr *crossing_mr;
	struct ib_mr *crossed_mr;
	int ret = 0;

	/* Per HW behaviour, the IOVA must be page aligned in KSM mode */
	if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
		return ERR_PTR(-EOPNOTSUPP);

	mutex_lock(&dev->data_direct_lock);
	data_direct_dev = dev->data_direct_dev;
	if (!data_direct_dev) {
		ret = -EINVAL;
		goto end;
	}

	/* The device's 'data direct mkey' was created without RO flags to
	 * simplify things and allow for a single mkey per device.
	 * Since RO is not a must, mask it out accordingly.
	 */
	access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
	crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
					offset, length, virt_addr, fd,
					access_flags, MLX5_MKC_ACCESS_MODE_KSM);
	if (IS_ERR(crossed_mr)) {
		ret = PTR_ERR(crossed_mr);
		goto end;
	}

	mutex_lock(&dev->slow_path_mutex);
	crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags,
						  crossed_mr->lkey);
	mutex_unlock(&dev->slow_path_mutex);
	if (IS_ERR(crossing_mr)) {
		__mlx5_ib_dereg_mr(crossed_mr);
		ret = PTR_ERR(crossing_mr);
		goto end;
	}

	list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
	to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
	to_mmr(crossing_mr)->data_direct = true;
end:
	mutex_unlock(&dev->data_direct_lock);
	return ret ? ERR_PTR(ret) : crossing_mr;
}

struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
					 u64 length, u64 virt_addr,
					 int fd, int access_flags,
					 struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int mlx5_access_flags = 0;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
		err = uverbs_get_flags32(&mlx5_access_flags, attrs,
					 MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
					 MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT);
		if (err)
			return ERR_PTR(err);
	}

	mlx5_ib_dbg(dev,
		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n",
		    offset, virt_addr, length, fd, access_flags, mlx5_access_flags);

	/* dmabuf requires xlt update via umr to work. */
	if (!mlx5r_umr_can_load_pas(dev, length))
		return ERR_PTR(-EINVAL);

	if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT)
		return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr,
							 fd, access_flags);

	return reg_user_mr_dmabuf(pd, pd->device->dma_device,
				  offset, length, virt_addr,
				  fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT);
}

/*
 * True if the change in access flags can be done via UMR, only some access
 * flags can be updated.
 */
static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
				     unsigned int current_access_flags,
				     unsigned int target_access_flags)
{
	unsigned int diffs = current_access_flags ^ target_access_flags;

	if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
		      IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING |
		      IB_ACCESS_REMOTE_ATOMIC))
		return false;
	return mlx5r_umr_can_reconfig(dev, current_access_flags,
				      target_access_flags);
}

static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
				  struct ib_umem *new_umem,
				  int new_access_flags, u64 iova,
				  unsigned long *page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);

	/* We only track the allocated sizes of MRs from the cache */
	if (!mr->mmkey.cache_ent)
		return false;
	if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
		return false;

	*page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova);
	if (WARN_ON(!*page_size))
		return false;
	return (mr->mmkey.cache_ent->rb_key.ndescs) >=
	       ib_umem_num_dma_blocks(new_umem, *page_size);
}

static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
			 int access_flags, int flags, struct ib_umem *new_umem,
			 u64 iova, unsigned long page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
	struct ib_umem *old_umem = mr->umem;
	int err;

	/*
	 * To keep everything simple the MR is revoked before we start to mess
	 * with it. This ensures the change is atomic relative to any use of
	 * the MR.
	 */
	err = mlx5r_umr_revoke_mr(mr);
	if (err)
		return err;

	if (flags & IB_MR_REREG_PD) {
		mr->ibmr.pd = pd;
		upd_flags |= MLX5_IB_UPD_XLT_PD;
	}
	if (flags & IB_MR_REREG_ACCESS) {
		mr->access_flags = access_flags;
		upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
	}

	mr->ibmr.iova = iova;
	mr->ibmr.length = new_umem->length;
	mr->page_shift = order_base_2(page_size);
	mr->umem = new_umem;
	err = mlx5r_umr_update_mr_pas(mr, upd_flags);
	if (err) {
		/*
		 * The MR is revoked at this point so there is no issue to free
		 * new_umem.
		 */
		mr->umem = old_umem;
		return err;
	}

	atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
	ib_umem_release(old_umem);
	atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
	return 0;
}

struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
				    u64 length, u64 iova, int new_access_flags,
				    struct ib_pd *new_pd,
				    struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct)
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(
		dev,
		"start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		start, iova, length, new_access_flags);

	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
		return ERR_PTR(-EOPNOTSUPP);

	if (!(flags & IB_MR_REREG_ACCESS))
		new_access_flags = mr->access_flags;
	if (!(flags & IB_MR_REREG_PD))
		new_pd = ib_mr->pd;

	if (!(flags & IB_MR_REREG_TRANS)) {
		struct ib_umem *umem;

		/* Fast path for PD/access change */
		if (can_use_umr_rereg_access(dev, mr->access_flags,
					     new_access_flags)) {
			err = mlx5r_umr_rereg_pd_access(mr, new_pd,
							new_access_flags);
			if (err)
				return ERR_PTR(err);
			return NULL;
		}
		/* DM or ODP MRs don't have a normal umem so we can't re-use it */
		if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
			goto recreate;

		/*
		 * Only one active MR can refer to a umem at one time, revoke
		 * the old MR before assigning the umem to the new one.
		 */
		err = mlx5r_umr_revoke_mr(mr);
		if (err)
			return ERR_PTR(err);
		umem = mr->umem;
		mr->umem = NULL;
		atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

		return create_real_mr(new_pd, umem, mr->ibmr.iova,
				      new_access_flags);
	}

	/*
	 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does
	 * but the logic around releasing the umem is different
	 */
	if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
		goto recreate;

	if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
	    can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
		struct ib_umem *new_umem;
		unsigned long page_size;

		new_umem = ib_umem_get(&dev->ib_dev, start, length,
				       new_access_flags);
		if (IS_ERR(new_umem))
			return ERR_CAST(new_umem);

		/* Fast path for PAS change */
		if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
					  &page_size)) {
			err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
					    new_umem, iova, page_size);
			if (err) {
				ib_umem_release(new_umem);
				return ERR_PTR(err);
			}
			return NULL;
		}
		return create_real_mr(new_pd, new_umem, iova, new_access_flags);
	}

	/*
	 * Everything else has no state we can preserve, just create a new MR
	 * from scratch
	 */
recreate:
	return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
				   new_access_flags, udata);
}
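
/*
 * The private descriptor buffer below is over-allocated so that the
 * descriptor array can be aligned to MLX5_UMR_ALIGN before it is DMA
 * mapped.
 */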
add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0); 1910 if (is_power_of_2(MLX5_UMR_ALIGN) && add_size) { 1911 int end = max_t(int, MLX5_UMR_ALIGN, roundup_pow_of_two(size)); 1912 1913 add_size = min_t(int, end - size, add_size); 1914 } 1915 1916 mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL); 1917 if (!mr->descs_alloc) 1918 return -ENOMEM; 1919 1920 mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN); 1921 1922 mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE); 1923 if (dma_mapping_error(ddev, mr->desc_map)) { 1924 ret = -ENOMEM; 1925 goto err; 1926 } 1927 1928 return 0; 1929 err: 1930 kfree(mr->descs_alloc); 1931 1932 return ret; 1933 } 1934 1935 static void 1936 mlx5_free_priv_descs(struct mlx5_ib_mr *mr) 1937 { 1938 if (!mr->umem && !mr->data_direct && mr->descs) { 1939 struct ib_device *device = mr->ibmr.device; 1940 int size = mr->max_descs * mr->desc_size; 1941 struct mlx5_ib_dev *dev = to_mdev(device); 1942 1943 dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size, 1944 DMA_TO_DEVICE); 1945 kfree(mr->descs_alloc); 1946 mr->descs = NULL; 1947 } 1948 } 1949 1950 static int cache_ent_find_and_store(struct mlx5_ib_dev *dev, 1951 struct mlx5_ib_mr *mr) 1952 { 1953 struct mlx5_mkey_cache *cache = &dev->cache; 1954 struct mlx5_cache_ent *ent; 1955 int ret; 1956 1957 if (mr->mmkey.cache_ent) { 1958 spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); 1959 mr->mmkey.cache_ent->in_use--; 1960 goto end; 1961 } 1962 1963 mutex_lock(&cache->rb_lock); 1964 ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key); 1965 if (ent) { 1966 if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) { 1967 if (ent->disabled) { 1968 mutex_unlock(&cache->rb_lock); 1969 return -EOPNOTSUPP; 1970 } 1971 mr->mmkey.cache_ent = ent; 1972 spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); 1973 mutex_unlock(&cache->rb_lock); 1974 goto end; 1975 } 1976 } 1977 1978 ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false); 1979 mutex_unlock(&cache->rb_lock); 1980 if (IS_ERR(ent)) 1981 return PTR_ERR(ent); 1982 1983 mr->mmkey.cache_ent = ent; 1984 spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); 1985 1986 end: 1987 ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key); 1988 spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); 1989 return ret; 1990 } 1991 1992 static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr) 1993 { 1994 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1995 struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); 1996 int err; 1997 1998 lockdep_assert_held(&dev->data_direct_lock); 1999 mr->revoked = true; 2000 err = mlx5r_umr_revoke_mr(mr); 2001 if (WARN_ON(err)) 2002 return err; 2003 2004 ib_umem_dmabuf_revoke(umem_dmabuf); 2005 return 0; 2006 } 2007 2008 void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev) 2009 { 2010 struct mlx5_ib_mr *mr, *next; 2011 2012 lockdep_assert_held(&dev->data_direct_lock); 2013 2014 list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) { 2015 list_del(&mr->dd_node); 2016 mlx5_ib_revoke_data_direct_mr(mr); 2017 } 2018 } 2019 2020 static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) 2021 { 2022 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 2023 struct mlx5_cache_ent *ent = mr->mmkey.cache_ent; 2024 2025 if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) { 2026 ent = mr->mmkey.cache_ent; 2027 /* upon storing to a clean temp entry - schedule its cleanup */ 2028 spin_lock_irq(&ent->mkeys_queue.lock); 2029 if 
(ent->is_tmp && !ent->tmp_cleanup_scheduled) { 2030 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 2031 msecs_to_jiffies(30 * 1000)); 2032 ent->tmp_cleanup_scheduled = true; 2033 } 2034 spin_unlock_irq(&ent->mkeys_queue.lock); 2035 return 0; 2036 } 2037 2038 if (ent) { 2039 spin_lock_irq(&ent->mkeys_queue.lock); 2040 ent->in_use--; 2041 mr->mmkey.cache_ent = NULL; 2042 spin_unlock_irq(&ent->mkeys_queue.lock); 2043 } 2044 return destroy_mkey(dev, mr); 2045 } 2046 2047 static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr) 2048 { 2049 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2050 struct mlx5_ib_dev *dev = to_mdev(ibmr->device); 2051 int rc; 2052 2053 /* 2054 * Any async use of the mr must hold the refcount, once the refcount 2055 * goes to zero no other thread, such as ODP page faults, prefetch, any 2056 * UMR activity, etc can touch the mkey. Thus it is safe to destroy it. 2057 */ 2058 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && 2059 refcount_read(&mr->mmkey.usecount) != 0 && 2060 xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))) 2061 mlx5r_deref_wait_odp_mkey(&mr->mmkey); 2062 2063 if (ibmr->type == IB_MR_TYPE_INTEGRITY) { 2064 xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), 2065 mr->sig, NULL, GFP_KERNEL); 2066 2067 if (mr->mtt_mr) { 2068 rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL); 2069 if (rc) 2070 return rc; 2071 mr->mtt_mr = NULL; 2072 } 2073 if (mr->klm_mr) { 2074 rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL); 2075 if (rc) 2076 return rc; 2077 mr->klm_mr = NULL; 2078 } 2079 2080 if (mlx5_core_destroy_psv(dev->mdev, 2081 mr->sig->psv_memory.psv_idx)) 2082 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", 2083 mr->sig->psv_memory.psv_idx); 2084 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) 2085 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", 2086 mr->sig->psv_wire.psv_idx); 2087 kfree(mr->sig); 2088 mr->sig = NULL; 2089 } 2090 2091 /* Stop DMA */ 2092 rc = mlx5_revoke_mr(mr); 2093 if (rc) 2094 return rc; 2095 2096 if (mr->umem) { 2097 bool is_odp = is_odp_mr(mr); 2098 2099 if (!is_odp) 2100 atomic_sub(ib_umem_num_pages(mr->umem), 2101 &dev->mdev->priv.reg_pages); 2102 ib_umem_release(mr->umem); 2103 if (is_odp) 2104 mlx5_ib_free_odp_mr(mr); 2105 } 2106 2107 if (!mr->mmkey.cache_ent) 2108 mlx5_free_priv_descs(mr); 2109 2110 kfree(mr); 2111 return 0; 2112 } 2113 2114 static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev, 2115 struct mlx5_ib_mr *mr) 2116 { 2117 struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr; 2118 int ret; 2119 2120 ret = __mlx5_ib_dereg_mr(&mr->ibmr); 2121 if (ret) 2122 return ret; 2123 2124 mutex_lock(&dev->data_direct_lock); 2125 if (!dd_crossed_mr->revoked) 2126 list_del(&dd_crossed_mr->dd_node); 2127 2128 ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr); 2129 mutex_unlock(&dev->data_direct_lock); 2130 return ret; 2131 } 2132 2133 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) 2134 { 2135 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2136 struct mlx5_ib_dev *dev = to_mdev(ibmr->device); 2137 2138 if (mr->data_direct) 2139 return dereg_crossing_data_direct_mr(dev, mr); 2140 2141 return __mlx5_ib_dereg_mr(ibmr); 2142 } 2143 2144 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, 2145 int access_mode, int page_shift) 2146 { 2147 struct mlx5_ib_dev *dev = to_mdev(pd->device); 2148 void *mkc; 2149 2150 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 2151 2152 /* This is only used from the kernel, so setting the PD is OK. 
*/ 2153 set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd); 2154 MLX5_SET(mkc, mkc, free, 1); 2155 MLX5_SET(mkc, mkc, translations_octword_size, ndescs); 2156 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); 2157 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); 2158 MLX5_SET(mkc, mkc, umr_en, 1); 2159 MLX5_SET(mkc, mkc, log_page_size, page_shift); 2160 if (access_mode == MLX5_MKC_ACCESS_MODE_PA || 2161 access_mode == MLX5_MKC_ACCESS_MODE_MTT) 2162 MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats)); 2163 } 2164 2165 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 2166 int ndescs, int desc_size, int page_shift, 2167 int access_mode, u32 *in, int inlen) 2168 { 2169 struct mlx5_ib_dev *dev = to_mdev(pd->device); 2170 int err; 2171 2172 mr->access_mode = access_mode; 2173 mr->desc_size = desc_size; 2174 mr->max_descs = ndescs; 2175 2176 err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size); 2177 if (err) 2178 return err; 2179 2180 mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift); 2181 2182 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 2183 if (err) 2184 goto err_free_descs; 2185 2186 mr->mmkey.type = MLX5_MKEY_MR; 2187 mr->ibmr.lkey = mr->mmkey.key; 2188 mr->ibmr.rkey = mr->mmkey.key; 2189 2190 return 0; 2191 2192 err_free_descs: 2193 mlx5_free_priv_descs(mr); 2194 return err; 2195 } 2196 2197 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd, 2198 u32 max_num_sg, u32 max_num_meta_sg, 2199 int desc_size, int access_mode) 2200 { 2201 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 2202 int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4); 2203 int page_shift = 0; 2204 struct mlx5_ib_mr *mr; 2205 u32 *in; 2206 int err; 2207 2208 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 2209 if (!mr) 2210 return ERR_PTR(-ENOMEM); 2211 2212 mr->ibmr.pd = pd; 2213 mr->ibmr.device = pd->device; 2214 2215 in = kzalloc(inlen, GFP_KERNEL); 2216 if (!in) { 2217 err = -ENOMEM; 2218 goto err_free; 2219 } 2220 2221 if (access_mode == MLX5_MKC_ACCESS_MODE_MTT) 2222 page_shift = PAGE_SHIFT; 2223 2224 err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift, 2225 access_mode, in, inlen); 2226 if (err) 2227 goto err_free_in; 2228 2229 mr->umem = NULL; 2230 kfree(in); 2231 2232 return mr; 2233 2234 err_free_in: 2235 kfree(in); 2236 err_free: 2237 kfree(mr); 2238 return ERR_PTR(err); 2239 } 2240 2241 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 2242 int ndescs, u32 *in, int inlen) 2243 { 2244 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt), 2245 PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in, 2246 inlen); 2247 } 2248 2249 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 2250 int ndescs, u32 *in, int inlen) 2251 { 2252 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm), 2253 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); 2254 } 2255 2256 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 2257 int max_num_sg, int max_num_meta_sg, 2258 u32 *in, int inlen) 2259 { 2260 struct mlx5_ib_dev *dev = to_mdev(pd->device); 2261 u32 psv_index[2]; 2262 void *mkc; 2263 int err; 2264 2265 mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); 2266 if (!mr->sig) 2267 return -ENOMEM; 2268 2269 /* create mem & wire PSVs */ 2270 err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index); 2271 if (err) 2272 goto err_free_sig; 2273 2274 mr->sig->psv_memory.psv_idx = psv_index[0]; 2275 
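	/* psv_index[0] above is the memory-domain PSV; psv_index[1] below is the wire-domain PSV */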
mr->sig->psv_wire.psv_idx = psv_index[1]; 2276 2277 mr->sig->sig_status_checked = true; 2278 mr->sig->sig_err_exists = false; 2279 /* Next UMR, Arm SIGERR */ 2280 ++mr->sig->sigerr_count; 2281 mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, 2282 sizeof(struct mlx5_klm), 2283 MLX5_MKC_ACCESS_MODE_KLMS); 2284 if (IS_ERR(mr->klm_mr)) { 2285 err = PTR_ERR(mr->klm_mr); 2286 goto err_destroy_psv; 2287 } 2288 mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, 2289 sizeof(struct mlx5_mtt), 2290 MLX5_MKC_ACCESS_MODE_MTT); 2291 if (IS_ERR(mr->mtt_mr)) { 2292 err = PTR_ERR(mr->mtt_mr); 2293 goto err_free_klm_mr; 2294 } 2295 2296 /* Set bsf descriptors for mkey */ 2297 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 2298 MLX5_SET(mkc, mkc, bsf_en, 1); 2299 MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); 2300 2301 err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0, 2302 MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); 2303 if (err) 2304 goto err_free_mtt_mr; 2305 2306 err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), 2307 mr->sig, GFP_KERNEL)); 2308 if (err) 2309 goto err_free_descs; 2310 return 0; 2311 2312 err_free_descs: 2313 destroy_mkey(dev, mr); 2314 mlx5_free_priv_descs(mr); 2315 err_free_mtt_mr: 2316 mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL); 2317 mr->mtt_mr = NULL; 2318 err_free_klm_mr: 2319 mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL); 2320 mr->klm_mr = NULL; 2321 err_destroy_psv: 2322 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx)) 2323 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", 2324 mr->sig->psv_memory.psv_idx); 2325 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) 2326 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", 2327 mr->sig->psv_wire.psv_idx); 2328 err_free_sig: 2329 kfree(mr->sig); 2330 2331 return err; 2332 } 2333 2334 static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, 2335 enum ib_mr_type mr_type, u32 max_num_sg, 2336 u32 max_num_meta_sg) 2337 { 2338 struct mlx5_ib_dev *dev = to_mdev(pd->device); 2339 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 2340 int ndescs = ALIGN(max_num_sg, 4); 2341 struct mlx5_ib_mr *mr; 2342 u32 *in; 2343 int err; 2344 2345 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 2346 if (!mr) 2347 return ERR_PTR(-ENOMEM); 2348 2349 in = kzalloc(inlen, GFP_KERNEL); 2350 if (!in) { 2351 err = -ENOMEM; 2352 goto err_free; 2353 } 2354 2355 mr->ibmr.device = pd->device; 2356 mr->umem = NULL; 2357 2358 switch (mr_type) { 2359 case IB_MR_TYPE_MEM_REG: 2360 err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen); 2361 break; 2362 case IB_MR_TYPE_SG_GAPS: 2363 err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen); 2364 break; 2365 case IB_MR_TYPE_INTEGRITY: 2366 err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg, 2367 max_num_meta_sg, in, inlen); 2368 break; 2369 default: 2370 mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type); 2371 err = -EINVAL; 2372 } 2373 2374 if (err) 2375 goto err_free_in; 2376 2377 kfree(in); 2378 2379 return &mr->ibmr; 2380 2381 err_free_in: 2382 kfree(in); 2383 err_free: 2384 kfree(mr); 2385 return ERR_PTR(err); 2386 } 2387 2388 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, 2389 u32 max_num_sg) 2390 { 2391 return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0); 2392 } 2393 2394 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd, 2395 u32 max_num_sg, u32 max_num_meta_sg) 2396 { 2397 return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg, 2398 max_num_meta_sg); 2399 } 
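/*
 * Allocate a memory window: the MW is backed by a "free" (not yet bound),
 * UMR-enabled KLM mkey on the caller's PD.  Remote invalidation (en_rinval)
 * is enabled only for type-2 windows, and the mkey is stored in the ODP
 * xarray so the page-fault path can find it if an ODP MR is later bound to
 * the window.
 */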
2400 2401 int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) 2402 { 2403 struct mlx5_ib_dev *dev = to_mdev(ibmw->device); 2404 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 2405 struct mlx5_ib_mw *mw = to_mmw(ibmw); 2406 unsigned int ndescs; 2407 u32 *in = NULL; 2408 void *mkc; 2409 int err; 2410 struct mlx5_ib_alloc_mw req = {}; 2411 struct { 2412 __u32 comp_mask; 2413 __u32 response_length; 2414 } resp = {}; 2415 2416 err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); 2417 if (err) 2418 return err; 2419 2420 if (req.comp_mask || req.reserved1 || req.reserved2) 2421 return -EOPNOTSUPP; 2422 2423 if (udata->inlen > sizeof(req) && 2424 !ib_is_udata_cleared(udata, sizeof(req), 2425 udata->inlen - sizeof(req))) 2426 return -EOPNOTSUPP; 2427 2428 ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4); 2429 2430 in = kzalloc(inlen, GFP_KERNEL); 2431 if (!in) 2432 return -ENOMEM; 2433 2434 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 2435 2436 MLX5_SET(mkc, mkc, free, 1); 2437 MLX5_SET(mkc, mkc, translations_octword_size, ndescs); 2438 MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn); 2439 MLX5_SET(mkc, mkc, umr_en, 1); 2440 MLX5_SET(mkc, mkc, lr, 1); 2441 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS); 2442 MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2))); 2443 MLX5_SET(mkc, mkc, qpn, 0xffffff); 2444 2445 err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen); 2446 if (err) 2447 goto free; 2448 2449 mw->mmkey.type = MLX5_MKEY_MW; 2450 ibmw->rkey = mw->mmkey.key; 2451 mw->mmkey.ndescs = ndescs; 2452 2453 resp.response_length = 2454 min(offsetofend(typeof(resp), response_length), udata->outlen); 2455 if (resp.response_length) { 2456 err = ib_copy_to_udata(udata, &resp, resp.response_length); 2457 if (err) 2458 goto free_mkey; 2459 } 2460 2461 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { 2462 err = mlx5r_store_odp_mkey(dev, &mw->mmkey); 2463 if (err) 2464 goto free_mkey; 2465 } 2466 2467 kfree(in); 2468 return 0; 2469 2470 free_mkey: 2471 mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key); 2472 free: 2473 kfree(in); 2474 return err; 2475 } 2476 2477 int mlx5_ib_dealloc_mw(struct ib_mw *mw) 2478 { 2479 struct mlx5_ib_dev *dev = to_mdev(mw->device); 2480 struct mlx5_ib_mw *mmw = to_mmw(mw); 2481 2482 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && 2483 xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key))) 2484 /* 2485 * pagefault_single_data_segment() may be accessing mmw 2486 * if the user bound an ODP MR to this MW. 
2487 */ 2488 mlx5r_deref_wait_odp_mkey(&mmw->mmkey); 2489 2490 return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key); 2491 } 2492 2493 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, 2494 struct ib_mr_status *mr_status) 2495 { 2496 struct mlx5_ib_mr *mmr = to_mmr(ibmr); 2497 int ret = 0; 2498 2499 if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { 2500 pr_err("Invalid status check mask\n"); 2501 ret = -EINVAL; 2502 goto done; 2503 } 2504 2505 mr_status->fail_status = 0; 2506 if (check_mask & IB_MR_CHECK_SIG_STATUS) { 2507 if (!mmr->sig) { 2508 ret = -EINVAL; 2509 pr_err("signature status check requested on a non-signature enabled MR\n"); 2510 goto done; 2511 } 2512 2513 mmr->sig->sig_status_checked = true; 2514 if (!mmr->sig->sig_err_exists) 2515 goto done; 2516 2517 if (ibmr->lkey == mmr->sig->err_item.key) 2518 memcpy(&mr_status->sig_err, &mmr->sig->err_item, 2519 sizeof(mr_status->sig_err)); 2520 else { 2521 mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; 2522 mr_status->sig_err.sig_err_offset = 0; 2523 mr_status->sig_err.key = mmr->sig->err_item.key; 2524 } 2525 2526 mmr->sig->sig_err_exists = false; 2527 mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; 2528 } 2529 2530 done: 2531 return ret; 2532 } 2533 2534 static int 2535 mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, 2536 int data_sg_nents, unsigned int *data_sg_offset, 2537 struct scatterlist *meta_sg, int meta_sg_nents, 2538 unsigned int *meta_sg_offset) 2539 { 2540 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2541 unsigned int sg_offset = 0; 2542 int n = 0; 2543 2544 mr->meta_length = 0; 2545 if (data_sg_nents == 1) { 2546 n++; 2547 mr->mmkey.ndescs = 1; 2548 if (data_sg_offset) 2549 sg_offset = *data_sg_offset; 2550 mr->data_length = sg_dma_len(data_sg) - sg_offset; 2551 mr->data_iova = sg_dma_address(data_sg) + sg_offset; 2552 if (meta_sg_nents == 1) { 2553 n++; 2554 mr->meta_ndescs = 1; 2555 if (meta_sg_offset) 2556 sg_offset = *meta_sg_offset; 2557 else 2558 sg_offset = 0; 2559 mr->meta_length = sg_dma_len(meta_sg) - sg_offset; 2560 mr->pi_iova = sg_dma_address(meta_sg) + sg_offset; 2561 } 2562 ibmr->length = mr->data_length + mr->meta_length; 2563 } 2564 2565 return n; 2566 } 2567 2568 static int 2569 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, 2570 struct scatterlist *sgl, 2571 unsigned short sg_nents, 2572 unsigned int *sg_offset_p, 2573 struct scatterlist *meta_sgl, 2574 unsigned short meta_sg_nents, 2575 unsigned int *meta_sg_offset_p) 2576 { 2577 struct scatterlist *sg = sgl; 2578 struct mlx5_klm *klms = mr->descs; 2579 unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; 2580 u32 lkey = mr->ibmr.pd->local_dma_lkey; 2581 int i, j = 0; 2582 2583 mr->ibmr.iova = sg_dma_address(sg) + sg_offset; 2584 mr->ibmr.length = 0; 2585 2586 for_each_sg(sgl, sg, sg_nents, i) { 2587 if (unlikely(i >= mr->max_descs)) 2588 break; 2589 klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset); 2590 klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset); 2591 klms[i].key = cpu_to_be32(lkey); 2592 mr->ibmr.length += sg_dma_len(sg) - sg_offset; 2593 2594 sg_offset = 0; 2595 } 2596 2597 if (sg_offset_p) 2598 *sg_offset_p = sg_offset; 2599 2600 mr->mmkey.ndescs = i; 2601 mr->data_length = mr->ibmr.length; 2602 2603 if (meta_sg_nents) { 2604 sg = meta_sgl; 2605 sg_offset = meta_sg_offset_p ? 
*meta_sg_offset_p : 0; 2606 for_each_sg(meta_sgl, sg, meta_sg_nents, j) { 2607 if (unlikely(i + j >= mr->max_descs)) 2608 break; 2609 klms[i + j].va = cpu_to_be64(sg_dma_address(sg) + 2610 sg_offset); 2611 klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) - 2612 sg_offset); 2613 klms[i + j].key = cpu_to_be32(lkey); 2614 mr->ibmr.length += sg_dma_len(sg) - sg_offset; 2615 2616 sg_offset = 0; 2617 } 2618 if (meta_sg_offset_p) 2619 *meta_sg_offset_p = sg_offset; 2620 2621 mr->meta_ndescs = j; 2622 mr->meta_length = mr->ibmr.length - mr->data_length; 2623 } 2624 2625 return i + j; 2626 } 2627 2628 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) 2629 { 2630 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2631 __be64 *descs; 2632 2633 if (unlikely(mr->mmkey.ndescs == mr->max_descs)) 2634 return -ENOMEM; 2635 2636 descs = mr->descs; 2637 descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR); 2638 2639 return 0; 2640 } 2641 2642 static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr) 2643 { 2644 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2645 __be64 *descs; 2646 2647 if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs)) 2648 return -ENOMEM; 2649 2650 descs = mr->descs; 2651 descs[mr->mmkey.ndescs + mr->meta_ndescs++] = 2652 cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR); 2653 2654 return 0; 2655 } 2656 2657 static int 2658 mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, 2659 int data_sg_nents, unsigned int *data_sg_offset, 2660 struct scatterlist *meta_sg, int meta_sg_nents, 2661 unsigned int *meta_sg_offset) 2662 { 2663 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2664 struct mlx5_ib_mr *pi_mr = mr->mtt_mr; 2665 int n; 2666 2667 pi_mr->mmkey.ndescs = 0; 2668 pi_mr->meta_ndescs = 0; 2669 pi_mr->meta_length = 0; 2670 2671 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map, 2672 pi_mr->desc_size * pi_mr->max_descs, 2673 DMA_TO_DEVICE); 2674 2675 pi_mr->ibmr.page_size = ibmr->page_size; 2676 n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset, 2677 mlx5_set_page); 2678 if (n != data_sg_nents) 2679 return n; 2680 2681 pi_mr->data_iova = pi_mr->ibmr.iova; 2682 pi_mr->data_length = pi_mr->ibmr.length; 2683 pi_mr->ibmr.length = pi_mr->data_length; 2684 ibmr->length = pi_mr->data_length; 2685 2686 if (meta_sg_nents) { 2687 u64 page_mask = ~((u64)ibmr->page_size - 1); 2688 u64 iova = pi_mr->data_iova; 2689 2690 n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents, 2691 meta_sg_offset, mlx5_set_page_pi); 2692 2693 pi_mr->meta_length = pi_mr->ibmr.length; 2694 /* 2695 * PI address for the HW is the offset of the metadata address 2696 * relative to the first data page address. 2697 * It equals to first data page address + size of data pages + 2698 * metadata offset at the first metadata page 2699 */ 2700 pi_mr->pi_iova = (iova & page_mask) + 2701 pi_mr->mmkey.ndescs * ibmr->page_size + 2702 (pi_mr->ibmr.iova & ~page_mask); 2703 /* 2704 * In order to use one MTT MR for data and metadata, we register 2705 * also the gaps between the end of the data and the start of 2706 * the metadata (the sig MR will verify that the HW will access 2707 * to right addresses). This mapping is safe because we use 2708 * internal mkey for the registration. 
 */
2710		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2711		pi_mr->ibmr.iova = iova;
2712		ibmr->length += pi_mr->meta_length;
2713	}
2714
2715	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2716				      pi_mr->desc_size * pi_mr->max_descs,
2717				      DMA_TO_DEVICE);
2718
2719	return n;
2720 }
2721
2722 static int
2723 mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2724			 int data_sg_nents, unsigned int *data_sg_offset,
2725			 struct scatterlist *meta_sg, int meta_sg_nents,
2726			 unsigned int *meta_sg_offset)
2727 {
2728	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2729	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2730	int n;
2731
2732	pi_mr->mmkey.ndescs = 0;
2733	pi_mr->meta_ndescs = 0;
2734	pi_mr->meta_length = 0;
2735
2736	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2737				   pi_mr->desc_size * pi_mr->max_descs,
2738				   DMA_TO_DEVICE);
2739
2740	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2741			       meta_sg, meta_sg_nents, meta_sg_offset);
2742
2743	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2744				      pi_mr->desc_size * pi_mr->max_descs,
2745				      DMA_TO_DEVICE);
2746
2747	/* This is a zero-based memory region */
2748	pi_mr->data_iova = 0;
2749	pi_mr->ibmr.iova = 0;
2750	pi_mr->pi_iova = pi_mr->data_length;
2751	ibmr->length = pi_mr->ibmr.length;
2752
2753	return n;
2754 }
2755
2756 int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2757			 int data_sg_nents, unsigned int *data_sg_offset,
2758			 struct scatterlist *meta_sg, int meta_sg_nents,
2759			 unsigned int *meta_sg_offset)
2760 {
2761	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2762	struct mlx5_ib_mr *pi_mr = NULL;
2763	int n;
2764
2765	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2766
2767	mr->mmkey.ndescs = 0;
2768	mr->data_length = 0;
2769	mr->data_iova = 0;
2770	mr->meta_ndescs = 0;
2771	mr->pi_iova = 0;
2772	/*
2773	 * As a performance optimization, if possible, there is no need to
2774	 * perform a UMR operation to register the data/metadata buffers.
2775	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
2776	 * Fall back to UMR only in case of a failure.
2777	 */
2778	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2779				    data_sg_offset, meta_sg, meta_sg_nents,
2780				    meta_sg_offset);
2781	if (n == data_sg_nents + meta_sg_nents)
2782		goto out;
2783	/*
2784	 * As a performance optimization, if possible, there is no need to map
2785	 * the sg lists to KLM descriptors. First try to map the sg lists to MTT
2786	 * descriptors and fall back to KLM only in case of a failure.
2787	 * It's more efficient for the HW to work with MTT descriptors
2788	 * (especially under high load).
2789	 * Use KLM (indirect access) only if it's mandatory.
	 */
2790
2791	pi_mr = mr->mtt_mr;
2792	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2793				     data_sg_offset, meta_sg, meta_sg_nents,
2794				     meta_sg_offset);
2795	if (n == data_sg_nents + meta_sg_nents)
2796		goto out;
2797
2798	pi_mr = mr->klm_mr;
2799	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2800				     data_sg_offset, meta_sg, meta_sg_nents,
2801				     meta_sg_offset);
2802	if (unlikely(n != data_sg_nents + meta_sg_nents))
2803		return -ENOMEM;
2804
2805 out:
2806	/* This is a zero-based memory region */
2807	ibmr->iova = 0;
2808	mr->pi_mr = pi_mr;
2809	if (pi_mr)
2810		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2811	else
2812		ibmr->sig_attrs->meta_length = mr->meta_length;
2813
2814	return 0;
2815 }
2816
2817 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2818		      unsigned int *sg_offset)
2819 {
2820	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2821	int n;
2822
2823	mr->mmkey.ndescs = 0;
2824
2825	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2826				   mr->desc_size * mr->max_descs,
2827				   DMA_TO_DEVICE);
2828
2829	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2830		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2831				       NULL);
2832	else
2833		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2834				   mlx5_set_page);
2835
2836	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2837				      mr->desc_size * mr->max_descs,
2838				      DMA_TO_DEVICE);
2839
2840	return n;
2841 }
2842