1 /*
2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3 * Copyright (c) 2020, Intel Corporation. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses. You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 * Redistribution and use in source and binary forms, with or
12 * without modification, are permitted provided that the following
13 * conditions are met:
14 *
15 * - Redistributions of source code must retain the above
16 * copyright notice, this list of conditions and the following
17 * disclaimer.
18 *
19 * - Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the following
21 * disclaimer in the documentation and/or other materials
22 * provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34
35 #include <linux/kref.h>
36 #include <linux/random.h>
37 #include <linux/debugfs.h>
38 #include <linux/export.h>
39 #include <linux/delay.h>
40 #include <linux/dma-buf.h>
41 #include <linux/dma-resv.h>
42 #include <rdma/ib_umem_odp.h>
43 #include "dm.h"
44 #include "mlx5_ib.h"
45 #include "umr.h"
46 #include "data_direct.h"
47
48 enum {
49 MAX_PENDING_REG_MR = 8,
50 };
51
52 #define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
53 #define MLX5_UMR_ALIGN 2048
54
55 static void
56 create_mkey_callback(int status, struct mlx5_async_work *context);
57 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
58 u64 iova, int access_flags,
59 unsigned long page_size, bool populate,
60 int access_mode);
61 static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);
62
63 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
64 struct ib_pd *pd)
65 {
66 struct mlx5_ib_dev *dev = to_mdev(pd->device);
67
68 MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
69 MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
70 MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
71 MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
72 MLX5_SET(mkc, mkc, lr, 1);
73
74 if (acc & IB_ACCESS_RELAXED_ORDERING) {
75 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
76 MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
77
78 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
79 (MLX5_CAP_GEN(dev->mdev,
80 relaxed_ordering_read_pci_enabled) &&
81 pcie_relaxed_ordering_enabled(dev->mdev->pdev)))
82 MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
83 }
84
85 MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
86 MLX5_SET(mkc, mkc, qpn, 0xffffff);
87 MLX5_SET64(mkc, mkc, start_addr, start_addr);
88 }
89
90 static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
91 {
92 u8 key = atomic_inc_return(&dev->mkey_var);
93 void *mkc;
94
95 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
96 MLX5_SET(mkc, mkc, mkey_7_0, key);
97 *mkey = key;
98 }
99
100 static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
101 struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
102 {
103 int ret;
104
105 assign_mkey_variant(dev, &mkey->key, in);
106 ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
107 if (!ret)
108 init_waitqueue_head(&mkey->wait);
109
110 return ret;
111 }
112
113 static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
114 {
115 struct mlx5_ib_dev *dev = async_create->ent->dev;
116 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
117 size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);
118
119 MLX5_SET(create_mkey_in, async_create->in, opcode,
120 MLX5_CMD_OP_CREATE_MKEY);
121 assign_mkey_variant(dev, &async_create->mkey, async_create->in);
122 return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
123 async_create->out, outlen, create_mkey_callback,
124 &async_create->cb_work);
125 }
126
127 static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
128 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
129
130 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
131 {
132 WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
133
134 return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
135 }
136
137 static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
138 {
139 if (status == -ENXIO) /* core driver is not available */
140 return;
141
142 mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
143 if (status != -EREMOTEIO) /* driver specific failure */
144 return;
145
146 /* Failed in FW, print cmd out failure details */
147 mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
148 }
149
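/*
 * Store a spare mkey at the tail of the entry's paged queue, allocating a
 * new page of slots when all current ones are full. Caller must hold
 * mkeys_queue.lock.
 */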
150 static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey)
151 {
152 unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE;
153 struct mlx5_mkeys_page *page;
154
155 lockdep_assert_held(&ent->mkeys_queue.lock);
156 if (ent->mkeys_queue.ci >=
157 ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) {
158 page = kzalloc(sizeof(*page), GFP_ATOMIC);
159 if (!page)
160 return -ENOMEM;
161 ent->mkeys_queue.num_pages++;
162 list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
163 } else {
164 page = list_last_entry(&ent->mkeys_queue.pages_list,
165 struct mlx5_mkeys_page, list);
166 }
167
168 page->mkeys[tmp] = mkey;
169 ent->mkeys_queue.ci++;
170 return 0;
171 }
172
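/*
 * Take the most recently pushed mkey from the entry's paged queue and free
 * the last page once it is emptied. Caller must hold mkeys_queue.lock.
 */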
173 static int pop_mkey_locked(struct mlx5_cache_ent *ent)
174 {
175 unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE;
176 struct mlx5_mkeys_page *last_page;
177 u32 mkey;
178
179 lockdep_assert_held(&ent->mkeys_queue.lock);
180 last_page = list_last_entry(&ent->mkeys_queue.pages_list,
181 struct mlx5_mkeys_page, list);
182 mkey = last_page->mkeys[tmp];
183 last_page->mkeys[tmp] = 0;
184 ent->mkeys_queue.ci--;
185 if (ent->mkeys_queue.num_pages > 1 && !tmp) {
186 list_del(&last_page->list);
187 ent->mkeys_queue.num_pages--;
188 kfree(last_page);
189 }
190 return mkey;
191 }
192
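/*
 * Completion handler for asynchronous mkey creation. On failure, set the
 * global fill delay and arm the delay timer; on success, push the new mkey
 * into the cache entry and re-evaluate whether more refilling is needed.
 */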
193 static void create_mkey_callback(int status, struct mlx5_async_work *context)
194 {
195 struct mlx5r_async_create_mkey *mkey_out =
196 container_of(context, struct mlx5r_async_create_mkey, cb_work);
197 struct mlx5_cache_ent *ent = mkey_out->ent;
198 struct mlx5_ib_dev *dev = ent->dev;
199 unsigned long flags;
200
201 if (status) {
202 create_mkey_warn(dev, status, mkey_out->out);
203 kfree(mkey_out);
204 spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
205 ent->pending--;
206 WRITE_ONCE(dev->fill_delay, 1);
207 spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
208 mod_timer(&dev->delay_timer, jiffies + HZ);
209 return;
210 }
211
212 mkey_out->mkey |= mlx5_idx_to_mkey(
213 MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
214 WRITE_ONCE(dev->cache.last_add, jiffies);
215
216 spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
217 push_mkey_locked(ent, mkey_out->mkey);
218 ent->pending--;
219 /* If we are doing fill_to_high_water then keep going. */
220 queue_adjust_cache_locked(ent);
221 spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
222 kfree(mkey_out);
223 }
224
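/* Translation table size, in octwords, needed to hold ndescs descriptors. */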
225 static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
226 {
227 int ret = 0;
228
229 switch (access_mode) {
230 case MLX5_MKC_ACCESS_MODE_MTT:
231 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
232 sizeof(struct mlx5_mtt));
233 break;
234 case MLX5_MKC_ACCESS_MODE_KSM:
235 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
236 sizeof(struct mlx5_klm));
237 break;
238 default:
239 WARN_ON(1);
240 }
241 return ret;
242 }
243
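/*
 * Fill an mkey context for a cache entry: created free and UMR-enabled, with
 * the access mode, ATS setting and descriptor count taken from the entry key.
 */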
244 static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
245 {
246 set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0,
247 ent->dev->umrc.pd);
248 MLX5_SET(mkc, mkc, free, 1);
249 MLX5_SET(mkc, mkc, umr_en, 1);
250 MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
251 MLX5_SET(mkc, mkc, access_mode_4_2,
252 (ent->rb_key.access_mode >> 2) & 0x7);
253 MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);
254
255 MLX5_SET(mkc, mkc, translations_octword_size,
256 get_mkc_octo_size(ent->rb_key.access_mode,
257 ent->rb_key.ndescs));
258 MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
259 }
260
261 /* Asynchronously schedule new MRs to be populated in the cache. */
262 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
263 {
264 struct mlx5r_async_create_mkey *async_create;
265 void *mkc;
266 int err = 0;
267 int i;
268
269 for (i = 0; i < num; i++) {
270 async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
271 GFP_KERNEL);
272 if (!async_create)
273 return -ENOMEM;
274 mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
275 memory_key_mkey_entry);
276 set_cache_mkc(ent, mkc);
277 async_create->ent = ent;
278
279 spin_lock_irq(&ent->mkeys_queue.lock);
280 if (ent->pending >= MAX_PENDING_REG_MR) {
281 err = -EAGAIN;
282 goto free_async_create;
283 }
284 ent->pending++;
285 spin_unlock_irq(&ent->mkeys_queue.lock);
286
287 err = mlx5_ib_create_mkey_cb(async_create);
288 if (err) {
289 mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
290 goto err_create_mkey;
291 }
292 }
293
294 return 0;
295
296 err_create_mkey:
297 spin_lock_irq(&ent->mkeys_queue.lock);
298 ent->pending--;
299 free_async_create:
300 spin_unlock_irq(&ent->mkeys_queue.lock);
301 kfree(async_create);
302 return err;
303 }
304
305 /* Synchronously create an MR in the cache */
306 static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
307 {
308 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
309 void *mkc;
310 u32 *in;
311 int err;
312
313 in = kzalloc(inlen, GFP_KERNEL);
314 if (!in)
315 return -ENOMEM;
316 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
317 set_cache_mkc(ent, mkc);
318
319 err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
320 if (err)
321 goto free_in;
322
323 WRITE_ONCE(ent->dev->cache.last_add, jiffies);
324 free_in:
325 kfree(in);
326 return err;
327 }
328
329 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
330 {
331 u32 mkey;
332
333 lockdep_assert_held(&ent->mkeys_queue.lock);
334 if (!ent->mkeys_queue.ci)
335 return;
336 mkey = pop_mkey_locked(ent);
337 spin_unlock_irq(&ent->mkeys_queue.lock);
338 mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
339 spin_lock_irq(&ent->mkeys_queue.lock);
340 }
341
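/*
 * Grow or shrink the number of available mkeys towards the target, either by
 * scheduling asynchronous creations or by destroying surplus mkeys. Entered
 * and exited with mkeys_queue.lock held; the lock is dropped around the
 * actual work.
 */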
342 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
343 bool limit_fill)
344 __acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock)
345 {
346 int err;
347
348 lockdep_assert_held(&ent->mkeys_queue.lock);
349
350 while (true) {
351 if (limit_fill)
352 target = ent->limit * 2;
353 if (target == ent->pending + ent->mkeys_queue.ci)
354 return 0;
355 if (target > ent->pending + ent->mkeys_queue.ci) {
356 u32 todo = target - (ent->pending + ent->mkeys_queue.ci);
357
358 spin_unlock_irq(&ent->mkeys_queue.lock);
359 err = add_keys(ent, todo);
360 if (err == -EAGAIN)
361 usleep_range(3000, 5000);
362 spin_lock_irq(&ent->mkeys_queue.lock);
363 if (err) {
364 if (err != -EAGAIN)
365 return err;
366 } else
367 return 0;
368 } else {
369 remove_cache_mr_locked(ent);
370 }
371 }
372 }
373
374 static ssize_t size_write(struct file *filp, const char __user *buf,
375 size_t count, loff_t *pos)
376 {
377 struct mlx5_cache_ent *ent = filp->private_data;
378 u32 target;
379 int err;
380
381 err = kstrtou32_from_user(buf, count, 0, &target);
382 if (err)
383 return err;
384
385 /*
386 * Target is the new value of total_mrs the user requests, however we
387 * cannot free MRs that are in use. Compute the target value for stored
388 * mkeys.
389 */
390 spin_lock_irq(&ent->mkeys_queue.lock);
391 if (target < ent->in_use) {
392 err = -EINVAL;
393 goto err_unlock;
394 }
395 target = target - ent->in_use;
396 if (target < ent->limit || target > ent->limit*2) {
397 err = -EINVAL;
398 goto err_unlock;
399 }
400 err = resize_available_mrs(ent, target, false);
401 if (err)
402 goto err_unlock;
403 spin_unlock_irq(&ent->mkeys_queue.lock);
404
405 return count;
406
407 err_unlock:
408 spin_unlock_irq(&ent->mkeys_queue.lock);
409 return err;
410 }
411
412 static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
413 loff_t *pos)
414 {
415 struct mlx5_cache_ent *ent = filp->private_data;
416 char lbuf[20];
417 int err;
418
419 err = snprintf(lbuf, sizeof(lbuf), "%ld\n",
420 ent->mkeys_queue.ci + ent->in_use);
421 if (err < 0)
422 return err;
423
424 return simple_read_from_buffer(buf, count, pos, lbuf, err);
425 }
426
427 static const struct file_operations size_fops = {
428 .owner = THIS_MODULE,
429 .open = simple_open,
430 .write = size_write,
431 .read = size_read,
432 };
433
434 static ssize_t limit_write(struct file *filp, const char __user *buf,
435 size_t count, loff_t *pos)
436 {
437 struct mlx5_cache_ent *ent = filp->private_data;
438 u32 var;
439 int err;
440
441 err = kstrtou32_from_user(buf, count, 0, &var);
442 if (err)
443 return err;
444
445 /*
446 * Upon set we immediately fill the cache to high water mark implied by
447 * the limit.
448 */
449 spin_lock_irq(&ent->mkeys_queue.lock);
450 ent->limit = var;
451 err = resize_available_mrs(ent, 0, true);
452 spin_unlock_irq(&ent->mkeys_queue.lock);
453 if (err)
454 return err;
455 return count;
456 }
457
458 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
459 loff_t *pos)
460 {
461 struct mlx5_cache_ent *ent = filp->private_data;
462 char lbuf[20];
463 int err;
464
465 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
466 if (err < 0)
467 return err;
468
469 return simple_read_from_buffer(buf, count, pos, lbuf, err);
470 }
471
472 static const struct file_operations limit_fops = {
473 .owner = THIS_MODULE,
474 .open = simple_open,
475 .write = limit_write,
476 .read = limit_read,
477 };
478
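/* True if any cache entry currently holds fewer mkeys than its limit. */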
479 static bool someone_adding(struct mlx5_mkey_cache *cache)
480 {
481 struct mlx5_cache_ent *ent;
482 struct rb_node *node;
483 bool ret;
484
485 mutex_lock(&cache->rb_lock);
486 for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
487 ent = rb_entry(node, struct mlx5_cache_ent, node);
488 spin_lock_irq(&ent->mkeys_queue.lock);
489 ret = ent->mkeys_queue.ci < ent->limit;
490 spin_unlock_irq(&ent->mkeys_queue.lock);
491 if (ret) {
492 mutex_unlock(&cache->rb_lock);
493 return true;
494 }
495 }
496 mutex_unlock(&cache->rb_lock);
497 return false;
498 }
499
500 /*
501 * Check if the bucket is outside the high/low water mark and schedule an async
502 * update. The cache refill has hysteresis, once the low water mark is hit it is
503 * refilled up to the high mark.
504 */
505 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
506 {
507 lockdep_assert_held(&ent->mkeys_queue.lock);
508
509 if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
510 return;
511 if (ent->mkeys_queue.ci < ent->limit) {
512 ent->fill_to_high_water = true;
513 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
514 } else if (ent->fill_to_high_water &&
515 ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) {
516 /*
517 * Once we start populating due to hitting a low water mark
518 * continue until we pass the high water mark.
519 */
520 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
521 } else if (ent->mkeys_queue.ci == 2 * ent->limit) {
522 ent->fill_to_high_water = false;
523 } else if (ent->mkeys_queue.ci > 2 * ent->limit) {
524 /* Queue deletion of excess entries */
525 ent->fill_to_high_water = false;
526 if (ent->pending)
527 queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
528 msecs_to_jiffies(1000));
529 else
530 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
531 }
532 }
533
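/* Destroy every mkey currently stored in the entry's queue. */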
534 static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
535 {
536 u32 mkey;
537
538 spin_lock_irq(&ent->mkeys_queue.lock);
539 while (ent->mkeys_queue.ci) {
540 mkey = pop_mkey_locked(ent);
541 spin_unlock_irq(&ent->mkeys_queue.lock);
542 mlx5_core_destroy_mkey(dev->mdev, mkey);
543 spin_lock_irq(&ent->mkeys_queue.lock);
544 }
545 ent->tmp_cleanup_scheduled = false;
546 spin_unlock_irq(&ent->mkeys_queue.lock);
547 }
548
549 static void __cache_work_func(struct mlx5_cache_ent *ent)
550 {
551 struct mlx5_ib_dev *dev = ent->dev;
552 struct mlx5_mkey_cache *cache = &dev->cache;
553 int err;
554
555 spin_lock_irq(&ent->mkeys_queue.lock);
556 if (ent->disabled)
557 goto out;
558
559 if (ent->fill_to_high_water &&
560 ent->mkeys_queue.ci + ent->pending < 2 * ent->limit &&
561 !READ_ONCE(dev->fill_delay)) {
562 spin_unlock_irq(&ent->mkeys_queue.lock);
563 err = add_keys(ent, 1);
564 spin_lock_irq(&ent->mkeys_queue.lock);
565 if (ent->disabled)
566 goto out;
567 if (err) {
568 /*
569 * EAGAIN only happens if there are pending MRs, so we
570 * will be rescheduled when storing them. The only
571 * failure path here is ENOMEM.
572 */
573 if (err != -EAGAIN) {
574 mlx5_ib_warn(
575 dev,
576 "add keys command failed, err %d\n",
577 err);
578 queue_delayed_work(cache->wq, &ent->dwork,
579 msecs_to_jiffies(1000));
580 }
581 }
582 } else if (ent->mkeys_queue.ci > 2 * ent->limit) {
583 bool need_delay;
584
585 /*
586 * The remove_cache_mr() logic is performed as a garbage
587 * collection task. Such a task is intended to be run when no
588 * other active processes are running.
589 *
590 * The need_resched() will return TRUE if there are user tasks
591 * to be activated in the near future.
592 *
593 * In such a case, we don't execute remove_cache_mr() and postpone
594 * the garbage collection work to try to run in the next cycle, in
595 * order to free CPU resources to other tasks.
596 */
597 spin_unlock_irq(&ent->mkeys_queue.lock);
598 need_delay = need_resched() || someone_adding(cache) ||
599 !time_after(jiffies,
600 READ_ONCE(cache->last_add) + 300 * HZ);
601 spin_lock_irq(&ent->mkeys_queue.lock);
602 if (ent->disabled)
603 goto out;
604 if (need_delay) {
605 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
606 goto out;
607 }
608 remove_cache_mr_locked(ent);
609 queue_adjust_cache_locked(ent);
610 }
611 out:
612 spin_unlock_irq(&ent->mkeys_queue.lock);
613 }
614
615 static void delayed_cache_work_func(struct work_struct *work)
616 {
617 struct mlx5_cache_ent *ent;
618
619 ent = container_of(work, struct mlx5_cache_ent, dwork.work);
620 /* temp entries are never filled, only cleaned */
621 if (ent->is_tmp)
622 clean_keys(ent->dev, ent);
623 else
624 __cache_work_func(ent);
625 }
626
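/* Ordering function for cache entries in the rb-tree. */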
627 static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
628 struct mlx5r_cache_rb_key key2)
629 {
630 int res;
631
632 res = key1.ats - key2.ats;
633 if (res)
634 return res;
635
636 res = key1.access_mode - key2.access_mode;
637 if (res)
638 return res;
639
640 res = key1.access_flags - key2.access_flags;
641 if (res)
642 return res;
643
644 /*
645 * Keep ndescs last in the compare order since the find function
646 * searches for an exact match on all other properties and only the
647 * closest match in size.
648 */
649 return key1.ndescs - key2.ndescs;
650 }
651
652 static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
653 struct mlx5_cache_ent *ent)
654 {
655 struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
656 struct mlx5_cache_ent *cur;
657 int cmp;
658
659 /* Figure out where to put new node */
660 while (*new) {
661 cur = rb_entry(*new, struct mlx5_cache_ent, node);
662 parent = *new;
663 cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
664 if (cmp > 0)
665 new = &((*new)->rb_left);
666 if (cmp < 0)
667 new = &((*new)->rb_right);
668 if (cmp == 0)
669 return -EEXIST;
670 }
671
672 /* Add new node and rebalance tree. */
673 rb_link_node(&ent->node, parent, new);
674 rb_insert_color(&ent->node, &cache->rb_root);
675
676 return 0;
677 }
678
679 static struct mlx5_cache_ent *
680 mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
681 struct mlx5r_cache_rb_key rb_key)
682 {
683 struct rb_node *node = dev->cache.rb_root.rb_node;
684 struct mlx5_cache_ent *cur, *smallest = NULL;
685 u64 ndescs_limit;
686 int cmp;
687
688 /*
689 * Find the smallest ent with order >= requested_order.
690 */
691 while (node) {
692 cur = rb_entry(node, struct mlx5_cache_ent, node);
693 cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
694 if (cmp > 0) {
695 smallest = cur;
696 node = node->rb_left;
697 }
698 if (cmp < 0)
699 node = node->rb_right;
700 if (cmp == 0)
701 return cur;
702 }
703
704 /*
705 * Limit the usage of mkeys larger than twice the required size while
706 * also allowing the usage of the smallest cache entry for small MRs.
707 */
708 ndescs_limit = max_t(u64, rb_key.ndescs * 2,
709 MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);
710
711 return (smallest &&
712 smallest->rb_key.access_mode == rb_key.access_mode &&
713 smallest->rb_key.access_flags == rb_key.access_flags &&
714 smallest->rb_key.ats == rb_key.ats &&
715 smallest->rb_key.ndescs <= ndescs_limit) ?
716 smallest :
717 NULL;
718 }
719
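/*
 * Allocate an MR from the given cache entry, creating an mkey synchronously
 * if the entry is currently empty.
 */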
720 static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
721 struct mlx5_cache_ent *ent)
722 {
723 struct mlx5_ib_mr *mr;
724 int err;
725
726 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
727 if (!mr)
728 return ERR_PTR(-ENOMEM);
729
730 spin_lock_irq(&ent->mkeys_queue.lock);
731 ent->in_use++;
732
733 if (!ent->mkeys_queue.ci) {
734 queue_adjust_cache_locked(ent);
735 ent->miss++;
736 spin_unlock_irq(&ent->mkeys_queue.lock);
737 err = create_cache_mkey(ent, &mr->mmkey.key);
738 if (err) {
739 spin_lock_irq(&ent->mkeys_queue.lock);
740 ent->in_use--;
741 spin_unlock_irq(&ent->mkeys_queue.lock);
742 kfree(mr);
743 return ERR_PTR(err);
744 }
745 } else {
746 mr->mmkey.key = pop_mkey_locked(ent);
747 queue_adjust_cache_locked(ent);
748 spin_unlock_irq(&ent->mkeys_queue.lock);
749 }
750 mr->mmkey.cache_ent = ent;
751 mr->mmkey.type = MLX5_MKEY_MR;
752 mr->mmkey.rb_key = ent->rb_key;
753 mr->mmkey.cacheable = true;
754 init_waitqueue_head(&mr->mmkey.wait);
755 return mr;
756 }
757
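/*
 * Return the subset of access flags that UMR cannot change on this device.
 * These flags are part of the cache entry key, since mkeys differing in them
 * cannot be reused across registrations.
 */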
758 static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
759 int access_flags)
760 {
761 int ret = 0;
762
763 if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
764 MLX5_CAP_GEN(dev->mdev, atomic) &&
765 MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
766 ret |= IB_ACCESS_REMOTE_ATOMIC;
767
768 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
769 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
770 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
771 ret |= IB_ACCESS_RELAXED_ORDERING;
772
773 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
774 (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
775 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled)) &&
776 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
777 ret |= IB_ACCESS_RELAXED_ORDERING;
778
779 return ret;
780 }
781
782 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
783 int access_flags, int access_mode,
784 int ndescs)
785 {
786 struct mlx5r_cache_rb_key rb_key = {
787 .ndescs = ndescs,
788 .access_mode = access_mode,
789 .access_flags = get_unchangeable_access_flags(dev, access_flags)
790 };
791 struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);
792
793 if (!ent)
794 return ERR_PTR(-EOPNOTSUPP);
795
796 return _mlx5_mr_cache_alloc(dev, ent);
797 }
798
799 static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
800 {
801 if (!mlx5_debugfs_root || dev->is_rep)
802 return;
803
804 debugfs_remove_recursive(dev->cache.fs_root);
805 dev->cache.fs_root = NULL;
806 }
807
808 static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
809 struct mlx5_cache_ent *ent)
810 {
811 int order = order_base_2(ent->rb_key.ndescs);
812 struct dentry *dir;
813
814 if (!mlx5_debugfs_root || dev->is_rep)
815 return;
816
817 if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
818 order = MLX5_IMR_KSM_CACHE_ENTRY + 2;
819
820 sprintf(ent->name, "%d", order);
821 dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
822 debugfs_create_file("size", 0600, dir, ent, &size_fops);
823 debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
824 debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci);
825 debugfs_create_u32("miss", 0600, dir, &ent->miss);
826 }
827
828 static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
829 {
830 struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
831 struct mlx5_mkey_cache *cache = &dev->cache;
832
833 if (!mlx5_debugfs_root || dev->is_rep)
834 return;
835
836 cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
837 }
838
839 static void delay_time_func(struct timer_list *t)
840 {
841 struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
842
843 WRITE_ONCE(dev->fill_delay, 0);
844 }
845
846 static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent)
847 {
848 struct mlx5_mkeys_page *page;
849
850 page = kzalloc(sizeof(*page), GFP_KERNEL);
851 if (!page)
852 return -ENOMEM;
853 INIT_LIST_HEAD(&ent->mkeys_queue.pages_list);
854 spin_lock_init(&ent->mkeys_queue.lock);
855 list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
856 ent->mkeys_queue.num_pages++;
857 return 0;
858 }
859
860 static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent)
861 {
862 struct mlx5_mkeys_page *page;
863
864 WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1);
865 page = list_last_entry(&ent->mkeys_queue.pages_list,
866 struct mlx5_mkeys_page, list);
867 list_del(&page->list);
868 kfree(page);
869 }
870
871 struct mlx5_cache_ent *
872 mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
873 struct mlx5r_cache_rb_key rb_key,
874 bool persistent_entry)
875 {
876 struct mlx5_cache_ent *ent;
877 int order;
878 int ret;
879
880 ent = kzalloc(sizeof(*ent), GFP_KERNEL);
881 if (!ent)
882 return ERR_PTR(-ENOMEM);
883
884 ret = mlx5r_mkeys_init(ent);
885 if (ret)
886 goto mkeys_err;
887 ent->rb_key = rb_key;
888 ent->dev = dev;
889 ent->is_tmp = !persistent_entry;
890
891 INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
892
893 ret = mlx5_cache_ent_insert(&dev->cache, ent);
894 if (ret)
895 goto ent_insert_err;
896
897 if (persistent_entry) {
898 if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
899 order = MLX5_IMR_KSM_CACHE_ENTRY;
900 else
901 order = order_base_2(rb_key.ndescs) - 2;
902
903 if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
904 !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
905 mlx5r_umr_can_load_pas(dev, 0))
906 ent->limit = dev->mdev->profile.mr_cache[order].limit;
907 else
908 ent->limit = 0;
909
910 mlx5_mkey_cache_debugfs_add_ent(dev, ent);
911 }
912
913 return ent;
914 ent_insert_err:
915 mlx5r_mkeys_uninit(ent);
916 mkeys_err:
917 kfree(ent);
918 return ERR_PTR(ret);
919 }
920
921 static void mlx5r_destroy_cache_entries(struct mlx5_ib_dev *dev)
922 {
923 struct rb_root *root = &dev->cache.rb_root;
924 struct mlx5_cache_ent *ent;
925 struct rb_node *node;
926
927 mutex_lock(&dev->cache.rb_lock);
928 node = rb_first(root);
929 while (node) {
930 ent = rb_entry(node, struct mlx5_cache_ent, node);
931 node = rb_next(node);
932 clean_keys(dev, ent);
933 rb_erase(&ent->node, root);
934 mlx5r_mkeys_uninit(ent);
935 kfree(ent);
936 }
937 mutex_unlock(&dev->cache.rb_lock);
938 }
939
940 int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
941 {
942 struct mlx5_mkey_cache *cache = &dev->cache;
943 struct rb_root *root = &dev->cache.rb_root;
944 struct mlx5r_cache_rb_key rb_key = {
945 .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
946 };
947 struct mlx5_cache_ent *ent;
948 struct rb_node *node;
949 int ret;
950 int i;
951
952 mutex_init(&dev->slow_path_mutex);
953 mutex_init(&dev->cache.rb_lock);
954 dev->cache.rb_root = RB_ROOT;
955 cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
956 if (!cache->wq) {
957 mlx5_ib_warn(dev, "failed to create work queue\n");
958 return -ENOMEM;
959 }
960
961 mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
962 timer_setup(&dev->delay_timer, delay_time_func, 0);
963 mlx5_mkey_cache_debugfs_init(dev);
964 mutex_lock(&cache->rb_lock);
965 for (i = 0; i <= mkey_cache_max_order(dev); i++) {
966 rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
967 ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
968 if (IS_ERR(ent)) {
969 ret = PTR_ERR(ent);
970 goto err;
971 }
972 }
973
974 ret = mlx5_odp_init_mkey_cache(dev);
975 if (ret)
976 goto err;
977
978 mutex_unlock(&cache->rb_lock);
979 for (node = rb_first(root); node; node = rb_next(node)) {
980 ent = rb_entry(node, struct mlx5_cache_ent, node);
981 spin_lock_irq(&ent->mkeys_queue.lock);
982 queue_adjust_cache_locked(ent);
983 spin_unlock_irq(&ent->mkeys_queue.lock);
984 }
985
986 return 0;
987
988 err:
989 mutex_unlock(&cache->rb_lock);
990 mlx5_mkey_cache_debugfs_cleanup(dev);
991 mlx5r_destroy_cache_entries(dev);
992 destroy_workqueue(cache->wq);
993 mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
994 return ret;
995 }
996
997 void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
998 {
999 struct rb_root *root = &dev->cache.rb_root;
1000 struct mlx5_cache_ent *ent;
1001 struct rb_node *node;
1002
1003 if (!dev->cache.wq)
1004 return;
1005
1006 mutex_lock(&dev->cache.rb_lock);
1007 for (node = rb_first(root); node; node = rb_next(node)) {
1008 ent = rb_entry(node, struct mlx5_cache_ent, node);
1009 spin_lock_irq(&ent->mkeys_queue.lock);
1010 ent->disabled = true;
1011 spin_unlock_irq(&ent->mkeys_queue.lock);
1012 cancel_delayed_work(&ent->dwork);
1013 }
1014 mutex_unlock(&dev->cache.rb_lock);
1015
1016 /*
1017 * After all entries are disabled and will not reschedule on WQ,
1018 * flush it and all async commands.
1019 */
1020 flush_workqueue(dev->cache.wq);
1021
1022 mlx5_mkey_cache_debugfs_cleanup(dev);
1023 mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
1024
1025 /* At this point all entries are disabled and have no concurrent work. */
1026 mlx5r_destroy_cache_entries(dev);
1027
1028 destroy_workqueue(dev->cache.wq);
1029 timer_delete_sync(&dev->delay_timer);
1030 }
1031
1032 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
1033 {
1034 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1035 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1036 struct mlx5_ib_mr *mr;
1037 void *mkc;
1038 u32 *in;
1039 int err;
1040
1041 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1042 if (!mr)
1043 return ERR_PTR(-ENOMEM);
1044
1045 in = kzalloc(inlen, GFP_KERNEL);
1046 if (!in) {
1047 err = -ENOMEM;
1048 goto err_free;
1049 }
1050
1051 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1052
1053 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
1054 MLX5_SET(mkc, mkc, length64, 1);
1055 set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
1056 pd);
1057 MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
1058
1059 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1060 if (err)
1061 goto err_in;
1062
1063 kfree(in);
1064 mr->mmkey.type = MLX5_MKEY_MR;
1065 mr->ibmr.lkey = mr->mmkey.key;
1066 mr->ibmr.rkey = mr->mmkey.key;
1067 mr->umem = NULL;
1068
1069 return &mr->ibmr;
1070
1071 err_in:
1072 kfree(in);
1073
1074 err_free:
1075 kfree(mr);
1076
1077 return ERR_PTR(err);
1078 }
1079
1080 static int get_octo_len(u64 addr, u64 len, int page_shift)
1081 {
1082 u64 page_size = 1ULL << page_shift;
1083 u64 offset;
1084 int npages;
1085
1086 offset = addr & (page_size - 1);
1087 npages = ALIGN(len + offset, page_size) >> page_shift;
1088 return (npages + 1) / 2;
1089 }
1090
1091 static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
1092 {
1093 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
1094 return MKEY_CACHE_LAST_STD_ENTRY;
1095 return MLX5_MAX_UMR_SHIFT;
1096 }
1097
1098 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
1099 u64 length, int access_flags, u64 iova)
1100 {
1101 mr->ibmr.lkey = mr->mmkey.key;
1102 mr->ibmr.rkey = mr->mmkey.key;
1103 mr->ibmr.length = length;
1104 mr->ibmr.device = &dev->ib_dev;
1105 mr->ibmr.iova = iova;
1106 mr->access_flags = access_flags;
1107 }
1108
1109 static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
1110 u64 iova)
1111 {
1112 /*
1113 * The alignment of iova has already been checked upon entering
1114 * UVERBS_METHOD_REG_DMABUF_MR
1115 */
1116 umem->iova = iova;
1117 return PAGE_SIZE;
1118 }
1119
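/*
 * Allocate an MR for the umem from the mkey cache when a matching entry
 * exists; otherwise fall back to creating a non-populated mkey directly via
 * reg_create().
 */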
1120 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
1121 struct ib_umem *umem, u64 iova,
1122 int access_flags, int access_mode)
1123 {
1124 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1125 struct mlx5r_cache_rb_key rb_key = {};
1126 struct mlx5_cache_ent *ent;
1127 struct mlx5_ib_mr *mr;
1128 unsigned long page_size;
1129
1130 if (umem->is_dmabuf)
1131 page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
1132 else
1133 page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
1134 if (WARN_ON(!page_size))
1135 return ERR_PTR(-EINVAL);
1136
1137 rb_key.access_mode = access_mode;
1138 rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
1139 rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
1140 rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
1141 ent = mkey_cache_ent_from_rb_key(dev, rb_key);
1142 /*
1143 * If the MR can't come from the cache then synchronously create an uncached
1144 * one.
1145 */
1146 if (!ent) {
1147 mutex_lock(&dev->slow_path_mutex);
1148 mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode);
1149 mutex_unlock(&dev->slow_path_mutex);
1150 if (IS_ERR(mr))
1151 return mr;
1152 mr->mmkey.rb_key = rb_key;
1153 mr->mmkey.cacheable = true;
1154 return mr;
1155 }
1156
1157 mr = _mlx5_mr_cache_alloc(dev, ent);
1158 if (IS_ERR(mr))
1159 return mr;
1160
1161 mr->ibmr.pd = pd;
1162 mr->umem = umem;
1163 mr->page_shift = order_base_2(page_size);
1164 set_mr_fields(dev, mr, umem->length, access_flags, iova);
1165
1166 return mr;
1167 }
1168
1169 static struct ib_mr *
1170 reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags,
1171 u32 crossed_lkey)
1172 {
1173 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1174 int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
1175 struct mlx5_ib_mr *mr;
1176 void *mkc;
1177 int inlen;
1178 u32 *in;
1179 int err;
1180
1181 if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey))
1182 return ERR_PTR(-EOPNOTSUPP);
1183
1184 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1185 if (!mr)
1186 return ERR_PTR(-ENOMEM);
1187
1188 inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1189 in = kvzalloc(inlen, GFP_KERNEL);
1190 if (!in) {
1191 err = -ENOMEM;
1192 goto err_1;
1193 }
1194
1195 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1196 MLX5_SET(mkc, mkc, crossing_target_vhca_id,
1197 MLX5_CAP_GEN(dev->mdev, vhca_id));
1198 MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey);
1199 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
1200 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
1201
1202 /* for this crossing mkey IOVA should be 0 and len should be IOVA + len */
1203 set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
1204 MLX5_SET64(mkc, mkc, len, iova + length);
1205
1206 MLX5_SET(mkc, mkc, free, 0);
1207 MLX5_SET(mkc, mkc, umr_en, 0);
1208 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1209 if (err)
1210 goto err_2;
1211
1212 mr->mmkey.type = MLX5_MKEY_MR;
1213 set_mr_fields(dev, mr, length, access_flags, iova);
1214 mr->ibmr.pd = pd;
1215 kvfree(in);
1216 mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key);
1217
1218 return &mr->ibmr;
1219 err_2:
1220 kvfree(in);
1221 err_1:
1222 kfree(mr);
1223 return ERR_PTR(err);
1224 }
1225
1226 /*
1227 * reg_create allocates a new mlx5_ib_mr and creates its mkey directly with
1228 * a firmware command (the non-cached path), optionally populating the PAS.
1229 */
1230 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
1231 u64 iova, int access_flags,
1232 unsigned long page_size, bool populate,
1233 int access_mode)
1234 {
1235 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1236 struct mlx5_ib_mr *mr;
1237 __be64 *pas;
1238 void *mkc;
1239 int inlen;
1240 u32 *in;
1241 int err;
1242 bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
1243 (access_mode == MLX5_MKC_ACCESS_MODE_MTT);
1244 bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
1245
1246 if (!page_size)
1247 return ERR_PTR(-EINVAL);
1248 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1249 if (!mr)
1250 return ERR_PTR(-ENOMEM);
1251
1252 mr->ibmr.pd = pd;
1253 mr->access_flags = access_flags;
1254 mr->page_shift = order_base_2(page_size);
1255
1256 inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1257 if (populate)
1258 inlen += sizeof(*pas) *
1259 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
1260 in = kvzalloc(inlen, GFP_KERNEL);
1261 if (!in) {
1262 err = -ENOMEM;
1263 goto err_1;
1264 }
1265 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1266 if (populate) {
1267 if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
1268 err = -EINVAL;
1269 goto err_2;
1270 }
1271 mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
1272 pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1273 }
1274
1275 /* The pg_access bit allows setting the access flags
1276 * in the page list submitted with the command.
1277 */
1278 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1279
1280 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1281 set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
1282 populate ? pd : dev->umrc.pd);
1283 /* In case of a data direct flow, overwrite the pdn field with its internal kernel PD */
1284 if (umem->is_dmabuf && ksm_mode)
1285 MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);
1286
1287 MLX5_SET(mkc, mkc, free, !populate);
1288 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode);
1289 MLX5_SET(mkc, mkc, umr_en, 1);
1290
1291 MLX5_SET64(mkc, mkc, len, umem->length);
1292 MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1293 if (ksm_mode)
1294 MLX5_SET(mkc, mkc, translations_octword_size,
1295 get_octo_len(iova, umem->length, mr->page_shift) * 2);
1296 else
1297 MLX5_SET(mkc, mkc, translations_octword_size,
1298 get_octo_len(iova, umem->length, mr->page_shift));
1299 MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
1300 if (mlx5_umem_needs_ats(dev, umem, access_flags))
1301 MLX5_SET(mkc, mkc, ma_translation_mode, 1);
1302 if (populate) {
1303 MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1304 get_octo_len(iova, umem->length, mr->page_shift));
1305 }
1306
1307 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1308 if (err) {
1309 mlx5_ib_warn(dev, "create mkey failed\n");
1310 goto err_2;
1311 }
1312 mr->mmkey.type = MLX5_MKEY_MR;
1313 mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
1314 mr->umem = umem;
1315 set_mr_fields(dev, mr, umem->length, access_flags, iova);
1316 kvfree(in);
1317
1318 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1319
1320 return mr;
1321
1322 err_2:
1323 kvfree(in);
1324 err_1:
1325 kfree(mr);
1326 return ERR_PTR(err);
1327 }
1328
1329 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1330 u64 length, int acc, int mode)
1331 {
1332 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1333 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1334 struct mlx5_ib_mr *mr;
1335 void *mkc;
1336 u32 *in;
1337 int err;
1338
1339 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1340 if (!mr)
1341 return ERR_PTR(-ENOMEM);
1342
1343 in = kzalloc(inlen, GFP_KERNEL);
1344 if (!in) {
1345 err = -ENOMEM;
1346 goto err_free;
1347 }
1348
1349 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1350
1351 MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1352 MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1353 MLX5_SET64(mkc, mkc, len, length);
1354 set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1355
1356 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1357 if (err)
1358 goto err_in;
1359
1360 kfree(in);
1361
1362 set_mr_fields(dev, mr, length, acc, start_addr);
1363
1364 return &mr->ibmr;
1365
1366 err_in:
1367 kfree(in);
1368
1369 err_free:
1370 kfree(mr);
1371
1372 return ERR_PTR(err);
1373 }
1374
1375 int mlx5_ib_advise_mr(struct ib_pd *pd,
1376 enum ib_uverbs_advise_mr_advice advice,
1377 u32 flags,
1378 struct ib_sge *sg_list,
1379 u32 num_sge,
1380 struct uverbs_attr_bundle *attrs)
1381 {
1382 if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1383 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1384 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1385 return -EOPNOTSUPP;
1386
1387 return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1388 sg_list, num_sge);
1389 }
1390
1391 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1392 struct ib_dm_mr_attr *attr,
1393 struct uverbs_attr_bundle *attrs)
1394 {
1395 struct mlx5_ib_dm *mdm = to_mdm(dm);
1396 struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1397 u64 start_addr = mdm->dev_addr + attr->offset;
1398 int mode;
1399
1400 switch (mdm->type) {
1401 case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1402 if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1403 return ERR_PTR(-EINVAL);
1404
1405 mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1406 start_addr -= pci_resource_start(dev->pdev, 0);
1407 break;
1408 case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1409 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1410 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
1411 case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
1412 if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1413 return ERR_PTR(-EINVAL);
1414
1415 mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1416 break;
1417 default:
1418 return ERR_PTR(-EINVAL);
1419 }
1420
1421 return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1422 attr->access_flags, mode);
1423 }
1424
1425 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
1426 u64 iova, int access_flags)
1427 {
1428 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1429 struct mlx5_ib_mr *mr = NULL;
1430 bool xlt_with_umr;
1431 int err;
1432
1433 xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
1434 if (xlt_with_umr) {
1435 mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
1436 MLX5_MKC_ACCESS_MODE_MTT);
1437 } else {
1438 unsigned long page_size =
1439 mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
1440
1441 mutex_lock(&dev->slow_path_mutex);
1442 mr = reg_create(pd, umem, iova, access_flags, page_size,
1443 true, MLX5_MKC_ACCESS_MODE_MTT);
1444 mutex_unlock(&dev->slow_path_mutex);
1445 }
1446 if (IS_ERR(mr)) {
1447 ib_umem_release(umem);
1448 return ERR_CAST(mr);
1449 }
1450
1451 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1452
1453 atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1454
1455 if (xlt_with_umr) {
1456 /*
1457 * If the MR was created with reg_create then it will be
1458 * configured properly but left disabled. It is safe to go ahead
1459 * and configure it again via UMR while enabling it.
1460 */
1461 err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
1462 if (err) {
1463 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1464 return ERR_PTR(err);
1465 }
1466 }
1467 return &mr->ibmr;
1468 }
1469
1470 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
1471 u64 iova, int access_flags,
1472 struct ib_udata *udata)
1473 {
1474 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1475 struct ib_umem_odp *odp;
1476 struct mlx5_ib_mr *mr;
1477 int err;
1478
1479 if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1480 return ERR_PTR(-EOPNOTSUPP);
1481
1482 err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
1483 if (err)
1484 return ERR_PTR(err);
1485 if (!start && length == U64_MAX) {
1486 if (iova != 0)
1487 return ERR_PTR(-EINVAL);
1488 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1489 return ERR_PTR(-EINVAL);
1490
1491 mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
1492 if (IS_ERR(mr))
1493 return ERR_CAST(mr);
1494 return &mr->ibmr;
1495 }
1496
1497 /* ODP requires xlt update via umr to work. */
1498 if (!mlx5r_umr_can_load_pas(dev, length))
1499 return ERR_PTR(-EINVAL);
1500
1501 odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
1502 &mlx5_mn_ops);
1503 if (IS_ERR(odp))
1504 return ERR_CAST(odp);
1505
1506 mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
1507 MLX5_MKC_ACCESS_MODE_MTT);
1508 if (IS_ERR(mr)) {
1509 ib_umem_release(&odp->umem);
1510 return ERR_CAST(mr);
1511 }
1512 xa_init(&mr->implicit_children);
1513
1514 odp->private = mr;
1515 err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1516 if (err)
1517 goto err_dereg_mr;
1518
1519 err = mlx5_ib_init_odp_mr(mr);
1520 if (err)
1521 goto err_dereg_mr;
1522 return &mr->ibmr;
1523
1524 err_dereg_mr:
1525 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1526 return ERR_PTR(err);
1527 }
1528
1529 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1530 u64 iova, int access_flags,
1531 struct ib_udata *udata)
1532 {
1533 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1534 struct ib_umem *umem;
1535 int err;
1536
1537 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1538 return ERR_PTR(-EOPNOTSUPP);
1539
1540 mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1541 start, iova, length, access_flags);
1542
1543 err = mlx5r_umr_resource_init(dev);
1544 if (err)
1545 return ERR_PTR(err);
1546
1547 if (access_flags & IB_ACCESS_ON_DEMAND)
1548 return create_user_odp_mr(pd, start, length, iova, access_flags,
1549 udata);
1550 umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
1551 if (IS_ERR(umem))
1552 return ERR_CAST(umem);
1553 return create_real_mr(pd, umem, iova, access_flags);
1554 }
1555
1556 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
1557 {
1558 struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
1559 struct mlx5_ib_mr *mr = umem_dmabuf->private;
1560
1561 dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
1562
1563 if (!umem_dmabuf->sgt || !mr)
1564 return;
1565
1566 mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
1567 ib_umem_dmabuf_unmap_pages(umem_dmabuf);
1568 }
1569
1570 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
1571 .allow_peer2peer = 1,
1572 .move_notify = mlx5_ib_dmabuf_invalidate_cb,
1573 };
1574
1575 static struct ib_mr *
1576 reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
1577 u64 offset, u64 length, u64 virt_addr,
1578 int fd, int access_flags, int access_mode)
1579 {
1580 bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
1581 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1582 struct mlx5_ib_mr *mr = NULL;
1583 struct ib_umem_dmabuf *umem_dmabuf;
1584 int err;
1585
1586 err = mlx5r_umr_resource_init(dev);
1587 if (err)
1588 return ERR_PTR(err);
1589
1590 if (!pinned_mode)
1591 umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev,
1592 offset, length, fd,
1593 access_flags,
1594 &mlx5_ib_dmabuf_attach_ops);
1595 else
1596 umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev,
1597 dma_device, offset, length,
1598 fd, access_flags);
1599
1600 if (IS_ERR(umem_dmabuf)) {
1601 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
1602 PTR_ERR(umem_dmabuf));
1603 return ERR_CAST(umem_dmabuf);
1604 }
1605
1606 mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
1607 access_flags, access_mode);
1608 if (IS_ERR(mr)) {
1609 ib_umem_release(&umem_dmabuf->umem);
1610 return ERR_CAST(mr);
1611 }
1612
1613 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1614
1615 atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
1616 umem_dmabuf->private = mr;
1617 if (!pinned_mode) {
1618 err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1619 if (err)
1620 goto err_dereg_mr;
1621 } else {
1622 mr->data_direct = true;
1623 }
1624
1625 err = mlx5_ib_init_dmabuf_mr(mr);
1626 if (err)
1627 goto err_dereg_mr;
1628 return &mr->ibmr;
1629
1630 err_dereg_mr:
1631 __mlx5_ib_dereg_mr(&mr->ibmr);
1632 return ERR_PTR(err);
1633 }
1634
1635 static struct ib_mr *
1636 reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
1637 u64 length, u64 virt_addr,
1638 int fd, int access_flags)
1639 {
1640 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1641 struct mlx5_data_direct_dev *data_direct_dev;
1642 struct ib_mr *crossing_mr;
1643 struct ib_mr *crossed_mr;
1644 int ret = 0;
1645
1646 /* Per HW behaviour, the IOVA must be page aligned in KSM mode */
1647 if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
1648 return ERR_PTR(-EOPNOTSUPP);
1649
1650 mutex_lock(&dev->data_direct_lock);
1651 data_direct_dev = dev->data_direct_dev;
1652 if (!data_direct_dev) {
1653 ret = -EINVAL;
1654 goto end;
1655 }
1656
1657 /* The device's 'data direct mkey' was created without RO flags to
1658 * simplify things and allow for a single mkey per device.
1659 * Since RO is not a must, mask it out accordingly.
1660 */
1661 access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
1662 crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
1663 offset, length, virt_addr, fd,
1664 access_flags, MLX5_MKC_ACCESS_MODE_KSM);
1665 if (IS_ERR(crossed_mr)) {
1666 ret = PTR_ERR(crossed_mr);
1667 goto end;
1668 }
1669
1670 mutex_lock(&dev->slow_path_mutex);
1671 crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags,
1672 crossed_mr->lkey);
1673 mutex_unlock(&dev->slow_path_mutex);
1674 if (IS_ERR(crossing_mr)) {
1675 __mlx5_ib_dereg_mr(crossed_mr);
1676 ret = PTR_ERR(crossing_mr);
1677 goto end;
1678 }
1679
1680 list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
1681 to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
1682 to_mmr(crossing_mr)->data_direct = true;
1683 end:
1684 mutex_unlock(&dev->data_direct_lock);
1685 return ret ? ERR_PTR(ret) : crossing_mr;
1686 }
1687
1688 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
1689 u64 length, u64 virt_addr,
1690 int fd, int access_flags,
1691 struct uverbs_attr_bundle *attrs)
1692 {
1693 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1694 int mlx5_access_flags = 0;
1695 int err;
1696
1697 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
1698 !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1699 return ERR_PTR(-EOPNOTSUPP);
1700
1701 if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
1702 err = uverbs_get_flags32(&mlx5_access_flags, attrs,
1703 MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
1704 MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT);
1705 if (err)
1706 return ERR_PTR(err);
1707 }
1708
1709 mlx5_ib_dbg(dev,
1710 "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n",
1711 offset, virt_addr, length, fd, access_flags, mlx5_access_flags);
1712
1713 /* dmabuf requires xlt update via umr to work. */
1714 if (!mlx5r_umr_can_load_pas(dev, length))
1715 return ERR_PTR(-EINVAL);
1716
1717 if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT)
1718 return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr,
1719 fd, access_flags);
1720
1721 return reg_user_mr_dmabuf(pd, pd->device->dma_device,
1722 offset, length, virt_addr,
1723 fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT);
1724 }
1725
1726 /*
1727 * True if the change in access flags can be done via UMR, only some access
1728 * flags can be updated.
1729 */
1730 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
1731 unsigned int current_access_flags,
1732 unsigned int target_access_flags)
1733 {
1734 unsigned int diffs = current_access_flags ^ target_access_flags;
1735
1736 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
1737 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING |
1738 IB_ACCESS_REMOTE_ATOMIC))
1739 return false;
1740 return mlx5r_umr_can_reconfig(dev, current_access_flags,
1741 target_access_flags);
1742 }
1743
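/*
 * True if the MR's cached mkey has enough translation entries to describe
 * new_umem, so rereg can update it in place with UMR.
 */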
1744 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
1745 struct ib_umem *new_umem,
1746 int new_access_flags, u64 iova,
1747 unsigned long *page_size)
1748 {
1749 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1750
1751 /* We only track the allocated sizes of MRs from the cache */
1752 if (!mr->mmkey.cache_ent)
1753 return false;
1754 if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
1755 return false;
1756
1757 *page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova);
1758 if (WARN_ON(!*page_size))
1759 return false;
1760 return (mr->mmkey.cache_ent->rb_key.ndescs) >=
1761 ib_umem_num_dma_blocks(new_umem, *page_size);
1762 }
1763
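/*
 * Re-point an existing mkey at new_umem via UMR: revoke the mkey, update the
 * PD/access flags if requested, then rewrite the translation entries and
 * re-enable it. On success the old umem is released and the registered page
 * accounting is updated.
 */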
1764 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1765 int access_flags, int flags, struct ib_umem *new_umem,
1766 u64 iova, unsigned long page_size)
1767 {
1768 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1769 int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
1770 struct ib_umem *old_umem = mr->umem;
1771 int err;
1772
1773 /*
1774 * To keep everything simple, the MR is revoked before we start to mess
1775 * with it. This ensures the change is atomic relative to any use of the
1776 * MR.
1777 */
1778 err = mlx5r_umr_revoke_mr(mr);
1779 if (err)
1780 return err;
1781
1782 if (flags & IB_MR_REREG_PD) {
1783 mr->ibmr.pd = pd;
1784 upd_flags |= MLX5_IB_UPD_XLT_PD;
1785 }
1786 if (flags & IB_MR_REREG_ACCESS) {
1787 mr->access_flags = access_flags;
1788 upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1789 }
1790
1791 mr->ibmr.iova = iova;
1792 mr->ibmr.length = new_umem->length;
1793 mr->page_shift = order_base_2(page_size);
1794 mr->umem = new_umem;
1795 err = mlx5r_umr_update_mr_pas(mr, upd_flags);
1796 if (err) {
1797 /*
1798 * The MR is revoked at this point so there is no issue with freeing
1799 * new_umem.
1800 */
1801 mr->umem = old_umem;
1802 return err;
1803 }
1804
1805 atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
1806 ib_umem_release(old_umem);
1807 atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
1808 return 0;
1809 }
1810
1811 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1812 u64 length, u64 iova, int new_access_flags,
1813 struct ib_pd *new_pd,
1814 struct ib_udata *udata)
1815 {
1816 struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1817 struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1818 int err;
1819
1820 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct)
1821 return ERR_PTR(-EOPNOTSUPP);
1822
1823 mlx5_ib_dbg(
1824 dev,
1825 "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1826 start, iova, length, new_access_flags);
1827
1828 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
1829 return ERR_PTR(-EOPNOTSUPP);
1830
1831 if (!(flags & IB_MR_REREG_ACCESS))
1832 new_access_flags = mr->access_flags;
1833 if (!(flags & IB_MR_REREG_PD))
1834 new_pd = ib_mr->pd;
1835
1836 if (!(flags & IB_MR_REREG_TRANS)) {
1837 struct ib_umem *umem;
1838
1839 /* Fast path for PD/access change */
1840 if (can_use_umr_rereg_access(dev, mr->access_flags,
1841 new_access_flags)) {
1842 err = mlx5r_umr_rereg_pd_access(mr, new_pd,
1843 new_access_flags);
1844 if (err)
1845 return ERR_PTR(err);
1846 return NULL;
1847 }
1848 /* DM or ODP MRs don't have a normal umem so we can't re-use it */
1849 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1850 goto recreate;
1851
1852 /*
1853 * Only one active MR can refer to a umem at one time, so revoke
1854 * the old MR before assigning the umem to the new one.
1855 */
1856 err = mlx5r_umr_revoke_mr(mr);
1857 if (err)
1858 return ERR_PTR(err);
1859 umem = mr->umem;
1860 mr->umem = NULL;
1861 atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1862
1863 return create_real_mr(new_pd, umem, mr->ibmr.iova,
1864 new_access_flags);
1865 }
1866
1867 /*
1868 * DM doesn't have a PAS list so we can't re-use it; odp/dmabuf do,
1869 * but the logic around releasing the umem is different.
1870 */
1871 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1872 goto recreate;
1873
1874 if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
1875 can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
1876 struct ib_umem *new_umem;
1877 unsigned long page_size;
1878
1879 new_umem = ib_umem_get(&dev->ib_dev, start, length,
1880 new_access_flags);
1881 if (IS_ERR(new_umem))
1882 return ERR_CAST(new_umem);
1883
1884 /* Fast path for PAS change */
1885 if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
1886 &page_size)) {
1887 err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
1888 new_umem, iova, page_size);
1889 if (err) {
1890 ib_umem_release(new_umem);
1891 return ERR_PTR(err);
1892 }
1893 return NULL;
1894 }
1895 return create_real_mr(new_pd, new_umem, iova, new_access_flags);
1896 }
1897
1898 /*
1899 * Everything else has no state we can preserve, so just create a new
1900 * MR from scratch.
1901 */
1902 recreate:
1903 return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
1904 new_access_flags, udata);
1905 }
1906
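/*
 * Allocate the private descriptor buffer for a kernel MR. Extra bytes are
 * added so that mr->descs can be aligned to MLX5_UMR_ALIGN, and the buffer
 * is DMA mapped for device access.
 */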
1907 static int
1908 mlx5_alloc_priv_descs(struct ib_device *device,
1909 struct mlx5_ib_mr *mr,
1910 int ndescs,
1911 int desc_size)
1912 {
1913 struct mlx5_ib_dev *dev = to_mdev(device);
1914 struct device *ddev = &dev->mdev->pdev->dev;
1915 int size = ndescs * desc_size;
1916 int add_size;
1917 int ret;
1918
1919 add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1920 if (is_power_of_2(MLX5_UMR_ALIGN) && add_size) {
1921 int end = max_t(int, MLX5_UMR_ALIGN, roundup_pow_of_two(size));
1922
1923 add_size = min_t(int, end - size, add_size);
1924 }
1925
1926 mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1927 if (!mr->descs_alloc)
1928 return -ENOMEM;
1929
1930 mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1931
1932 mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
1933 if (dma_mapping_error(ddev, mr->desc_map)) {
1934 ret = -ENOMEM;
1935 goto err;
1936 }
1937
1938 return 0;
1939 err:
1940 kfree(mr->descs_alloc);
1941
1942 return ret;
1943 }
1944
1945 static void
1946 mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1947 {
1948 if (!mr->umem && !mr->data_direct &&
1949 mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) {
1950 struct ib_device *device = mr->ibmr.device;
1951 int size = mr->max_descs * mr->desc_size;
1952 struct mlx5_ib_dev *dev = to_mdev(device);
1953
1954 dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
1955 DMA_TO_DEVICE);
1956 kfree(mr->descs_alloc);
1957 mr->descs = NULL;
1958 }
1959 }
1960
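/*
 * Return a revoked mkey to the cache: reuse the MR's existing cache entry if
 * it has one, otherwise find an entry matching its rb_key (exact ndescs) or
 * create a temporary entry, then push the mkey onto that entry's queue.
 */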
1961 static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
1962 struct mlx5_ib_mr *mr)
1963 {
1964 struct mlx5_mkey_cache *cache = &dev->cache;
1965 struct mlx5_cache_ent *ent;
1966 int ret;
1967
1968 if (mr->mmkey.cache_ent) {
1969 spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
1970 goto end;
1971 }
1972
1973 mutex_lock(&cache->rb_lock);
1974 ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
1975 if (ent) {
1976 if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
1977 if (ent->disabled) {
1978 mutex_unlock(&cache->rb_lock);
1979 return -EOPNOTSUPP;
1980 }
1981 mr->mmkey.cache_ent = ent;
1982 spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
1983 mutex_unlock(&cache->rb_lock);
1984 goto end;
1985 }
1986 }
1987
1988 ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
1989 mutex_unlock(&cache->rb_lock);
1990 if (IS_ERR(ent))
1991 return PTR_ERR(ent);
1992
1993 mr->mmkey.cache_ent = ent;
1994 spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
1995
1996 end:
1997 ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key);
1998 spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
1999 return ret;
2000 }
2001
2002 static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr)
2003 {
2004 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
2005 struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
2006 int err;
2007
2008 lockdep_assert_held(&dev->data_direct_lock);
2009 mr->revoked = true;
2010 err = mlx5r_umr_revoke_mr(mr);
2011 if (WARN_ON(err))
2012 return err;
2013
2014 ib_umem_dmabuf_revoke(umem_dmabuf);
2015 return 0;
2016 }
2017
2018 void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
2019 {
2020 struct mlx5_ib_mr *mr, *next;
2021
2022 lockdep_assert_held(&dev->data_direct_lock);
2023
2024 list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) {
2025 list_del(&mr->dd_node);
2026 mlx5_ib_revoke_data_direct_mr(mr);
2027 }
2028 }
2029
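/*
 * Stop all device access to the mkey. Cacheable mkeys that can be revoked by
 * UMR are returned to the mkey cache (scheduling cleanup for temporary
 * entries); everything else is destroyed. ODP and non-pinned dmabuf MRs are
 * serialized against the page fault / invalidation paths via umem_mutex and
 * the dma-buf reservation lock respectively.
 */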
2030 static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
2031 {
2032 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
2033 struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
2034 bool is_odp = is_odp_mr(mr);
2035 bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
2036 !to_ib_umem_dmabuf(mr->umem)->pinned;
2037 bool from_cache = !!ent;
2038 int ret = 0;
2039
2040 if (is_odp)
2041 mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
2042
2043 if (is_odp_dma_buf)
2044 dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL);
2045
2046 if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
2047 ent = mr->mmkey.cache_ent;
2048 /* upon storing to a clean temp entry - schedule its cleanup */
2049 spin_lock_irq(&ent->mkeys_queue.lock);
2050 if (from_cache)
2051 ent->in_use--;
2052 if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
2053 mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
2054 msecs_to_jiffies(30 * 1000));
2055 ent->tmp_cleanup_scheduled = true;
2056 }
2057 spin_unlock_irq(&ent->mkeys_queue.lock);
2058 goto out;
2059 }
2060
2061 if (ent) {
2062 spin_lock_irq(&ent->mkeys_queue.lock);
2063 ent->in_use--;
2064 mr->mmkey.cache_ent = NULL;
2065 spin_unlock_irq(&ent->mkeys_queue.lock);
2066 }
2067 ret = destroy_mkey(dev, mr);
2068 out:
2069 if (is_odp) {
2070 if (!ret)
2071 to_ib_umem_odp(mr->umem)->private = NULL;
2072 mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
2073 }
2074
2075 if (is_odp_dma_buf) {
2076 if (!ret)
2077 to_ib_umem_dmabuf(mr->umem)->private = NULL;
2078 dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
2079 }
2080
2081 return ret;
2082 }
2083
2084 static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
2085 {
2086 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2087 struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
2088 int rc;
2089
2090 /*
2091 * Any async use of the MR must hold the refcount. Once the refcount
2092 * goes to zero, no other thread (such as ODP page faults, prefetch, any
2093 * UMR activity, etc.) can touch the mkey, so it is safe to destroy it.
2094 */
2095 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2096 refcount_read(&mr->mmkey.usecount) != 0 &&
2097 xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
2098 mlx5r_deref_wait_odp_mkey(&mr->mmkey);
2099
2100 if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
2101 xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2102 mr->sig, NULL, GFP_KERNEL);
2103
2104 if (mr->mtt_mr) {
2105 rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2106 if (rc)
2107 return rc;
2108 mr->mtt_mr = NULL;
2109 }
2110 if (mr->klm_mr) {
2111 rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2112 if (rc)
2113 return rc;
2114 mr->klm_mr = NULL;
2115 }
2116
2117 if (mlx5_core_destroy_psv(dev->mdev,
2118 mr->sig->psv_memory.psv_idx))
2119 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2120 mr->sig->psv_memory.psv_idx);
2121 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2122 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2123 mr->sig->psv_wire.psv_idx);
2124 kfree(mr->sig);
2125 mr->sig = NULL;
2126 }
2127
2128 /* Stop DMA */
2129 rc = mlx5_revoke_mr(mr);
2130 if (rc)
2131 return rc;
2132
2133 if (mr->umem) {
2134 bool is_odp = is_odp_mr(mr);
2135
2136 if (!is_odp)
2137 atomic_sub(ib_umem_num_pages(mr->umem),
2138 &dev->mdev->priv.reg_pages);
2139 ib_umem_release(mr->umem);
2140 if (is_odp)
2141 mlx5_ib_free_odp_mr(mr);
2142 }
2143
2144 if (!mr->mmkey.cache_ent)
2145 mlx5_free_priv_descs(mr);
2146
2147 kfree(mr);
2148 return 0;
2149 }
2150
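/*
 * Destroy a data-direct pair: deregister the crossing MR first, then, under
 * data_direct_lock, drop the crossed MR from the device list (unless it was
 * already revoked) and deregister it as well.
 */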
2151 static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev,
2152 struct mlx5_ib_mr *mr)
2153 {
2154 struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr;
2155 int ret;
2156
2157 ret = __mlx5_ib_dereg_mr(&mr->ibmr);
2158 if (ret)
2159 return ret;
2160
2161 mutex_lock(&dev->data_direct_lock);
2162 if (!dd_crossed_mr->revoked)
2163 list_del(&dd_crossed_mr->dd_node);
2164
2165 ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr);
2166 mutex_unlock(&dev->data_direct_lock);
2167 return ret;
2168 }
2169
2170 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
2171 {
2172 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2173 struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
2174
2175 if (mr->data_direct)
2176 return dereg_crossing_data_direct_mr(dev, mr);
2177
2178 return __mlx5_ib_dereg_mr(ibmr);
2179 }
2180
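/*
 * Fill the mkey context for a kernel-owned, UMR-enabled mkey created in the
 * "free" state: access rights, PD, translation size, access mode, page size
 * and, for PA/MTT access modes, the ATS translation mode.
 */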
2181 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
2182 int access_mode, int page_shift)
2183 {
2184 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2185 void *mkc;
2186
2187 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2188
2189 /* This is only used from the kernel, so setting the PD is OK. */
2190 set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
2191 MLX5_SET(mkc, mkc, free, 1);
2192 MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2193 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
2194 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
2195 MLX5_SET(mkc, mkc, umr_en, 1);
2196 MLX5_SET(mkc, mkc, log_page_size, page_shift);
2197 if (access_mode == MLX5_MKC_ACCESS_MODE_PA ||
2198 access_mode == MLX5_MKC_ACCESS_MODE_MTT)
2199 MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
2200 }
2201
2202 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2203 int ndescs, int desc_size, int page_shift,
2204 int access_mode, u32 *in, int inlen)
2205 {
2206 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2207 int err;
2208
2209 mr->access_mode = access_mode;
2210 mr->desc_size = desc_size;
2211 mr->max_descs = ndescs;
2212
2213 err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
2214 if (err)
2215 return err;
2216
2217 mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
2218
2219 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
2220 if (err)
2221 goto err_free_descs;
2222
2223 mr->mmkey.type = MLX5_MKEY_MR;
2224 mr->ibmr.lkey = mr->mmkey.key;
2225 mr->ibmr.rkey = mr->mmkey.key;
2226
2227 return 0;
2228
2229 err_free_descs:
2230 mlx5_free_priv_descs(mr);
2231 return err;
2232 }
2233
2234 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
2235 u32 max_num_sg, u32 max_num_meta_sg,
2236 int desc_size, int access_mode)
2237 {
2238 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2239 int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
2240 int page_shift = 0;
2241 struct mlx5_ib_mr *mr;
2242 u32 *in;
2243 int err;
2244
2245 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2246 if (!mr)
2247 return ERR_PTR(-ENOMEM);
2248
2249 mr->ibmr.pd = pd;
2250 mr->ibmr.device = pd->device;
2251
2252 in = kzalloc(inlen, GFP_KERNEL);
2253 if (!in) {
2254 err = -ENOMEM;
2255 goto err_free;
2256 }
2257
2258 if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
2259 page_shift = PAGE_SHIFT;
2260
2261 err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
2262 access_mode, in, inlen);
2263 if (err)
2264 goto err_free_in;
2265
2266 mr->umem = NULL;
2267 kfree(in);
2268
2269 return mr;
2270
2271 err_free_in:
2272 kfree(in);
2273 err_free:
2274 kfree(mr);
2275 return ERR_PTR(err);
2276 }
2277
2278 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2279 int ndescs, u32 *in, int inlen)
2280 {
2281 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
2282 PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
2283 inlen);
2284 }
2285
2286 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2287 int ndescs, u32 *in, int inlen)
2288 {
2289 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
2290 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2291 }
2292
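/*
 * Build an IB_MR_TYPE_INTEGRITY MR: allocate the signature context, create
 * the memory and wire PSVs, allocate the internal KLM and MTT PI MRs, and
 * create a BSF-enabled KLM mkey that is tracked in dev->sig_mrs.
 */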
2293 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2294 int max_num_sg, int max_num_meta_sg,
2295 u32 *in, int inlen)
2296 {
2297 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2298 u32 psv_index[2];
2299 void *mkc;
2300 int err;
2301
2302 mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
2303 if (!mr->sig)
2304 return -ENOMEM;
2305
2306 /* create mem & wire PSVs */
2307 err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
2308 if (err)
2309 goto err_free_sig;
2310
2311 mr->sig->psv_memory.psv_idx = psv_index[0];
2312 mr->sig->psv_wire.psv_idx = psv_index[1];
2313
2314 mr->sig->sig_status_checked = true;
2315 mr->sig->sig_err_exists = false;
2316 /* Next UMR, Arm SIGERR */
2317 ++mr->sig->sigerr_count;
2318 mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2319 sizeof(struct mlx5_klm),
2320 MLX5_MKC_ACCESS_MODE_KLMS);
2321 if (IS_ERR(mr->klm_mr)) {
2322 err = PTR_ERR(mr->klm_mr);
2323 goto err_destroy_psv;
2324 }
2325 mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2326 sizeof(struct mlx5_mtt),
2327 MLX5_MKC_ACCESS_MODE_MTT);
2328 if (IS_ERR(mr->mtt_mr)) {
2329 err = PTR_ERR(mr->mtt_mr);
2330 goto err_free_klm_mr;
2331 }
2332
2333 /* Set bsf descriptors for mkey */
2334 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2335 MLX5_SET(mkc, mkc, bsf_en, 1);
2336 MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
2337
2338 err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
2339 MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2340 if (err)
2341 goto err_free_mtt_mr;
2342
2343 err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2344 mr->sig, GFP_KERNEL));
2345 if (err)
2346 goto err_free_descs;
2347 return 0;
2348
2349 err_free_descs:
2350 destroy_mkey(dev, mr);
2351 mlx5_free_priv_descs(mr);
2352 err_free_mtt_mr:
2353 mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2354 mr->mtt_mr = NULL;
2355 err_free_klm_mr:
2356 mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2357 mr->klm_mr = NULL;
2358 err_destroy_psv:
2359 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
2360 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2361 mr->sig->psv_memory.psv_idx);
2362 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2363 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2364 mr->sig->psv_wire.psv_idx);
2365 err_free_sig:
2366 kfree(mr->sig);
2367
2368 return err;
2369 }
2370
2371 static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
2372 enum ib_mr_type mr_type, u32 max_num_sg,
2373 u32 max_num_meta_sg)
2374 {
2375 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2376 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2377 int ndescs = ALIGN(max_num_sg, 4);
2378 struct mlx5_ib_mr *mr;
2379 u32 *in;
2380 int err;
2381
2382 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2383 if (!mr)
2384 return ERR_PTR(-ENOMEM);
2385
2386 in = kzalloc(inlen, GFP_KERNEL);
2387 if (!in) {
2388 err = -ENOMEM;
2389 goto err_free;
2390 }
2391
2392 mr->ibmr.device = pd->device;
2393 mr->umem = NULL;
2394
2395 switch (mr_type) {
2396 case IB_MR_TYPE_MEM_REG:
2397 err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
2398 break;
2399 case IB_MR_TYPE_SG_GAPS:
2400 err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
2401 break;
2402 case IB_MR_TYPE_INTEGRITY:
2403 err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
2404 max_num_meta_sg, in, inlen);
2405 break;
2406 default:
2407 mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
2408 err = -EINVAL;
2409 }
2410
2411 if (err)
2412 goto err_free_in;
2413
2414 kfree(in);
2415
2416 return &mr->ibmr;
2417
2418 err_free_in:
2419 kfree(in);
2420 err_free:
2421 kfree(mr);
2422 return ERR_PTR(err);
2423 }
2424
2425 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
2426 u32 max_num_sg)
2427 {
2428 return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
2429 }
2430
2431 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
2432 u32 max_num_sg, u32 max_num_meta_sg)
2433 {
2434 return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
2435 max_num_meta_sg);
2436 }
2437
2438 int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
2439 {
2440 struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
2441 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2442 struct mlx5_ib_mw *mw = to_mmw(ibmw);
2443 unsigned int ndescs;
2444 u32 *in = NULL;
2445 void *mkc;
2446 int err;
2447 struct mlx5_ib_alloc_mw req = {};
2448 struct {
2449 __u32 comp_mask;
2450 __u32 response_length;
2451 } resp = {};
2452
2453 err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
2454 if (err)
2455 return err;
2456
2457 if (req.comp_mask || req.reserved1 || req.reserved2)
2458 return -EOPNOTSUPP;
2459
2460 if (udata->inlen > sizeof(req) &&
2461 !ib_is_udata_cleared(udata, sizeof(req),
2462 udata->inlen - sizeof(req)))
2463 return -EOPNOTSUPP;
2464
2465 ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
2466
2467 in = kzalloc(inlen, GFP_KERNEL);
2468 if (!in)
2469 return -ENOMEM;
2470
2471 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2472
2473 MLX5_SET(mkc, mkc, free, 1);
2474 MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2475 MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
2476 MLX5_SET(mkc, mkc, umr_en, 1);
2477 MLX5_SET(mkc, mkc, lr, 1);
2478 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
2479 MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
2480 MLX5_SET(mkc, mkc, qpn, 0xffffff);
2481
2482 err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
2483 if (err)
2484 goto free;
2485
2486 mw->mmkey.type = MLX5_MKEY_MW;
2487 ibmw->rkey = mw->mmkey.key;
2488 mw->mmkey.ndescs = ndescs;
2489
2490 resp.response_length =
2491 min(offsetofend(typeof(resp), response_length), udata->outlen);
2492 if (resp.response_length) {
2493 err = ib_copy_to_udata(udata, &resp, resp.response_length);
2494 if (err)
2495 goto free_mkey;
2496 }
2497
2498 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2499 err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
2500 if (err)
2501 goto free_mkey;
2502 }
2503
2504 kfree(in);
2505 return 0;
2506
2507 free_mkey:
2508 mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
2509 free:
2510 kfree(in);
2511 return err;
2512 }
2513
2514 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
2515 {
2516 struct mlx5_ib_dev *dev = to_mdev(mw->device);
2517 struct mlx5_ib_mw *mmw = to_mmw(mw);
2518
2519 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2520 xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
2521 /*
2522 * pagefault_single_data_segment() may be accessing mmw
2523 * if the user bound an ODP MR to this MW.
2524 */
2525 mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
2526
2527 return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
2528 }
2529
2530 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
2531 struct ib_mr_status *mr_status)
2532 {
2533 struct mlx5_ib_mr *mmr = to_mmr(ibmr);
2534 int ret = 0;
2535
2536 if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
2537 pr_err("Invalid status check mask\n");
2538 ret = -EINVAL;
2539 goto done;
2540 }
2541
2542 mr_status->fail_status = 0;
2543 if (check_mask & IB_MR_CHECK_SIG_STATUS) {
2544 if (!mmr->sig) {
2545 ret = -EINVAL;
2546 pr_err("signature status check requested on a non-signature enabled MR\n");
2547 goto done;
2548 }
2549
2550 mmr->sig->sig_status_checked = true;
2551 if (!mmr->sig->sig_err_exists)
2552 goto done;
2553
2554 if (ibmr->lkey == mmr->sig->err_item.key)
2555 memcpy(&mr_status->sig_err, &mmr->sig->err_item,
2556 sizeof(mr_status->sig_err));
2557 else {
2558 mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
2559 mr_status->sig_err.sig_err_offset = 0;
2560 mr_status->sig_err.key = mmr->sig->err_item.key;
2561 }
2562
2563 mmr->sig->sig_err_exists = false;
2564 mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2565 }
2566
2567 done:
2568 return ret;
2569 }
2570
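/*
 * Fast path for the PI flow: if the data (and optionally metadata) sg lists
 * each consist of a single DMA entry, record their address/length directly
 * so they can be described with local_dma_lkey and no UMR is needed.
 */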
2571 static int
2572 mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2573 int data_sg_nents, unsigned int *data_sg_offset,
2574 struct scatterlist *meta_sg, int meta_sg_nents,
2575 unsigned int *meta_sg_offset)
2576 {
2577 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2578 unsigned int sg_offset = 0;
2579 int n = 0;
2580
2581 mr->meta_length = 0;
2582 if (data_sg_nents == 1) {
2583 n++;
2584 mr->mmkey.ndescs = 1;
2585 if (data_sg_offset)
2586 sg_offset = *data_sg_offset;
2587 mr->data_length = sg_dma_len(data_sg) - sg_offset;
2588 mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2589 if (meta_sg_nents == 1) {
2590 n++;
2591 mr->meta_ndescs = 1;
2592 if (meta_sg_offset)
2593 sg_offset = *meta_sg_offset;
2594 else
2595 sg_offset = 0;
2596 mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2597 mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2598 }
2599 ibmr->length = mr->data_length + mr->meta_length;
2600 }
2601
2602 return n;
2603 }
2604
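/*
 * Convert the data sg list, and optionally a metadata sg list, into KLM
 * descriptors referencing the PD's local_dma_lkey, updating the MR's iova,
 * length and descriptor counts as we go.
 */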
2605 static int
2606 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2607 struct scatterlist *sgl,
2608 unsigned short sg_nents,
2609 unsigned int *sg_offset_p,
2610 struct scatterlist *meta_sgl,
2611 unsigned short meta_sg_nents,
2612 unsigned int *meta_sg_offset_p)
2613 {
2614 struct scatterlist *sg = sgl;
2615 struct mlx5_klm *klms = mr->descs;
2616 unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2617 u32 lkey = mr->ibmr.pd->local_dma_lkey;
2618 int i, j = 0;
2619
2620 mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2621 mr->ibmr.length = 0;
2622
2623 for_each_sg(sgl, sg, sg_nents, i) {
2624 if (unlikely(i >= mr->max_descs))
2625 break;
2626 klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2627 klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2628 klms[i].key = cpu_to_be32(lkey);
2629 mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2630
2631 sg_offset = 0;
2632 }
2633
2634 if (sg_offset_p)
2635 *sg_offset_p = sg_offset;
2636
2637 mr->mmkey.ndescs = i;
2638 mr->data_length = mr->ibmr.length;
2639
2640 if (meta_sg_nents) {
2641 sg = meta_sgl;
2642 sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2643 for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2644 if (unlikely(i + j >= mr->max_descs))
2645 break;
2646 klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2647 sg_offset);
2648 klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2649 sg_offset);
2650 klms[i + j].key = cpu_to_be32(lkey);
2651 mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2652
2653 sg_offset = 0;
2654 }
2655 if (meta_sg_offset_p)
2656 *meta_sg_offset_p = sg_offset;
2657
2658 mr->meta_ndescs = j;
2659 mr->meta_length = mr->ibmr.length - mr->data_length;
2660 }
2661
2662 return i + j;
2663 }
2664
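/*
 * ib_sg_to_pages() callback: store one page address as an MTT entry with the
 * read/write enable bits set.
 */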
2665 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2666 {
2667 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2668 __be64 *descs;
2669
2670 if (unlikely(mr->mmkey.ndescs == mr->max_descs))
2671 return -ENOMEM;
2672
2673 descs = mr->descs;
2674 descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2675
2676 return 0;
2677 }
2678
2679 static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2680 {
2681 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2682 __be64 *descs;
2683
2684 if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
2685 return -ENOMEM;
2686
2687 descs = mr->descs;
2688 descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
2689 cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2690
2691 return 0;
2692 }
2693
2694 static int
2695 mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2696 int data_sg_nents, unsigned int *data_sg_offset,
2697 struct scatterlist *meta_sg, int meta_sg_nents,
2698 unsigned int *meta_sg_offset)
2699 {
2700 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2701 struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2702 int n;
2703
2704 pi_mr->mmkey.ndescs = 0;
2705 pi_mr->meta_ndescs = 0;
2706 pi_mr->meta_length = 0;
2707
2708 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2709 pi_mr->desc_size * pi_mr->max_descs,
2710 DMA_TO_DEVICE);
2711
2712 pi_mr->ibmr.page_size = ibmr->page_size;
2713 n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2714 mlx5_set_page);
2715 if (n != data_sg_nents)
2716 return n;
2717
2718 pi_mr->data_iova = pi_mr->ibmr.iova;
2719 pi_mr->data_length = pi_mr->ibmr.length;
2720 pi_mr->ibmr.length = pi_mr->data_length;
2721 ibmr->length = pi_mr->data_length;
2722
2723 if (meta_sg_nents) {
2724 u64 page_mask = ~((u64)ibmr->page_size - 1);
2725 u64 iova = pi_mr->data_iova;
2726
2727 n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2728 meta_sg_offset, mlx5_set_page_pi);
2729
2730 pi_mr->meta_length = pi_mr->ibmr.length;
2731 /*
2732 * PI address for the HW is the offset of the metadata address
2733 * relative to the first data page address.
2734 * It equals the first data page address + size of data pages +
2735 * metadata offset at the first metadata page
2736 */
2737 pi_mr->pi_iova = (iova & page_mask) +
2738 pi_mr->mmkey.ndescs * ibmr->page_size +
2739 (pi_mr->ibmr.iova & ~page_mask);
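/*
 * Illustrative example (numbers are hypothetical): with a 4K page size,
 * data starting at iova 0x10200 and spanning 3 pages (ndescs == 3), and
 * metadata mapped at 0x20180, this gives
 *   pi_iova = 0x10000 + 3 * 0x1000 + 0x180 = 0x13180,
 * i.e. the metadata is addressed right after the data pages, at its
 * in-page offset.
 */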
2740 /*
2741 * In order to use one MTT MR for data and metadata, we also register
2742 * the gaps between the end of the data and the start of
2743 * the metadata (the sig MR will verify that the HW accesses
2744 * the right addresses). This mapping is safe because we use an
2745 * internal mkey for the registration.
2746 */
2747 pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2748 pi_mr->ibmr.iova = iova;
2749 ibmr->length += pi_mr->meta_length;
2750 }
2751
2752 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2753 pi_mr->desc_size * pi_mr->max_descs,
2754 DMA_TO_DEVICE);
2755
2756 return n;
2757 }
2758
2759 static int
2760 mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2761 int data_sg_nents, unsigned int *data_sg_offset,
2762 struct scatterlist *meta_sg, int meta_sg_nents,
2763 unsigned int *meta_sg_offset)
2764 {
2765 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2766 struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2767 int n;
2768
2769 pi_mr->mmkey.ndescs = 0;
2770 pi_mr->meta_ndescs = 0;
2771 pi_mr->meta_length = 0;
2772
2773 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2774 pi_mr->desc_size * pi_mr->max_descs,
2775 DMA_TO_DEVICE);
2776
2777 n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2778 meta_sg, meta_sg_nents, meta_sg_offset);
2779
2780 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2781 pi_mr->desc_size * pi_mr->max_descs,
2782 DMA_TO_DEVICE);
2783
2784 /* This is a zero-based memory region */
2785 pi_mr->data_iova = 0;
2786 pi_mr->ibmr.iova = 0;
2787 pi_mr->pi_iova = pi_mr->data_length;
2788 ibmr->length = pi_mr->ibmr.length;
2789
2790 return n;
2791 }
2792
2793 int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2794 int data_sg_nents, unsigned int *data_sg_offset,
2795 struct scatterlist *meta_sg, int meta_sg_nents,
2796 unsigned int *meta_sg_offset)
2797 {
2798 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2799 struct mlx5_ib_mr *pi_mr = NULL;
2800 int n;
2801
2802 WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2803
2804 mr->mmkey.ndescs = 0;
2805 mr->data_length = 0;
2806 mr->data_iova = 0;
2807 mr->meta_ndescs = 0;
2808 mr->pi_iova = 0;
2809 /*
2810 * As a performance optimization, if possible, there is no need to
2811 * perform a UMR operation to register the data/metadata buffers.
2812 * First try to map the sg lists to PA descriptors with local_dma_lkey.
2813 * Fall back to UMR only in case of a failure.
2814 */
2815 n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2816 data_sg_offset, meta_sg, meta_sg_nents,
2817 meta_sg_offset);
2818 if (n == data_sg_nents + meta_sg_nents)
2819 goto out;
2820 /*
2821 * As a performance optimization, if possible, there is no need to map
2822 * the sg lists to KLM descriptors. First try to map the sg lists to MTT
2823 * descriptors and fall back to KLM only in case of a failure.
2824 * It's more efficient for the HW to work with MTT descriptors
2825 * (especially under high load).
2826 * Use KLM (indirect access) only if it's mandatory.
2827 */
2828 pi_mr = mr->mtt_mr;
2829 n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2830 data_sg_offset, meta_sg, meta_sg_nents,
2831 meta_sg_offset);
2832 if (n == data_sg_nents + meta_sg_nents)
2833 goto out;
2834
2835 pi_mr = mr->klm_mr;
2836 n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2837 data_sg_offset, meta_sg, meta_sg_nents,
2838 meta_sg_offset);
2839 if (unlikely(n != data_sg_nents + meta_sg_nents))
2840 return -ENOMEM;
2841
2842 out:
2843 /* This is a zero-based memory region */
2844 ibmr->iova = 0;
2845 mr->pi_mr = pi_mr;
2846 if (pi_mr)
2847 ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2848 else
2849 ibmr->sig_attrs->meta_length = mr->meta_length;
2850
2851 return 0;
2852 }
2853
2854 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2855 unsigned int *sg_offset)
2856 {
2857 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2858 int n;
2859
2860 mr->mmkey.ndescs = 0;
2861
2862 ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2863 mr->desc_size * mr->max_descs,
2864 DMA_TO_DEVICE);
2865
2866 if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2867 n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2868 NULL);
2869 else
2870 n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2871 mlx5_set_page);
2872
2873 ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2874 mr->desc_size * mr->max_descs,
2875 DMA_TO_DEVICE);
2876
2877 return n;
2878 }
2879