1 /*
2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3 * Copyright (c) 2020, Intel Corporation. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses. You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 * Redistribution and use in source and binary forms, with or
12 * without modification, are permitted provided that the following
13 * conditions are met:
14 *
15 * - Redistributions of source code must retain the above
16 * copyright notice, this list of conditions and the following
17 * disclaimer.
18 *
19 * - Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the following
21 * disclaimer in the documentation and/or other materials
22 * provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34
35 #include <linux/kref.h>
36 #include <linux/random.h>
37 #include <linux/debugfs.h>
38 #include <linux/export.h>
39 #include <linux/delay.h>
40 #include <linux/dma-buf.h>
41 #include <linux/dma-resv.h>
42 #include <rdma/ib_umem_odp.h>
43 #include "dm.h"
44 #include "mlx5_ib.h"
45 #include "umr.h"
46 #include "data_direct.h"
47 #include "dmah.h"
48
49 enum {
50 MAX_PENDING_REG_MR = 8,
51 };
52
53 #define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
54 #define MLX5_UMR_ALIGN 2048
55
56 static void
57 create_mkey_callback(int status, struct mlx5_async_work *context);
58 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
59 u64 iova, int access_flags,
60 unsigned long page_size, bool populate,
61 int access_mode, u16 st_index, u8 ph);
62 static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);
63
64 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
65 struct ib_pd *pd)
66 {
67 struct mlx5_ib_dev *dev = to_mdev(pd->device);
68
69 MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
70 MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
71 MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
72 MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
73 MLX5_SET(mkc, mkc, lr, 1);
74
75 if (acc & IB_ACCESS_RELAXED_ORDERING) {
76 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
77 MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
78
79 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
80 (MLX5_CAP_GEN(dev->mdev,
81 relaxed_ordering_read_pci_enabled) &&
82 pcie_relaxed_ordering_enabled(dev->mdev->pdev)))
83 MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
84 }
85
86 MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
87 MLX5_SET(mkc, mkc, qpn, 0xffffff);
88 MLX5_SET64(mkc, mkc, start_addr, start_addr);
89 }
90
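/*
 * The low 8 bits of an mkey (mkey_7_0) are chosen by software; a rolling
 * per-device counter is used so that successively created mkeys differ in
 * their low byte. The FW-assigned mkey index supplies the upper bits.
 */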
91 static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
92 {
93 u8 key = atomic_inc_return(&dev->mkey_var);
94 void *mkc;
95
96 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
97 MLX5_SET(mkc, mkc, mkey_7_0, key);
98 *mkey = key;
99 }
100
101 static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
102 struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
103 {
104 int ret;
105
106 assign_mkey_variant(dev, &mkey->key, in);
107 ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
108 if (!ret)
109 init_waitqueue_head(&mkey->wait);
110
111 return ret;
112 }
113
114 static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
115 {
116 struct mlx5_ib_dev *dev = async_create->ent->dev;
117 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
118 size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);
119
120 MLX5_SET(create_mkey_in, async_create->in, opcode,
121 MLX5_CMD_OP_CREATE_MKEY);
122 assign_mkey_variant(dev, &async_create->mkey, async_create->in);
123 return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
124 async_create->out, outlen, create_mkey_callback,
125 &async_create->cb_work);
126 }
127
128 static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
129 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
130
131 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
132 {
133 WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
134
135 return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
136 }
137
138 static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
139 {
140 if (status == -ENXIO) /* core driver is not available */
141 return;
142
143 mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
144 if (status != -EREMOTEIO) /* driver specific failure */
145 return;
146
147 /* Failed in FW, print cmd out failure details */
148 mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
149 }
150
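/*
 * Cached mkeys are kept in a per-entry LIFO queue built from a list of
 * pages, each holding NUM_MKEYS_PER_PAGE mkeys; mkeys_queue.ci is the
 * number of mkeys currently stored. push_mkey_locked() appends to the
 * last page, allocating a new page when the current ones are full, and
 * pop_mkey_locked() removes from the tail, freeing a trailing page once
 * it becomes empty.
 */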
151 static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey)
152 {
153 unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE;
154 struct mlx5_mkeys_page *page;
155
156 lockdep_assert_held(&ent->mkeys_queue.lock);
157 if (ent->mkeys_queue.ci >=
158 ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) {
159 page = kzalloc(sizeof(*page), GFP_ATOMIC);
160 if (!page)
161 return -ENOMEM;
162 ent->mkeys_queue.num_pages++;
163 list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
164 } else {
165 page = list_last_entry(&ent->mkeys_queue.pages_list,
166 struct mlx5_mkeys_page, list);
167 }
168
169 page->mkeys[tmp] = mkey;
170 ent->mkeys_queue.ci++;
171 return 0;
172 }
173
174 static int pop_mkey_locked(struct mlx5_cache_ent *ent)
175 {
176 unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE;
177 struct mlx5_mkeys_page *last_page;
178 u32 mkey;
179
180 lockdep_assert_held(&ent->mkeys_queue.lock);
181 last_page = list_last_entry(&ent->mkeys_queue.pages_list,
182 struct mlx5_mkeys_page, list);
183 mkey = last_page->mkeys[tmp];
184 last_page->mkeys[tmp] = 0;
185 ent->mkeys_queue.ci--;
186 if (ent->mkeys_queue.num_pages > 1 && !tmp) {
187 list_del(&last_page->list);
188 ent->mkeys_queue.num_pages--;
189 kfree(last_page);
190 }
191 return mkey;
192 }
193
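/*
 * Completion handler for asynchronous cache mkey creation. On failure,
 * drop the pending count and set fill_delay (cleared by delay_timer) to
 * back off further filling. On success, combine the FW-returned mkey
 * index with the pre-assigned variant byte and push the mkey onto the
 * entry's queue.
 */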
194 static void create_mkey_callback(int status, struct mlx5_async_work *context)
195 {
196 struct mlx5r_async_create_mkey *mkey_out =
197 container_of(context, struct mlx5r_async_create_mkey, cb_work);
198 struct mlx5_cache_ent *ent = mkey_out->ent;
199 struct mlx5_ib_dev *dev = ent->dev;
200 unsigned long flags;
201
202 if (status) {
203 create_mkey_warn(dev, status, mkey_out->out);
204 kfree(mkey_out);
205 spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
206 ent->pending--;
207 WRITE_ONCE(dev->fill_delay, 1);
208 spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
209 mod_timer(&dev->delay_timer, jiffies + HZ);
210 return;
211 }
212
213 mkey_out->mkey |= mlx5_idx_to_mkey(
214 MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
215 WRITE_ONCE(dev->cache.last_add, jiffies);
216
217 spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
218 push_mkey_locked(ent, mkey_out->mkey);
219 ent->pending--;
220 /* If we are doing fill_to_high_water then keep going. */
221 queue_adjust_cache_locked(ent);
222 spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
223 kfree(mkey_out);
224 }
225
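/*
 * The translation table size is expressed in 16-byte octowords: one
 * octoword holds two 8-byte MTT entries or one 16-byte KLM/KSM entry.
 */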
226 static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
227 {
228 int ret = 0;
229
230 switch (access_mode) {
231 case MLX5_MKC_ACCESS_MODE_MTT:
232 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
233 sizeof(struct mlx5_mtt));
234 break;
235 case MLX5_MKC_ACCESS_MODE_KSM:
236 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
237 sizeof(struct mlx5_klm));
238 break;
239 default:
240 WARN_ON(1);
241 }
242 return ret;
243 }
244
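/*
 * Prepare the mkey context for a cache entry: the mkey is created in the
 * "free" state (not bound to any memory) with umr_en set so that it can
 * later be attached to a user buffer via UMR.
 */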
245 static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
246 {
247 set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0,
248 ent->dev->umrc.pd);
249 MLX5_SET(mkc, mkc, free, 1);
250 MLX5_SET(mkc, mkc, umr_en, 1);
251 MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
252 MLX5_SET(mkc, mkc, access_mode_4_2,
253 (ent->rb_key.access_mode >> 2) & 0x7);
254 MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);
255
256 MLX5_SET(mkc, mkc, translations_octword_size,
257 get_mkc_octo_size(ent->rb_key.access_mode,
258 ent->rb_key.ndescs));
259 MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
260
261 if (ent->rb_key.ph != MLX5_IB_NO_PH) {
262 MLX5_SET(mkc, mkc, pcie_tph_en, 1);
263 MLX5_SET(mkc, mkc, pcie_tph_ph, ent->rb_key.ph);
264 if (ent->rb_key.st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
265 MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index,
266 ent->rb_key.st_index);
267 }
268 }
269
270 /* Asynchronously schedule new MRs to be populated in the cache. */
271 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
272 {
273 struct mlx5r_async_create_mkey *async_create;
274 void *mkc;
275 int err = 0;
276 int i;
277
278 for (i = 0; i < num; i++) {
279 async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
280 GFP_KERNEL);
281 if (!async_create)
282 return -ENOMEM;
283 mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
284 memory_key_mkey_entry);
285 set_cache_mkc(ent, mkc);
286 async_create->ent = ent;
287
288 spin_lock_irq(&ent->mkeys_queue.lock);
289 if (ent->pending >= MAX_PENDING_REG_MR) {
290 err = -EAGAIN;
291 goto free_async_create;
292 }
293 ent->pending++;
294 spin_unlock_irq(&ent->mkeys_queue.lock);
295
296 err = mlx5_ib_create_mkey_cb(async_create);
297 if (err) {
298 mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
299 goto err_create_mkey;
300 }
301 }
302
303 return 0;
304
305 err_create_mkey:
306 spin_lock_irq(&ent->mkeys_queue.lock);
307 ent->pending--;
308 free_async_create:
309 spin_unlock_irq(&ent->mkeys_queue.lock);
310 kfree(async_create);
311 return err;
312 }
313
314 /* Synchronously create an MR in the cache */
315 static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
316 {
317 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
318 void *mkc;
319 u32 *in;
320 int err;
321
322 in = kzalloc(inlen, GFP_KERNEL);
323 if (!in)
324 return -ENOMEM;
325 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
326 set_cache_mkc(ent, mkc);
327
328 err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
329 if (err)
330 goto free_in;
331
332 WRITE_ONCE(ent->dev->cache.last_add, jiffies);
333 free_in:
334 kfree(in);
335 return err;
336 }
337
338 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
339 {
340 u32 mkey;
341
342 lockdep_assert_held(&ent->mkeys_queue.lock);
343 if (!ent->mkeys_queue.ci)
344 return;
345 mkey = pop_mkey_locked(ent);
346 spin_unlock_irq(&ent->mkeys_queue.lock);
347 mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
348 spin_lock_irq(&ent->mkeys_queue.lock);
349 }
350
351 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
352 bool limit_fill)
353 __acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock)
354 {
355 int err;
356
357 lockdep_assert_held(&ent->mkeys_queue.lock);
358
359 while (true) {
360 if (limit_fill)
361 target = ent->limit * 2;
362 if (target == ent->pending + ent->mkeys_queue.ci)
363 return 0;
364 if (target > ent->pending + ent->mkeys_queue.ci) {
365 u32 todo = target - (ent->pending + ent->mkeys_queue.ci);
366
367 spin_unlock_irq(&ent->mkeys_queue.lock);
368 err = add_keys(ent, todo);
369 if (err == -EAGAIN)
370 usleep_range(3000, 5000);
371 spin_lock_irq(&ent->mkeys_queue.lock);
372 if (err) {
373 if (err != -EAGAIN)
374 return err;
375 } else
376 return 0;
377 } else {
378 remove_cache_mr_locked(ent);
379 }
380 }
381 }
382
383 static ssize_t size_write(struct file *filp, const char __user *buf,
384 size_t count, loff_t *pos)
385 {
386 struct mlx5_cache_ent *ent = filp->private_data;
387 u32 target;
388 int err;
389
390 err = kstrtou32_from_user(buf, count, 0, &target);
391 if (err)
392 return err;
393
394 /*
395 * Target is the new value of total_mrs the user requests; however, we
396 * cannot free MRs that are in use. Compute the target value for stored
397 * mkeys.
398 */
399 spin_lock_irq(&ent->mkeys_queue.lock);
400 if (target < ent->in_use) {
401 err = -EINVAL;
402 goto err_unlock;
403 }
404 target = target - ent->in_use;
405 if (target < ent->limit || target > ent->limit*2) {
406 err = -EINVAL;
407 goto err_unlock;
408 }
409 err = resize_available_mrs(ent, target, false);
410 if (err)
411 goto err_unlock;
412 spin_unlock_irq(&ent->mkeys_queue.lock);
413
414 return count;
415
416 err_unlock:
417 spin_unlock_irq(&ent->mkeys_queue.lock);
418 return err;
419 }
420
421 static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
422 loff_t *pos)
423 {
424 struct mlx5_cache_ent *ent = filp->private_data;
425 char lbuf[20];
426 int err;
427
428 err = snprintf(lbuf, sizeof(lbuf), "%ld\n",
429 ent->mkeys_queue.ci + ent->in_use);
430 if (err < 0)
431 return err;
432
433 return simple_read_from_buffer(buf, count, pos, lbuf, err);
434 }
435
436 static const struct file_operations size_fops = {
437 .owner = THIS_MODULE,
438 .open = simple_open,
439 .write = size_write,
440 .read = size_read,
441 };
442
443 static ssize_t limit_write(struct file *filp, const char __user *buf,
444 size_t count, loff_t *pos)
445 {
446 struct mlx5_cache_ent *ent = filp->private_data;
447 u32 var;
448 int err;
449
450 err = kstrtou32_from_user(buf, count, 0, &var);
451 if (err)
452 return err;
453
454 /*
455 * Upon set, we immediately fill the cache to the high water mark implied
456 * by the limit.
457 */
458 spin_lock_irq(&ent->mkeys_queue.lock);
459 ent->limit = var;
460 err = resize_available_mrs(ent, 0, true);
461 spin_unlock_irq(&ent->mkeys_queue.lock);
462 if (err)
463 return err;
464 return count;
465 }
466
467 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
468 loff_t *pos)
469 {
470 struct mlx5_cache_ent *ent = filp->private_data;
471 char lbuf[20];
472 int err;
473
474 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
475 if (err < 0)
476 return err;
477
478 return simple_read_from_buffer(buf, count, pos, lbuf, err);
479 }
480
481 static const struct file_operations limit_fops = {
482 .owner = THIS_MODULE,
483 .open = simple_open,
484 .write = limit_write,
485 .read = limit_read,
486 };
487
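/* Return true if any cache entry is still below its limit (refill pending). */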
488 static bool someone_adding(struct mlx5_mkey_cache *cache)
489 {
490 struct mlx5_cache_ent *ent;
491 struct rb_node *node;
492 bool ret;
493
494 mutex_lock(&cache->rb_lock);
495 for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
496 ent = rb_entry(node, struct mlx5_cache_ent, node);
497 spin_lock_irq(&ent->mkeys_queue.lock);
498 ret = ent->mkeys_queue.ci < ent->limit;
499 spin_unlock_irq(&ent->mkeys_queue.lock);
500 if (ret) {
501 mutex_unlock(&cache->rb_lock);
502 return true;
503 }
504 }
505 mutex_unlock(&cache->rb_lock);
506 return false;
507 }
508
509 /*
510 * Check if the bucket is outside the high/low water mark and schedule an async
511 * update. The cache refill has hysteresis: once the low water mark is hit, it is
512 * refilled up to the high mark.
513 */
514 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
515 {
516 lockdep_assert_held(&ent->mkeys_queue.lock);
517
518 if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
519 return;
520 if (ent->mkeys_queue.ci < ent->limit) {
521 ent->fill_to_high_water = true;
522 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
523 } else if (ent->fill_to_high_water &&
524 ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) {
525 /*
526 * Once we start populating due to hitting a low water mark,
527 * continue until we pass the high water mark.
528 */
529 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
530 } else if (ent->mkeys_queue.ci == 2 * ent->limit) {
531 ent->fill_to_high_water = false;
532 } else if (ent->mkeys_queue.ci > 2 * ent->limit) {
533 /* Queue deletion of excess entries */
534 ent->fill_to_high_water = false;
535 if (ent->pending)
536 queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
537 secs_to_jiffies(1));
538 else
539 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
540 }
541 }
542
543 static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
544 {
545 u32 mkey;
546
547 spin_lock_irq(&ent->mkeys_queue.lock);
548 while (ent->mkeys_queue.ci) {
549 mkey = pop_mkey_locked(ent);
550 spin_unlock_irq(&ent->mkeys_queue.lock);
551 mlx5_core_destroy_mkey(dev->mdev, mkey);
552 spin_lock_irq(&ent->mkeys_queue.lock);
553 }
554 ent->tmp_cleanup_scheduled = false;
555 spin_unlock_irq(&ent->mkeys_queue.lock);
556 }
557
558 static void __cache_work_func(struct mlx5_cache_ent *ent)
559 {
560 struct mlx5_ib_dev *dev = ent->dev;
561 struct mlx5_mkey_cache *cache = &dev->cache;
562 int err;
563
564 spin_lock_irq(&ent->mkeys_queue.lock);
565 if (ent->disabled)
566 goto out;
567
568 if (ent->fill_to_high_water &&
569 ent->mkeys_queue.ci + ent->pending < 2 * ent->limit &&
570 !READ_ONCE(dev->fill_delay)) {
571 spin_unlock_irq(&ent->mkeys_queue.lock);
572 err = add_keys(ent, 1);
573 spin_lock_irq(&ent->mkeys_queue.lock);
574 if (ent->disabled)
575 goto out;
576 if (err) {
577 /*
578 * EAGAIN only happens if there are pending MRs, so we
579 * will be rescheduled when storing them. The only
580 * failure path here is ENOMEM.
581 */
582 if (err != -EAGAIN) {
583 mlx5_ib_warn(
584 dev,
585 "add keys command failed, err %d\n",
586 err);
587 queue_delayed_work(cache->wq, &ent->dwork,
588 secs_to_jiffies(1));
589 }
590 }
591 } else if (ent->mkeys_queue.ci > 2 * ent->limit) {
592 bool need_delay;
593
594 /*
595 * The remove_cache_mr_locked() logic is performed as a garbage
596 * collection task. Such a task is intended to run when no
597 * other active processes are running.
598 *
599 * need_resched() returns TRUE if there are user tasks to be
600 * activated in the near future.
601 *
602 * In that case, don't execute remove_cache_mr_locked() and postpone
603 * the garbage collection work to the next cycle, in order to free
604 * CPU resources for other tasks.
605 */
606 spin_unlock_irq(&ent->mkeys_queue.lock);
607 need_delay = need_resched() || someone_adding(cache) ||
608 !time_after(jiffies,
609 READ_ONCE(cache->last_add) + 300 * HZ);
610 spin_lock_irq(&ent->mkeys_queue.lock);
611 if (ent->disabled)
612 goto out;
613 if (need_delay) {
614 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
615 goto out;
616 }
617 remove_cache_mr_locked(ent);
618 queue_adjust_cache_locked(ent);
619 }
620 out:
621 spin_unlock_irq(&ent->mkeys_queue.lock);
622 }
623
624 static void delayed_cache_work_func(struct work_struct *work)
625 {
626 struct mlx5_cache_ent *ent;
627
628 ent = container_of(work, struct mlx5_cache_ent, dwork.work);
629 /* temp entries are never filled, only cleaned */
630 if (ent->is_tmp)
631 clean_keys(ent->dev, ent);
632 else
633 __cache_work_func(ent);
634 }
635
636 static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
637 struct mlx5r_cache_rb_key key2)
638 {
639 int res;
640
641 res = key1.ats - key2.ats;
642 if (res)
643 return res;
644
645 res = key1.access_mode - key2.access_mode;
646 if (res)
647 return res;
648
649 res = key1.access_flags - key2.access_flags;
650 if (res)
651 return res;
652
653 res = key1.st_index - key2.st_index;
654 if (res)
655 return res;
656
657 res = key1.ph - key2.ph;
658 if (res)
659 return res;
660
661 /*
662 * Keep ndescs last in the comparison since the find function
663 * searches for an exact match on all other properties and only the
664 * closest match in size.
665 */
666 return key1.ndescs - key2.ndescs;
667 }
668
669 static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
670 struct mlx5_cache_ent *ent)
671 {
672 struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
673 struct mlx5_cache_ent *cur;
674 int cmp;
675
676 /* Figure out where to put new node */
677 while (*new) {
678 cur = rb_entry(*new, struct mlx5_cache_ent, node);
679 parent = *new;
680 cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
681 if (cmp > 0)
682 new = &((*new)->rb_left);
683 if (cmp < 0)
684 new = &((*new)->rb_right);
685 if (cmp == 0)
686 return -EEXIST;
687 }
688
689 /* Add new node and rebalance tree. */
690 rb_link_node(&ent->node, parent, new);
691 rb_insert_color(&ent->node, &cache->rb_root);
692
693 return 0;
694 }
695
696 static struct mlx5_cache_ent *
697 mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
698 struct mlx5r_cache_rb_key rb_key)
699 {
700 struct rb_node *node = dev->cache.rb_root.rb_node;
701 struct mlx5_cache_ent *cur, *smallest = NULL;
702 u64 ndescs_limit;
703 int cmp;
704
705 /*
706 * Find the smallest ent with order >= requested_order.
707 */
708 while (node) {
709 cur = rb_entry(node, struct mlx5_cache_ent, node);
710 cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
711 if (cmp > 0) {
712 smallest = cur;
713 node = node->rb_left;
714 }
715 if (cmp < 0)
716 node = node->rb_right;
717 if (cmp == 0)
718 return cur;
719 }
720
721 /*
722 * Limit the usage of mkeys larger than twice the required size while
723 * also allowing the usage of the smallest cache entry for small MRs.
724 */
725 ndescs_limit = max_t(u64, rb_key.ndescs * 2,
726 MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);
727
728 return (smallest &&
729 smallest->rb_key.access_mode == rb_key.access_mode &&
730 smallest->rb_key.access_flags == rb_key.access_flags &&
731 smallest->rb_key.ats == rb_key.ats &&
732 smallest->rb_key.st_index == rb_key.st_index &&
733 smallest->rb_key.ph == rb_key.ph &&
734 smallest->rb_key.ndescs <= ndescs_limit) ?
735 smallest :
736 NULL;
737 }
738
739 static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
740 struct mlx5_cache_ent *ent)
741 {
742 struct mlx5_ib_mr *mr;
743 int err;
744
745 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
746 if (!mr)
747 return ERR_PTR(-ENOMEM);
748
749 spin_lock_irq(&ent->mkeys_queue.lock);
750 ent->in_use++;
751
752 if (!ent->mkeys_queue.ci) {
753 queue_adjust_cache_locked(ent);
754 ent->miss++;
755 spin_unlock_irq(&ent->mkeys_queue.lock);
756 err = create_cache_mkey(ent, &mr->mmkey.key);
757 if (err) {
758 spin_lock_irq(&ent->mkeys_queue.lock);
759 ent->in_use--;
760 spin_unlock_irq(&ent->mkeys_queue.lock);
761 kfree(mr);
762 return ERR_PTR(err);
763 }
764 } else {
765 mr->mmkey.key = pop_mkey_locked(ent);
766 queue_adjust_cache_locked(ent);
767 spin_unlock_irq(&ent->mkeys_queue.lock);
768 }
769 mr->mmkey.cache_ent = ent;
770 mr->mmkey.type = MLX5_MKEY_MR;
771 mr->mmkey.rb_key = ent->rb_key;
772 mr->mmkey.cacheable = true;
773 init_waitqueue_head(&mr->mmkey.wait);
774 return mr;
775 }
776
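/*
 * Return the subset of access flags that cannot be changed by UMR on this
 * device. These must match exactly when an mkey is taken from the cache,
 * so they form part of the cache entry rb_key.
 */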
777 static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
778 int access_flags)
779 {
780 int ret = 0;
781
782 if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
783 MLX5_CAP_GEN(dev->mdev, atomic) &&
784 MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
785 ret |= IB_ACCESS_REMOTE_ATOMIC;
786
787 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
788 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
789 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
790 ret |= IB_ACCESS_RELAXED_ORDERING;
791
792 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
793 (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
794 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled)) &&
795 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
796 ret |= IB_ACCESS_RELAXED_ORDERING;
797
798 return ret;
799 }
800
801 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
802 int access_flags, int access_mode,
803 int ndescs)
804 {
805 struct mlx5r_cache_rb_key rb_key = {
806 .ndescs = ndescs,
807 .access_mode = access_mode,
808 .access_flags = get_unchangeable_access_flags(dev, access_flags),
809 .ph = MLX5_IB_NO_PH,
810 };
811 struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);
812
813 if (!ent)
814 return ERR_PTR(-EOPNOTSUPP);
815
816 return _mlx5_mr_cache_alloc(dev, ent);
817 }
818
819 static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
820 {
821 if (!mlx5_debugfs_root || dev->is_rep)
822 return;
823
824 debugfs_remove_recursive(dev->cache.fs_root);
825 dev->cache.fs_root = NULL;
826 }
827
828 static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
829 struct mlx5_cache_ent *ent)
830 {
831 int order = order_base_2(ent->rb_key.ndescs);
832 struct dentry *dir;
833
834 if (!mlx5_debugfs_root || dev->is_rep)
835 return;
836
837 if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
838 order = MLX5_IMR_KSM_CACHE_ENTRY + 2;
839
840 sprintf(ent->name, "%d", order);
841 dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
842 debugfs_create_file("size", 0600, dir, ent, &size_fops);
843 debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
844 debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci);
845 debugfs_create_u32("miss", 0600, dir, &ent->miss);
846 }
847
848 static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
849 {
850 struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
851 struct mlx5_mkey_cache *cache = &dev->cache;
852
853 if (!mlx5_debugfs_root || dev->is_rep)
854 return;
855
856 cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
857 }
858
859 static void delay_time_func(struct timer_list *t)
860 {
861 struct mlx5_ib_dev *dev = timer_container_of(dev, t, delay_timer);
862
863 WRITE_ONCE(dev->fill_delay, 0);
864 }
865
866 static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent)
867 {
868 struct mlx5_mkeys_page *page;
869
870 page = kzalloc(sizeof(*page), GFP_KERNEL);
871 if (!page)
872 return -ENOMEM;
873 INIT_LIST_HEAD(&ent->mkeys_queue.pages_list);
874 spin_lock_init(&ent->mkeys_queue.lock);
875 list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
876 ent->mkeys_queue.num_pages++;
877 return 0;
878 }
879
880 static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent)
881 {
882 struct mlx5_mkeys_page *page;
883
884 WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1);
885 page = list_last_entry(&ent->mkeys_queue.pages_list,
886 struct mlx5_mkeys_page, list);
887 list_del(&page->list);
888 kfree(page);
889 }
890
891 struct mlx5_cache_ent *
892 mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
893 struct mlx5r_cache_rb_key rb_key,
894 bool persistent_entry)
895 {
896 struct mlx5_cache_ent *ent;
897 int order;
898 int ret;
899
900 ent = kzalloc(sizeof(*ent), GFP_KERNEL);
901 if (!ent)
902 return ERR_PTR(-ENOMEM);
903
904 ret = mlx5r_mkeys_init(ent);
905 if (ret)
906 goto mkeys_err;
907 ent->rb_key = rb_key;
908 ent->dev = dev;
909 ent->is_tmp = !persistent_entry;
910
911 INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
912
913 ret = mlx5_cache_ent_insert(&dev->cache, ent);
914 if (ret)
915 goto ent_insert_err;
916
917 if (persistent_entry) {
918 if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
919 order = MLX5_IMR_KSM_CACHE_ENTRY;
920 else
921 order = order_base_2(rb_key.ndescs) - 2;
922
923 if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
924 !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
925 mlx5r_umr_can_load_pas(dev, 0))
926 ent->limit = dev->mdev->profile.mr_cache[order].limit;
927 else
928 ent->limit = 0;
929
930 mlx5_mkey_cache_debugfs_add_ent(dev, ent);
931 }
932
933 return ent;
934 ent_insert_err:
935 mlx5r_mkeys_uninit(ent);
936 mkeys_err:
937 kfree(ent);
938 return ERR_PTR(ret);
939 }
940
941 static void mlx5r_destroy_cache_entries(struct mlx5_ib_dev *dev)
942 {
943 struct rb_root *root = &dev->cache.rb_root;
944 struct mlx5_cache_ent *ent;
945 struct rb_node *node;
946
947 mutex_lock(&dev->cache.rb_lock);
948 node = rb_first(root);
949 while (node) {
950 ent = rb_entry(node, struct mlx5_cache_ent, node);
951 node = rb_next(node);
952 clean_keys(dev, ent);
953 rb_erase(&ent->node, root);
954 mlx5r_mkeys_uninit(ent);
955 kfree(ent);
956 }
957 mutex_unlock(&dev->cache.rb_lock);
958 }
959
960 int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
961 {
962 struct mlx5_mkey_cache *cache = &dev->cache;
963 struct rb_root *root = &dev->cache.rb_root;
964 struct mlx5r_cache_rb_key rb_key = {
965 .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
966 .ph = MLX5_IB_NO_PH,
967 };
968 struct mlx5_cache_ent *ent;
969 struct rb_node *node;
970 int ret;
971 int i;
972
973 mutex_init(&dev->slow_path_mutex);
974 mutex_init(&dev->cache.rb_lock);
975 dev->cache.rb_root = RB_ROOT;
976 cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
977 if (!cache->wq) {
978 mlx5_ib_warn(dev, "failed to create work queue\n");
979 return -ENOMEM;
980 }
981
982 mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
983 timer_setup(&dev->delay_timer, delay_time_func, 0);
984 mlx5_mkey_cache_debugfs_init(dev);
985 mutex_lock(&cache->rb_lock);
986 for (i = 0; i <= mkey_cache_max_order(dev); i++) {
987 rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
988 ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
989 if (IS_ERR(ent)) {
990 ret = PTR_ERR(ent);
991 goto err;
992 }
993 }
994
995 ret = mlx5_odp_init_mkey_cache(dev);
996 if (ret)
997 goto err;
998
999 mutex_unlock(&cache->rb_lock);
1000 for (node = rb_first(root); node; node = rb_next(node)) {
1001 ent = rb_entry(node, struct mlx5_cache_ent, node);
1002 spin_lock_irq(&ent->mkeys_queue.lock);
1003 queue_adjust_cache_locked(ent);
1004 spin_unlock_irq(&ent->mkeys_queue.lock);
1005 }
1006
1007 return 0;
1008
1009 err:
1010 mutex_unlock(&cache->rb_lock);
1011 mlx5_mkey_cache_debugfs_cleanup(dev);
1012 mlx5r_destroy_cache_entries(dev);
1013 destroy_workqueue(cache->wq);
1014 mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
1015 return ret;
1016 }
1017
1018 void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
1019 {
1020 struct rb_root *root = &dev->cache.rb_root;
1021 struct mlx5_cache_ent *ent;
1022 struct rb_node *node;
1023
1024 if (!dev->cache.wq)
1025 return;
1026
1027 mutex_lock(&dev->cache.rb_lock);
1028 for (node = rb_first(root); node; node = rb_next(node)) {
1029 ent = rb_entry(node, struct mlx5_cache_ent, node);
1030 spin_lock_irq(&ent->mkeys_queue.lock);
1031 ent->disabled = true;
1032 spin_unlock_irq(&ent->mkeys_queue.lock);
1033 cancel_delayed_work(&ent->dwork);
1034 }
1035 mutex_unlock(&dev->cache.rb_lock);
1036
1037 /*
1038 * After all entries are disabled and will not reschedule on WQ,
1039 * flush it and all async commands.
1040 */
1041 flush_workqueue(dev->cache.wq);
1042
1043 mlx5_mkey_cache_debugfs_cleanup(dev);
1044 mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
1045
1046 /* At this point all entries are disabled and have no concurrent work. */
1047 mlx5r_destroy_cache_entries(dev);
1048
1049 destroy_workqueue(dev->cache.wq);
1050 timer_delete_sync(&dev->delay_timer);
1051 }
1052
1053 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
1054 {
1055 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1056 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1057 struct mlx5_ib_mr *mr;
1058 void *mkc;
1059 u32 *in;
1060 int err;
1061
1062 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1063 if (!mr)
1064 return ERR_PTR(-ENOMEM);
1065
1066 in = kzalloc(inlen, GFP_KERNEL);
1067 if (!in) {
1068 err = -ENOMEM;
1069 goto err_free;
1070 }
1071
1072 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1073
1074 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
1075 MLX5_SET(mkc, mkc, length64, 1);
1076 set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
1077 pd);
1078 MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
1079
1080 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1081 if (err)
1082 goto err_in;
1083
1084 kfree(in);
1085 mr->mmkey.type = MLX5_MKEY_MR;
1086 mr->ibmr.lkey = mr->mmkey.key;
1087 mr->ibmr.rkey = mr->mmkey.key;
1088 mr->umem = NULL;
1089
1090 return &mr->ibmr;
1091
1092 err_in:
1093 kfree(in);
1094
1095 err_free:
1096 kfree(mr);
1097
1098 return ERR_PTR(err);
1099 }
1100
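/*
 * Number of octowords needed to hold the MTT translation of the buffer:
 * npages 8-byte MTT entries, two per 16-byte octoword, accounting for the
 * offset of addr within its first page.
 */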
1101 static int get_octo_len(u64 addr, u64 len, int page_shift)
1102 {
1103 u64 page_size = 1ULL << page_shift;
1104 u64 offset;
1105 int npages;
1106
1107 offset = addr & (page_size - 1);
1108 npages = ALIGN(len + offset, page_size) >> page_shift;
1109 return (npages + 1) / 2;
1110 }
1111
1112 static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
1113 {
1114 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
1115 return MKEY_CACHE_LAST_STD_ENTRY;
1116 return MLX5_MAX_UMR_SHIFT;
1117 }
1118
1119 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
1120 u64 length, int access_flags, u64 iova)
1121 {
1122 mr->ibmr.lkey = mr->mmkey.key;
1123 mr->ibmr.rkey = mr->mmkey.key;
1124 mr->ibmr.length = length;
1125 mr->ibmr.device = &dev->ib_dev;
1126 mr->ibmr.iova = iova;
1127 mr->access_flags = access_flags;
1128 }
1129
1130 static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
1131 u64 iova)
1132 {
1133 /*
1134 * The alignment of iova has already been checked upon entering
1135 * UVERBS_METHOD_REG_DMABUF_MR
1136 */
1137 umem->iova = iova;
1138 return PAGE_SIZE;
1139 }
1140
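/*
 * Allocate an MR for the umem, preferably from a matching mkey cache
 * entry. If no suitable entry exists, fall back to synchronously creating
 * an uncached mkey via reg_create().
 */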
1141 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
1142 struct ib_umem *umem, u64 iova,
1143 int access_flags, int access_mode,
1144 u16 st_index, u8 ph)
1145 {
1146 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1147 struct mlx5r_cache_rb_key rb_key = {};
1148 struct mlx5_cache_ent *ent;
1149 struct mlx5_ib_mr *mr;
1150 unsigned long page_size;
1151
1152 if (umem->is_dmabuf)
1153 page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
1154 else
1155 page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova,
1156 access_mode);
1157 if (WARN_ON(!page_size))
1158 return ERR_PTR(-EINVAL);
1159
1160 rb_key.access_mode = access_mode;
1161 rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
1162 rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
1163 rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
1164 rb_key.st_index = st_index;
1165 rb_key.ph = ph;
1166 ent = mkey_cache_ent_from_rb_key(dev, rb_key);
1167 /*
1168 * If the MR can't come from the cache then synchronously create an uncached
1169 * one.
1170 */
1171 if (!ent) {
1172 mutex_lock(&dev->slow_path_mutex);
1173 mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode,
1174 st_index, ph);
1175 mutex_unlock(&dev->slow_path_mutex);
1176 if (IS_ERR(mr))
1177 return mr;
1178 mr->mmkey.rb_key = rb_key;
1179 mr->mmkey.cacheable = true;
1180 return mr;
1181 }
1182
1183 mr = _mlx5_mr_cache_alloc(dev, ent);
1184 if (IS_ERR(mr))
1185 return mr;
1186
1187 mr->ibmr.pd = pd;
1188 mr->umem = umem;
1189 mr->page_shift = order_base_2(page_size);
1190 set_mr_fields(dev, mr, umem->length, access_flags, iova);
1191
1192 return mr;
1193 }
1194
1195 static struct ib_mr *
1196 reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags,
1197 u32 crossed_lkey)
1198 {
1199 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1200 int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
1201 struct mlx5_ib_mr *mr;
1202 void *mkc;
1203 int inlen;
1204 u32 *in;
1205 int err;
1206
1207 if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey))
1208 return ERR_PTR(-EOPNOTSUPP);
1209
1210 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1211 if (!mr)
1212 return ERR_PTR(-ENOMEM);
1213
1214 inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1215 in = kvzalloc(inlen, GFP_KERNEL);
1216 if (!in) {
1217 err = -ENOMEM;
1218 goto err_1;
1219 }
1220
1221 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1222 MLX5_SET(mkc, mkc, crossing_target_vhca_id,
1223 MLX5_CAP_GEN(dev->mdev, vhca_id));
1224 MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey);
1225 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
1226 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
1227
1228 /* for this crossing mkey IOVA should be 0 and len should be IOVA + len */
1229 set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
1230 MLX5_SET64(mkc, mkc, len, iova + length);
1231
1232 MLX5_SET(mkc, mkc, free, 0);
1233 MLX5_SET(mkc, mkc, umr_en, 0);
1234 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1235 if (err)
1236 goto err_2;
1237
1238 mr->mmkey.type = MLX5_MKEY_MR;
1239 set_mr_fields(dev, mr, length, access_flags, iova);
1240 mr->ibmr.pd = pd;
1241 kvfree(in);
1242 mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key);
1243
1244 return &mr->ibmr;
1245 err_2:
1246 kvfree(in);
1247 err_1:
1248 kfree(mr);
1249 return ERR_PTR(err);
1250 }
1251
1252 /*
1253 * Create an mkey for the umem directly in FW, bypassing the mkey cache.
1254 * If populate is false, the mkey is left disabled until configured via UMR.
1255 */
1256 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
1257 u64 iova, int access_flags,
1258 unsigned long page_size, bool populate,
1259 int access_mode, u16 st_index, u8 ph)
1260 {
1261 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1262 struct mlx5_ib_mr *mr;
1263 __be64 *pas;
1264 void *mkc;
1265 int inlen;
1266 u32 *in;
1267 int err;
1268 bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
1269 (access_mode == MLX5_MKC_ACCESS_MODE_MTT) &&
1270 (ph == MLX5_IB_NO_PH);
1271 bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
1272
1273 if (!page_size)
1274 return ERR_PTR(-EINVAL);
1275 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1276 if (!mr)
1277 return ERR_PTR(-ENOMEM);
1278
1279 mr->ibmr.pd = pd;
1280 mr->access_flags = access_flags;
1281 mr->page_shift = order_base_2(page_size);
1282
1283 inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1284 if (populate)
1285 inlen += sizeof(*pas) *
1286 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
1287 in = kvzalloc(inlen, GFP_KERNEL);
1288 if (!in) {
1289 err = -ENOMEM;
1290 goto err_1;
1291 }
1292 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1293 if (populate) {
1294 if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
1295 err = -EINVAL;
1296 goto err_2;
1297 }
1298 mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
1299 pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1300 }
1301
1302 /* The pg_access bit allows setting the access flags
1303 * in the page list submitted with the command.
1304 */
1305 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1306
1307 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1308 set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
1309 populate ? pd : dev->umrc.pd);
1310 /* In case of a data direct flow, overwrite the pdn field with the internal kernel PD */
1311 if (umem->is_dmabuf && ksm_mode)
1312 MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);
1313
1314 MLX5_SET(mkc, mkc, free, !populate);
1315 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode);
1316 MLX5_SET(mkc, mkc, umr_en, 1);
1317
1318 MLX5_SET64(mkc, mkc, len, umem->length);
1319 MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1320 if (ksm_mode)
1321 MLX5_SET(mkc, mkc, translations_octword_size,
1322 get_octo_len(iova, umem->length, mr->page_shift) * 2);
1323 else
1324 MLX5_SET(mkc, mkc, translations_octword_size,
1325 get_octo_len(iova, umem->length, mr->page_shift));
1326 MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
1327 if (mlx5_umem_needs_ats(dev, umem, access_flags))
1328 MLX5_SET(mkc, mkc, ma_translation_mode, 1);
1329 if (populate) {
1330 MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1331 get_octo_len(iova, umem->length, mr->page_shift));
1332 }
1333
1334 if (ph != MLX5_IB_NO_PH) {
1335 MLX5_SET(mkc, mkc, pcie_tph_en, 1);
1336 MLX5_SET(mkc, mkc, pcie_tph_ph, ph);
1337 if (st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
1338 MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index, st_index);
1339 }
1340
1341 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1342 if (err) {
1343 mlx5_ib_warn(dev, "create mkey failed\n");
1344 goto err_2;
1345 }
1346 mr->mmkey.type = MLX5_MKEY_MR;
1347 mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
1348 mr->umem = umem;
1349 set_mr_fields(dev, mr, umem->length, access_flags, iova);
1350 kvfree(in);
1351
1352 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1353
1354 return mr;
1355
1356 err_2:
1357 kvfree(in);
1358 err_1:
1359 kfree(mr);
1360 return ERR_PTR(err);
1361 }
1362
1363 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1364 u64 length, int acc, int mode)
1365 {
1366 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1367 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1368 struct mlx5_ib_mr *mr;
1369 void *mkc;
1370 u32 *in;
1371 int err;
1372
1373 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1374 if (!mr)
1375 return ERR_PTR(-ENOMEM);
1376
1377 in = kzalloc(inlen, GFP_KERNEL);
1378 if (!in) {
1379 err = -ENOMEM;
1380 goto err_free;
1381 }
1382
1383 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1384
1385 MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1386 MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1387 MLX5_SET64(mkc, mkc, len, length);
1388 set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1389
1390 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1391 if (err)
1392 goto err_in;
1393
1394 kfree(in);
1395
1396 set_mr_fields(dev, mr, length, acc, start_addr);
1397
1398 return &mr->ibmr;
1399
1400 err_in:
1401 kfree(in);
1402
1403 err_free:
1404 kfree(mr);
1405
1406 return ERR_PTR(err);
1407 }
1408
1409 int mlx5_ib_advise_mr(struct ib_pd *pd,
1410 enum ib_uverbs_advise_mr_advice advice,
1411 u32 flags,
1412 struct ib_sge *sg_list,
1413 u32 num_sge,
1414 struct uverbs_attr_bundle *attrs)
1415 {
1416 if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1417 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1418 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1419 return -EOPNOTSUPP;
1420
1421 return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1422 sg_list, num_sge);
1423 }
1424
1425 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1426 struct ib_dm_mr_attr *attr,
1427 struct uverbs_attr_bundle *attrs)
1428 {
1429 struct mlx5_ib_dm *mdm = to_mdm(dm);
1430 struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1431 u64 start_addr = mdm->dev_addr + attr->offset;
1432 int mode;
1433
1434 switch (mdm->type) {
1435 case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1436 if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1437 return ERR_PTR(-EINVAL);
1438
1439 mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1440 start_addr -= pci_resource_start(dev->pdev, 0);
1441 break;
1442 case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1443 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1444 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
1445 case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
1446 if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1447 return ERR_PTR(-EINVAL);
1448
1449 mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1450 break;
1451 default:
1452 return ERR_PTR(-EINVAL);
1453 }
1454
1455 return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1456 attr->access_flags, mode);
1457 }
1458
1459 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
1460 u64 iova, int access_flags,
1461 struct ib_dmah *dmah)
1462 {
1463 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1464 struct mlx5_ib_mr *mr = NULL;
1465 bool xlt_with_umr;
1466 u16 st_index = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX;
1467 u8 ph = MLX5_IB_NO_PH;
1468 int err;
1469
1470 if (dmah) {
1471 struct mlx5_ib_dmah *mdmah = to_mdmah(dmah);
1472
1473 ph = dmah->ph;
1474 if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
1475 st_index = mdmah->st_index;
1476 }
1477
1478 xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
1479 if (xlt_with_umr) {
1480 mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
1481 MLX5_MKC_ACCESS_MODE_MTT,
1482 st_index, ph);
1483 } else {
1484 unsigned long page_size = mlx5_umem_mkc_find_best_pgsz(
1485 dev, umem, iova, MLX5_MKC_ACCESS_MODE_MTT);
1486
1487 mutex_lock(&dev->slow_path_mutex);
1488 mr = reg_create(pd, umem, iova, access_flags, page_size,
1489 true, MLX5_MKC_ACCESS_MODE_MTT,
1490 st_index, ph);
1491 mutex_unlock(&dev->slow_path_mutex);
1492 }
1493 if (IS_ERR(mr)) {
1494 ib_umem_release(umem);
1495 return ERR_CAST(mr);
1496 }
1497
1498 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1499
1500 atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1501
1502 if (xlt_with_umr) {
1503 /*
1504 * If the MR was created with reg_create then it will be
1505 * configured properly but left disabled. It is safe to go ahead
1506 * and configure it again via UMR while enabling it.
1507 */
1508 err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
1509 if (err) {
1510 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1511 return ERR_PTR(err);
1512 }
1513 }
1514 return &mr->ibmr;
1515 }
1516
1517 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
1518 u64 iova, int access_flags,
1519 struct ib_udata *udata)
1520 {
1521 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1522 struct ib_umem_odp *odp;
1523 struct mlx5_ib_mr *mr;
1524 int err;
1525
1526 if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1527 return ERR_PTR(-EOPNOTSUPP);
1528
1529 err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
1530 if (err)
1531 return ERR_PTR(err);
1532 if (!start && length == U64_MAX) {
1533 if (iova != 0)
1534 return ERR_PTR(-EINVAL);
1535 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1536 return ERR_PTR(-EINVAL);
1537
1538 mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
1539 if (IS_ERR(mr))
1540 return ERR_CAST(mr);
1541 return &mr->ibmr;
1542 }
1543
1544 /* ODP requires xlt update via umr to work. */
1545 if (!mlx5r_umr_can_load_pas(dev, length))
1546 return ERR_PTR(-EINVAL);
1547
1548 odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
1549 &mlx5_mn_ops);
1550 if (IS_ERR(odp))
1551 return ERR_CAST(odp);
1552
1553 mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
1554 MLX5_MKC_ACCESS_MODE_MTT,
1555 MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX,
1556 MLX5_IB_NO_PH);
1557 if (IS_ERR(mr)) {
1558 ib_umem_release(&odp->umem);
1559 return ERR_CAST(mr);
1560 }
1561 xa_init(&mr->implicit_children);
1562
1563 odp->private = mr;
1564 err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1565 if (err)
1566 goto err_dereg_mr;
1567
1568 err = mlx5_ib_init_odp_mr(mr);
1569 if (err)
1570 goto err_dereg_mr;
1571 return &mr->ibmr;
1572
1573 err_dereg_mr:
1574 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1575 return ERR_PTR(err);
1576 }
1577
1578 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1579 u64 iova, int access_flags,
1580 struct ib_dmah *dmah,
1581 struct ib_udata *udata)
1582 {
1583 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1584 struct ib_umem *umem;
1585 int err;
1586
1587 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
1588 ((access_flags & IB_ACCESS_ON_DEMAND) && dmah))
1589 return ERR_PTR(-EOPNOTSUPP);
1590
1591 mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1592 start, iova, length, access_flags);
1593
1594 err = mlx5r_umr_resource_init(dev);
1595 if (err)
1596 return ERR_PTR(err);
1597
1598 if (access_flags & IB_ACCESS_ON_DEMAND)
1599 return create_user_odp_mr(pd, start, length, iova, access_flags,
1600 udata);
1601 umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
1602 if (IS_ERR(umem))
1603 return ERR_CAST(umem);
1604 return create_real_mr(pd, umem, iova, access_flags, dmah);
1605 }
1606
1607 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
1608 {
1609 struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
1610 struct mlx5_ib_mr *mr = umem_dmabuf->private;
1611
1612 dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
1613
1614 if (!umem_dmabuf->sgt || !mr)
1615 return;
1616
1617 mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
1618 ib_umem_dmabuf_unmap_pages(umem_dmabuf);
1619 }
1620
1621 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
1622 .allow_peer2peer = 1,
1623 .move_notify = mlx5_ib_dmabuf_invalidate_cb,
1624 };
1625
1626 static struct ib_mr *
1627 reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
1628 u64 offset, u64 length, u64 virt_addr,
1629 int fd, int access_flags, int access_mode,
1630 struct ib_dmah *dmah)
1631 {
1632 bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
1633 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1634 struct mlx5_ib_mr *mr = NULL;
1635 struct ib_umem_dmabuf *umem_dmabuf;
1636 u16 st_index = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX;
1637 u8 ph = MLX5_IB_NO_PH;
1638 int err;
1639
1640 err = mlx5r_umr_resource_init(dev);
1641 if (err)
1642 return ERR_PTR(err);
1643
1644 if (!pinned_mode)
1645 umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev,
1646 offset, length, fd,
1647 access_flags,
1648 &mlx5_ib_dmabuf_attach_ops);
1649 else
1650 umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev,
1651 dma_device, offset, length,
1652 fd, access_flags);
1653
1654 if (IS_ERR(umem_dmabuf)) {
1655 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
1656 PTR_ERR(umem_dmabuf));
1657 return ERR_CAST(umem_dmabuf);
1658 }
1659
1660 if (dmah) {
1661 struct mlx5_ib_dmah *mdmah = to_mdmah(dmah);
1662
1663 ph = dmah->ph;
1664 if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
1665 st_index = mdmah->st_index;
1666 }
1667
1668 mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
1669 access_flags, access_mode,
1670 st_index, ph);
1671 if (IS_ERR(mr)) {
1672 ib_umem_release(&umem_dmabuf->umem);
1673 return ERR_CAST(mr);
1674 }
1675
1676 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1677
1678 atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
1679 umem_dmabuf->private = mr;
1680 if (!pinned_mode) {
1681 err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1682 if (err)
1683 goto err_dereg_mr;
1684 } else {
1685 mr->data_direct = true;
1686 }
1687
1688 err = mlx5_ib_init_dmabuf_mr(mr);
1689 if (err)
1690 goto err_dereg_mr;
1691 return &mr->ibmr;
1692
1693 err_dereg_mr:
1694 __mlx5_ib_dereg_mr(&mr->ibmr);
1695 return ERR_PTR(err);
1696 }
1697
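/*
 * Data-direct registration creates two mkeys: a KSM "crossed" mkey whose
 * dmabuf pages are DMA-mapped for the data-direct device, and a crossing
 * mkey referencing the crossed mkey's lkey, which is what is returned to
 * the caller.
 */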
1698 static struct ib_mr *
1699 reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
1700 u64 length, u64 virt_addr,
1701 int fd, int access_flags)
1702 {
1703 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1704 struct mlx5_data_direct_dev *data_direct_dev;
1705 struct ib_mr *crossing_mr;
1706 struct ib_mr *crossed_mr;
1707 int ret = 0;
1708
1709 /* Per HW behaviour, the IOVA must be page aligned in KSM mode */
1710 if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
1711 return ERR_PTR(-EOPNOTSUPP);
1712
1713 mutex_lock(&dev->data_direct_lock);
1714 data_direct_dev = dev->data_direct_dev;
1715 if (!data_direct_dev) {
1716 ret = -EINVAL;
1717 goto end;
1718 }
1719
1720 /* The device's 'data direct mkey' was created without RO flags to
1721 * simplify things and allow for a single mkey per device.
1722 * Since RO is not a must, mask it out accordingly.
1723 */
1724 access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
1725 crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
1726 offset, length, virt_addr, fd,
1727 access_flags, MLX5_MKC_ACCESS_MODE_KSM,
1728 NULL);
1729 if (IS_ERR(crossed_mr)) {
1730 ret = PTR_ERR(crossed_mr);
1731 goto end;
1732 }
1733
1734 mutex_lock(&dev->slow_path_mutex);
1735 crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags,
1736 crossed_mr->lkey);
1737 mutex_unlock(&dev->slow_path_mutex);
1738 if (IS_ERR(crossing_mr)) {
1739 __mlx5_ib_dereg_mr(crossed_mr);
1740 ret = PTR_ERR(crossing_mr);
1741 goto end;
1742 }
1743
1744 list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
1745 to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
1746 to_mmr(crossing_mr)->data_direct = true;
1747 end:
1748 mutex_unlock(&dev->data_direct_lock);
1749 return ret ? ERR_PTR(ret) : crossing_mr;
1750 }
1751
1752 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
1753 u64 length, u64 virt_addr,
1754 int fd, int access_flags,
1755 struct ib_dmah *dmah,
1756 struct uverbs_attr_bundle *attrs)
1757 {
1758 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1759 int mlx5_access_flags = 0;
1760 int err;
1761
1762 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
1763 !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1764 return ERR_PTR(-EOPNOTSUPP);
1765
1766 if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
1767 err = uverbs_get_flags32(&mlx5_access_flags, attrs,
1768 MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
1769 MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT);
1770 if (err)
1771 return ERR_PTR(err);
1772 }
1773
1774 mlx5_ib_dbg(dev,
1775 "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n",
1776 offset, virt_addr, length, fd, access_flags, mlx5_access_flags);
1777
1778 /* dmabuf requires xlt update via umr to work. */
1779 if (!mlx5r_umr_can_load_pas(dev, length))
1780 return ERR_PTR(-EINVAL);
1781
1782 if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT)
1783 return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr,
1784 fd, access_flags);
1785
1786 return reg_user_mr_dmabuf(pd, pd->device->dma_device,
1787 offset, length, virt_addr,
1788 fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT,
1789 dmah);
1790 }
1791
1792 /*
1793  * True if the change in access flags can be done via UMR; only some access
1794 * flags can be updated.
1795 */
1796 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
1797 unsigned int current_access_flags,
1798 unsigned int target_access_flags)
1799 {
1800 unsigned int diffs = current_access_flags ^ target_access_flags;
1801
1802 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
1803 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING |
1804 IB_ACCESS_REMOTE_ATOMIC))
1805 return false;
1806 return mlx5r_umr_can_reconfig(dev, current_access_flags,
1807 target_access_flags);
1808 }
1809
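/*
 * True if the PAS of the MR can be replaced via UMR: the MR must come from
 * the mkey cache (so its allocated descriptor count is known), UMR must be
 * able to load the new PAS, and the cache entry must hold enough descriptors
 * for the new umem at its best page size.
 */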
1810 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
1811 struct ib_umem *new_umem,
1812 int new_access_flags, u64 iova,
1813 unsigned long *page_size)
1814 {
1815 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1816
1817 /* We only track the allocated sizes of MRs from the cache */
1818 if (!mr->mmkey.cache_ent)
1819 return false;
1820 if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
1821 return false;
1822
1823 *page_size = mlx5_umem_mkc_find_best_pgsz(
1824 dev, new_umem, iova, mr->mmkey.cache_ent->rb_key.access_mode);
1825 if (WARN_ON(!*page_size))
1826 return false;
1827 return (mr->mmkey.cache_ent->rb_key.ndescs) >=
1828 ib_umem_num_dma_blocks(new_umem, *page_size);
1829 }
1830
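/*
 * Replace the PAS (and optionally the PD/access flags) of an existing MR via
 * UMR: revoke the mkey first so the update is atomic with respect to any
 * concurrent users, update the translation, then swap in the new umem and
 * fix up the reg_pages accounting.
 */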
1831 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1832 int access_flags, int flags, struct ib_umem *new_umem,
1833 u64 iova, unsigned long page_size)
1834 {
1835 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1836 int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
1837 struct ib_umem *old_umem = mr->umem;
1838 int err;
1839
1840 /*
1841 * To keep everything simple the MR is revoked before we start to mess
1842 	 * with it. This ensures the change is atomic relative to any use of the
1843 * MR.
1844 */
1845 err = mlx5r_umr_revoke_mr(mr);
1846 if (err)
1847 return err;
1848
1849 if (flags & IB_MR_REREG_PD) {
1850 mr->ibmr.pd = pd;
1851 upd_flags |= MLX5_IB_UPD_XLT_PD;
1852 }
1853 if (flags & IB_MR_REREG_ACCESS) {
1854 mr->access_flags = access_flags;
1855 upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1856 }
1857
1858 mr->ibmr.iova = iova;
1859 mr->ibmr.length = new_umem->length;
1860 mr->page_shift = order_base_2(page_size);
1861 mr->umem = new_umem;
1862 err = mlx5r_umr_update_mr_pas(mr, upd_flags);
1863 if (err) {
1864 /*
1865 		 * The MR is revoked at this point so it is safe to free
1866 * new_umem.
1867 */
1868 mr->umem = old_umem;
1869 return err;
1870 }
1871
1872 atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
1873 ib_umem_release(old_umem);
1874 atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
1875 return 0;
1876 }
1877
1878 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1879 u64 length, u64 iova, int new_access_flags,
1880 struct ib_pd *new_pd,
1881 struct ib_udata *udata)
1882 {
1883 struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1884 struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1885 int err;
1886
1887 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct ||
1888 mr->mmkey.rb_key.ph != MLX5_IB_NO_PH)
1889 return ERR_PTR(-EOPNOTSUPP);
1890
1891 mlx5_ib_dbg(
1892 dev,
1893 "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1894 start, iova, length, new_access_flags);
1895
1896 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
1897 return ERR_PTR(-EOPNOTSUPP);
1898
1899 if (!(flags & IB_MR_REREG_ACCESS))
1900 new_access_flags = mr->access_flags;
1901 if (!(flags & IB_MR_REREG_PD))
1902 new_pd = ib_mr->pd;
1903
1904 if (!(flags & IB_MR_REREG_TRANS)) {
1905 struct ib_umem *umem;
1906
1907 /* Fast path for PD/access change */
1908 if (can_use_umr_rereg_access(dev, mr->access_flags,
1909 new_access_flags)) {
1910 err = mlx5r_umr_rereg_pd_access(mr, new_pd,
1911 new_access_flags);
1912 if (err)
1913 return ERR_PTR(err);
1914 return NULL;
1915 }
1916 		/* DM or ODP MRs don't have a normal umem, so we can't re-use it */
1917 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1918 goto recreate;
1919
1920 /*
1921 		 * Only one active MR can refer to a umem at one time; revoke
1922 * the old MR before assigning the umem to the new one.
1923 */
1924 err = mlx5r_umr_revoke_mr(mr);
1925 if (err)
1926 return ERR_PTR(err);
1927 umem = mr->umem;
1928 mr->umem = NULL;
1929 atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1930
1931 return create_real_mr(new_pd, umem, mr->ibmr.iova,
1932 new_access_flags, NULL);
1933 }
1934
1935 /*
1936 	 * DM doesn't have a PAS list, so we can't re-use it; odp/dmabuf do,
1937 	 * but the logic around releasing the umem is different
1938 */
1939 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1940 goto recreate;
1941
1942 if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
1943 can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
1944 struct ib_umem *new_umem;
1945 unsigned long page_size;
1946
1947 new_umem = ib_umem_get(&dev->ib_dev, start, length,
1948 new_access_flags);
1949 if (IS_ERR(new_umem))
1950 return ERR_CAST(new_umem);
1951
1952 /* Fast path for PAS change */
1953 if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
1954 &page_size)) {
1955 err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
1956 new_umem, iova, page_size);
1957 if (err) {
1958 ib_umem_release(new_umem);
1959 return ERR_PTR(err);
1960 }
1961 return NULL;
1962 }
1963 return create_real_mr(new_pd, new_umem, iova, new_access_flags, NULL);
1964 }
1965
1966 /*
1967 * Everything else has no state we can preserve, just create a new MR
1968 * from scratch
1969 */
1970 recreate:
1971 return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
1972 new_access_flags, NULL, udata);
1973 }
1974
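/*
 * Allocate the private descriptor buffer for a kernel MR. Extra bytes are
 * added so the descriptors can be aligned to MLX5_UMR_ALIGN (the kmalloc
 * minimum alignment may be smaller), and the buffer is then DMA mapped
 * towards the device.
 */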
1975 static int
1976 mlx5_alloc_priv_descs(struct ib_device *device,
1977 struct mlx5_ib_mr *mr,
1978 int ndescs,
1979 int desc_size)
1980 {
1981 struct mlx5_ib_dev *dev = to_mdev(device);
1982 struct device *ddev = &dev->mdev->pdev->dev;
1983 int size = ndescs * desc_size;
1984 int add_size;
1985 int ret;
1986
1987 add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1988 if (is_power_of_2(MLX5_UMR_ALIGN) && add_size) {
1989 int end = max_t(int, MLX5_UMR_ALIGN, roundup_pow_of_two(size));
1990
1991 add_size = min_t(int, end - size, add_size);
1992 }
1993
1994 mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1995 if (!mr->descs_alloc)
1996 return -ENOMEM;
1997
1998 mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1999
2000 mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
2001 if (dma_mapping_error(ddev, mr->desc_map)) {
2002 ret = -ENOMEM;
2003 goto err;
2004 }
2005
2006 return 0;
2007 err:
2008 kfree(mr->descs_alloc);
2009
2010 return ret;
2011 }
2012
2013 static void
2014 mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
2015 {
2016 if (!mr->umem && !mr->data_direct &&
2017 mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) {
2018 struct ib_device *device = mr->ibmr.device;
2019 int size = mr->max_descs * mr->desc_size;
2020 struct mlx5_ib_dev *dev = to_mdev(device);
2021
2022 dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
2023 DMA_TO_DEVICE);
2024 kfree(mr->descs_alloc);
2025 mr->descs = NULL;
2026 }
2027 }
2028
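/*
 * Store an mkey in the cache: find the rb-tree entry matching the mkey's
 * rb_key, creating a new (temporary) entry if no matching one exists, and
 * push the mkey onto that entry's queue.
 */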
2029 static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
2030 struct mlx5_ib_mr *mr)
2031 {
2032 struct mlx5_mkey_cache *cache = &dev->cache;
2033 struct mlx5_cache_ent *ent;
2034 int ret;
2035
2036 if (mr->mmkey.cache_ent) {
2037 spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
2038 goto end;
2039 }
2040
2041 mutex_lock(&cache->rb_lock);
2042 ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
2043 if (ent) {
2044 if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
2045 if (ent->disabled) {
2046 mutex_unlock(&cache->rb_lock);
2047 return -EOPNOTSUPP;
2048 }
2049 mr->mmkey.cache_ent = ent;
2050 spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
2051 mutex_unlock(&cache->rb_lock);
2052 goto end;
2053 }
2054 }
2055
2056 ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
2057 mutex_unlock(&cache->rb_lock);
2058 if (IS_ERR(ent))
2059 return PTR_ERR(ent);
2060
2061 mr->mmkey.cache_ent = ent;
2062 spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
2063
2064 end:
2065 ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key);
2066 spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
2067 return ret;
2068 }
2069
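/*
 * Revoke a data-direct (crossed) MR: invalidate its mkey and then revoke the
 * underlying dmabuf umem. Called with data_direct_lock held.
 */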
2070 static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr)
2071 {
2072 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
2073 struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
2074 int err;
2075
2076 lockdep_assert_held(&dev->data_direct_lock);
2077 mr->revoked = true;
2078 err = mlx5r_umr_revoke_mr(mr);
2079 if (WARN_ON(err))
2080 return err;
2081
2082 ib_umem_dmabuf_revoke(umem_dmabuf);
2083 return 0;
2084 }
2085
2086 void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
2087 {
2088 struct mlx5_ib_mr *mr, *next;
2089
2090 lockdep_assert_held(&dev->data_direct_lock);
2091
2092 list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) {
2093 list_del(&mr->dd_node);
2094 mlx5_ib_revoke_data_direct_mr(mr);
2095 }
2096 }
2097
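/*
 * Revoke an mkey while holding the lock that serializes against ODP page
 * faults: the umem_mutex for ODP MRs, or the dmabuf reservation lock for
 * non-pinned dmabuf MRs. On success the umem's private back-pointer to the
 * MR is cleared so fault handlers stop using it.
 */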
2098 static int mlx5_umr_revoke_mr_with_lock(struct mlx5_ib_mr *mr)
2099 {
2100 bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
2101 !to_ib_umem_dmabuf(mr->umem)->pinned;
2102 bool is_odp = is_odp_mr(mr);
2103 int ret;
2104
2105 if (is_odp)
2106 mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
2107
2108 if (is_odp_dma_buf)
2109 dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv,
2110 NULL);
2111
2112 ret = mlx5r_umr_revoke_mr(mr);
2113
2114 if (is_odp) {
2115 if (!ret)
2116 to_ib_umem_odp(mr->umem)->private = NULL;
2117 mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
2118 }
2119
2120 if (is_odp_dma_buf) {
2121 if (!ret)
2122 to_ib_umem_dmabuf(mr->umem)->private = NULL;
2123 dma_resv_unlock(
2124 to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
2125 }
2126
2127 return ret;
2128 }
2129
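/*
 * Final mkey teardown: a cacheable mkey that can be revoked is pushed back
 * into the cache (scheduling cleanup of temporary entries), otherwise the
 * mkey is destroyed under the same ODP/dmabuf locking as the revoke path.
 */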
2130 static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr)
2131 {
2132 bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
2133 !to_ib_umem_dmabuf(mr->umem)->pinned;
2134 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
2135 struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
2136 bool is_odp = is_odp_mr(mr);
2137 bool from_cache = !!ent;
2138 int ret;
2139
2140 if (mr->mmkey.cacheable && !mlx5_umr_revoke_mr_with_lock(mr) &&
2141 !cache_ent_find_and_store(dev, mr)) {
2142 ent = mr->mmkey.cache_ent;
2143 /* upon storing to a clean temp entry - schedule its cleanup */
2144 spin_lock_irq(&ent->mkeys_queue.lock);
2145 if (from_cache)
2146 ent->in_use--;
2147 if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
2148 mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
2149 secs_to_jiffies(30));
2150 ent->tmp_cleanup_scheduled = true;
2151 }
2152 spin_unlock_irq(&ent->mkeys_queue.lock);
2153 return 0;
2154 }
2155
2156 if (ent) {
2157 spin_lock_irq(&ent->mkeys_queue.lock);
2158 ent->in_use--;
2159 mr->mmkey.cache_ent = NULL;
2160 spin_unlock_irq(&ent->mkeys_queue.lock);
2161 }
2162
2163 if (is_odp)
2164 mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
2165
2166 if (is_odp_dma_buf)
2167 dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv,
2168 NULL);
2169 ret = destroy_mkey(dev, mr);
2170 if (is_odp) {
2171 if (!ret)
2172 to_ib_umem_odp(mr->umem)->private = NULL;
2173 mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
2174 }
2175
2176 if (is_odp_dma_buf) {
2177 if (!ret)
2178 to_ib_umem_dmabuf(mr->umem)->private = NULL;
2179 dma_resv_unlock(
2180 to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
2181 }
2182 return ret;
2183 }
2184
2185 static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
2186 {
2187 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2188 struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
2189 int rc;
2190
2191 /*
2192 	 * Any async use of the MR must hold the refcount; once the refcount
2193 	 * goes to zero no other thread, such as ODP page faults, prefetch or any
2194 	 * UMR activity, can touch the mkey, so it is safe to destroy it.
2195 */
2196 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2197 refcount_read(&mr->mmkey.usecount) != 0 &&
2198 xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
2199 mlx5r_deref_wait_odp_mkey(&mr->mmkey);
2200
2201 if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
2202 xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2203 mr->sig, NULL, GFP_KERNEL);
2204
2205 if (mr->mtt_mr) {
2206 rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2207 if (rc)
2208 return rc;
2209 mr->mtt_mr = NULL;
2210 }
2211 if (mr->klm_mr) {
2212 rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2213 if (rc)
2214 return rc;
2215 mr->klm_mr = NULL;
2216 }
2217
2218 if (mlx5_core_destroy_psv(dev->mdev,
2219 mr->sig->psv_memory.psv_idx))
2220 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2221 mr->sig->psv_memory.psv_idx);
2222 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2223 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2224 mr->sig->psv_wire.psv_idx);
2225 kfree(mr->sig);
2226 mr->sig = NULL;
2227 }
2228
2229 /* Stop DMA */
2230 rc = mlx5r_handle_mkey_cleanup(mr);
2231 if (rc)
2232 return rc;
2233
2234 if (mr->umem) {
2235 bool is_odp = is_odp_mr(mr);
2236
2237 if (!is_odp)
2238 atomic_sub(ib_umem_num_pages(mr->umem),
2239 &dev->mdev->priv.reg_pages);
2240 ib_umem_release(mr->umem);
2241 if (is_odp)
2242 mlx5_ib_free_odp_mr(mr);
2243 }
2244
2245 if (!mr->mmkey.cache_ent)
2246 mlx5_free_priv_descs(mr);
2247
2248 kfree(mr);
2249 return 0;
2250 }
2251
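/*
 * Deregister a data-direct pair: destroy the crossing MR first, then take
 * data_direct_lock and destroy the crossed MR, removing it from the device's
 * data-direct list unless it was already revoked.
 */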
2252 static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev,
2253 struct mlx5_ib_mr *mr)
2254 {
2255 struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr;
2256 int ret;
2257
2258 ret = __mlx5_ib_dereg_mr(&mr->ibmr);
2259 if (ret)
2260 return ret;
2261
2262 mutex_lock(&dev->data_direct_lock);
2263 if (!dd_crossed_mr->revoked)
2264 list_del(&dd_crossed_mr->dd_node);
2265
2266 ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr);
2267 mutex_unlock(&dev->data_direct_lock);
2268 return ret;
2269 }
2270
2271 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
2272 {
2273 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2274 struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
2275
2276 if (mr->data_direct)
2277 return dereg_crossing_data_direct_mr(dev, mr);
2278
2279 return __mlx5_ib_dereg_mr(ibmr);
2280 }
2281
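/*
 * Prepare an mkey context in the "free" state with UMR enabled, so the
 * translation held in the private descriptors can be programmed later by a
 * registration (UMR) work request.
 */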
2282 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
2283 int access_mode, int page_shift)
2284 {
2285 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2286 void *mkc;
2287
2288 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2289
2290 /* This is only used from the kernel, so setting the PD is OK. */
2291 set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
2292 MLX5_SET(mkc, mkc, free, 1);
2293 MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2294 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
2295 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
2296 MLX5_SET(mkc, mkc, umr_en, 1);
2297 MLX5_SET(mkc, mkc, log_page_size, page_shift);
2298 if (access_mode == MLX5_MKC_ACCESS_MODE_PA ||
2299 access_mode == MLX5_MKC_ACCESS_MODE_MTT)
2300 MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
2301 }
2302
2303 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2304 int ndescs, int desc_size, int page_shift,
2305 int access_mode, u32 *in, int inlen)
2306 {
2307 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2308 int err;
2309
2310 mr->access_mode = access_mode;
2311 mr->desc_size = desc_size;
2312 mr->max_descs = ndescs;
2313
2314 err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
2315 if (err)
2316 return err;
2317
2318 mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
2319
2320 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
2321 if (err)
2322 goto err_free_descs;
2323
2324 mr->mmkey.type = MLX5_MKEY_MR;
2325 mr->ibmr.lkey = mr->mmkey.key;
2326 mr->ibmr.rkey = mr->mmkey.key;
2327
2328 return 0;
2329
2330 err_free_descs:
2331 mlx5_free_priv_descs(mr);
2332 return err;
2333 }
2334
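/*
 * Allocate an internal protection-information MR (KLM or MTT based) that
 * backs an integrity MR; it has no umem of its own and only carries a
 * descriptor list sized for the data plus metadata SG entries.
 */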
2335 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
2336 u32 max_num_sg, u32 max_num_meta_sg,
2337 int desc_size, int access_mode)
2338 {
2339 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2340 int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
2341 int page_shift = 0;
2342 struct mlx5_ib_mr *mr;
2343 u32 *in;
2344 int err;
2345
2346 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2347 if (!mr)
2348 return ERR_PTR(-ENOMEM);
2349
2350 mr->ibmr.pd = pd;
2351 mr->ibmr.device = pd->device;
2352
2353 in = kzalloc(inlen, GFP_KERNEL);
2354 if (!in) {
2355 err = -ENOMEM;
2356 goto err_free;
2357 }
2358
2359 if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
2360 page_shift = PAGE_SHIFT;
2361
2362 err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
2363 access_mode, in, inlen);
2364 if (err)
2365 goto err_free_in;
2366
2367 mr->umem = NULL;
2368 kfree(in);
2369
2370 return mr;
2371
2372 err_free_in:
2373 kfree(in);
2374 err_free:
2375 kfree(mr);
2376 return ERR_PTR(err);
2377 }
2378
2379 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2380 int ndescs, u32 *in, int inlen)
2381 {
2382 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
2383 PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
2384 inlen);
2385 }
2386
2387 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2388 int ndescs, u32 *in, int inlen)
2389 {
2390 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
2391 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2392 }
2393
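/*
 * Build an integrity (signature) MR: allocate the sig context with memory
 * and wire PSVs, the internal KLM and MTT PI MRs, and finally a small KLM
 * mkey with BSF enabled that ties them together.
 */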
2394 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2395 int max_num_sg, int max_num_meta_sg,
2396 u32 *in, int inlen)
2397 {
2398 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2399 u32 psv_index[2];
2400 void *mkc;
2401 int err;
2402
2403 mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
2404 if (!mr->sig)
2405 return -ENOMEM;
2406
2407 /* create mem & wire PSVs */
2408 err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
2409 if (err)
2410 goto err_free_sig;
2411
2412 mr->sig->psv_memory.psv_idx = psv_index[0];
2413 mr->sig->psv_wire.psv_idx = psv_index[1];
2414
2415 mr->sig->sig_status_checked = true;
2416 mr->sig->sig_err_exists = false;
2417 /* Next UMR, Arm SIGERR */
2418 ++mr->sig->sigerr_count;
2419 mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2420 sizeof(struct mlx5_klm),
2421 MLX5_MKC_ACCESS_MODE_KLMS);
2422 if (IS_ERR(mr->klm_mr)) {
2423 err = PTR_ERR(mr->klm_mr);
2424 goto err_destroy_psv;
2425 }
2426 mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2427 sizeof(struct mlx5_mtt),
2428 MLX5_MKC_ACCESS_MODE_MTT);
2429 if (IS_ERR(mr->mtt_mr)) {
2430 err = PTR_ERR(mr->mtt_mr);
2431 goto err_free_klm_mr;
2432 }
2433
2434 /* Set bsf descriptors for mkey */
2435 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2436 MLX5_SET(mkc, mkc, bsf_en, 1);
2437 MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
2438
2439 err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
2440 MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2441 if (err)
2442 goto err_free_mtt_mr;
2443
2444 err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2445 mr->sig, GFP_KERNEL));
2446 if (err)
2447 goto err_free_descs;
2448 return 0;
2449
2450 err_free_descs:
2451 destroy_mkey(dev, mr);
2452 mlx5_free_priv_descs(mr);
2453 err_free_mtt_mr:
2454 mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2455 mr->mtt_mr = NULL;
2456 err_free_klm_mr:
2457 mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2458 mr->klm_mr = NULL;
2459 err_destroy_psv:
2460 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
2461 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2462 mr->sig->psv_memory.psv_idx);
2463 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2464 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2465 mr->sig->psv_wire.psv_idx);
2466 err_free_sig:
2467 kfree(mr->sig);
2468
2469 return err;
2470 }
2471
2472 static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
2473 enum ib_mr_type mr_type, u32 max_num_sg,
2474 u32 max_num_meta_sg)
2475 {
2476 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2477 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2478 int ndescs = ALIGN(max_num_sg, 4);
2479 struct mlx5_ib_mr *mr;
2480 u32 *in;
2481 int err;
2482
2483 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2484 if (!mr)
2485 return ERR_PTR(-ENOMEM);
2486
2487 in = kzalloc(inlen, GFP_KERNEL);
2488 if (!in) {
2489 err = -ENOMEM;
2490 goto err_free;
2491 }
2492
2493 mr->ibmr.device = pd->device;
2494 mr->umem = NULL;
2495
2496 switch (mr_type) {
2497 case IB_MR_TYPE_MEM_REG:
2498 err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
2499 break;
2500 case IB_MR_TYPE_SG_GAPS:
2501 err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
2502 break;
2503 case IB_MR_TYPE_INTEGRITY:
2504 err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
2505 max_num_meta_sg, in, inlen);
2506 break;
2507 default:
2508 mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
2509 err = -EINVAL;
2510 }
2511
2512 if (err)
2513 goto err_free_in;
2514
2515 kfree(in);
2516
2517 return &mr->ibmr;
2518
2519 err_free_in:
2520 kfree(in);
2521 err_free:
2522 kfree(mr);
2523 return ERR_PTR(err);
2524 }
2525
2526 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
2527 u32 max_num_sg)
2528 {
2529 return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
2530 }
2531
2532 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
2533 u32 max_num_sg, u32 max_num_meta_sg)
2534 {
2535 return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
2536 max_num_meta_sg);
2537 }
2538
2539 int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
2540 {
2541 struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
2542 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2543 struct mlx5_ib_mw *mw = to_mmw(ibmw);
2544 unsigned int ndescs;
2545 u32 *in = NULL;
2546 void *mkc;
2547 int err;
2548 struct mlx5_ib_alloc_mw req = {};
2549 struct {
2550 __u32 comp_mask;
2551 __u32 response_length;
2552 } resp = {};
2553
2554 err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
2555 if (err)
2556 return err;
2557
2558 if (req.comp_mask || req.reserved1 || req.reserved2)
2559 return -EOPNOTSUPP;
2560
2561 if (udata->inlen > sizeof(req) &&
2562 !ib_is_udata_cleared(udata, sizeof(req),
2563 udata->inlen - sizeof(req)))
2564 return -EOPNOTSUPP;
2565
2566 ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
2567
2568 in = kzalloc(inlen, GFP_KERNEL);
2569 if (!in)
2570 return -ENOMEM;
2571
2572 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2573
2574 MLX5_SET(mkc, mkc, free, 1);
2575 MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2576 MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
2577 MLX5_SET(mkc, mkc, umr_en, 1);
2578 MLX5_SET(mkc, mkc, lr, 1);
2579 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
2580 MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
2581 MLX5_SET(mkc, mkc, qpn, 0xffffff);
2582
2583 err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
2584 if (err)
2585 goto free;
2586
2587 mw->mmkey.type = MLX5_MKEY_MW;
2588 ibmw->rkey = mw->mmkey.key;
2589 mw->mmkey.ndescs = ndescs;
2590
2591 resp.response_length =
2592 min(offsetofend(typeof(resp), response_length), udata->outlen);
2593 if (resp.response_length) {
2594 err = ib_copy_to_udata(udata, &resp, resp.response_length);
2595 if (err)
2596 goto free_mkey;
2597 }
2598
2599 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2600 err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
2601 if (err)
2602 goto free_mkey;
2603 }
2604
2605 kfree(in);
2606 return 0;
2607
2608 free_mkey:
2609 mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
2610 free:
2611 kfree(in);
2612 return err;
2613 }
2614
2615 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
2616 {
2617 struct mlx5_ib_dev *dev = to_mdev(mw->device);
2618 struct mlx5_ib_mw *mmw = to_mmw(mw);
2619
2620 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2621 xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
2622 /*
2623 * pagefault_single_data_segment() may be accessing mmw
2624 * if the user bound an ODP MR to this MW.
2625 */
2626 mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
2627
2628 return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
2629 }
2630
2631 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
2632 struct ib_mr_status *mr_status)
2633 {
2634 struct mlx5_ib_mr *mmr = to_mmr(ibmr);
2635 int ret = 0;
2636
2637 if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
2638 pr_err("Invalid status check mask\n");
2639 ret = -EINVAL;
2640 goto done;
2641 }
2642
2643 mr_status->fail_status = 0;
2644 if (check_mask & IB_MR_CHECK_SIG_STATUS) {
2645 if (!mmr->sig) {
2646 ret = -EINVAL;
2647 pr_err("signature status check requested on a non-signature enabled MR\n");
2648 goto done;
2649 }
2650
2651 mmr->sig->sig_status_checked = true;
2652 if (!mmr->sig->sig_err_exists)
2653 goto done;
2654
2655 if (ibmr->lkey == mmr->sig->err_item.key)
2656 memcpy(&mr_status->sig_err, &mmr->sig->err_item,
2657 sizeof(mr_status->sig_err));
2658 else {
2659 mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
2660 mr_status->sig_err.sig_err_offset = 0;
2661 mr_status->sig_err.key = mmr->sig->err_item.key;
2662 }
2663
2664 mmr->sig->sig_err_exists = false;
2665 mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2666 }
2667
2668 done:
2669 return ret;
2670 }
2671
2672 static int
2673 mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2674 int data_sg_nents, unsigned int *data_sg_offset,
2675 struct scatterlist *meta_sg, int meta_sg_nents,
2676 unsigned int *meta_sg_offset)
2677 {
2678 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2679 unsigned int sg_offset = 0;
2680 int n = 0;
2681
2682 mr->meta_length = 0;
2683 if (data_sg_nents == 1) {
2684 n++;
2685 mr->mmkey.ndescs = 1;
2686 if (data_sg_offset)
2687 sg_offset = *data_sg_offset;
2688 mr->data_length = sg_dma_len(data_sg) - sg_offset;
2689 mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2690 if (meta_sg_nents == 1) {
2691 n++;
2692 mr->meta_ndescs = 1;
2693 if (meta_sg_offset)
2694 sg_offset = *meta_sg_offset;
2695 else
2696 sg_offset = 0;
2697 mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2698 mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2699 }
2700 ibmr->length = mr->data_length + mr->meta_length;
2701 }
2702
2703 return n;
2704 }
2705
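/*
 * Translate the data (and optional metadata) SG lists into KLM descriptors;
 * each KLM carries the DMA address, byte count and the PD's local_dma_lkey,
 * so no per-page translation is needed.
 */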
2706 static int
2707 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2708 struct scatterlist *sgl,
2709 unsigned short sg_nents,
2710 unsigned int *sg_offset_p,
2711 struct scatterlist *meta_sgl,
2712 unsigned short meta_sg_nents,
2713 unsigned int *meta_sg_offset_p)
2714 {
2715 struct scatterlist *sg = sgl;
2716 struct mlx5_klm *klms = mr->descs;
2717 unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2718 u32 lkey = mr->ibmr.pd->local_dma_lkey;
2719 int i, j = 0;
2720
2721 mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2722 mr->ibmr.length = 0;
2723
2724 for_each_sg(sgl, sg, sg_nents, i) {
2725 if (unlikely(i >= mr->max_descs))
2726 break;
2727 klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2728 klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2729 klms[i].key = cpu_to_be32(lkey);
2730 mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2731
2732 sg_offset = 0;
2733 }
2734
2735 if (sg_offset_p)
2736 *sg_offset_p = sg_offset;
2737
2738 mr->mmkey.ndescs = i;
2739 mr->data_length = mr->ibmr.length;
2740
2741 if (meta_sg_nents) {
2742 sg = meta_sgl;
2743 sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2744 for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2745 if (unlikely(i + j >= mr->max_descs))
2746 break;
2747 klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2748 sg_offset);
2749 klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2750 sg_offset);
2751 klms[i + j].key = cpu_to_be32(lkey);
2752 mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2753
2754 sg_offset = 0;
2755 }
2756 if (meta_sg_offset_p)
2757 *meta_sg_offset_p = sg_offset;
2758
2759 mr->meta_ndescs = j;
2760 mr->meta_length = mr->ibmr.length - mr->data_length;
2761 }
2762
2763 return i + j;
2764 }
2765
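/*
 * ib_sg_to_pages() callback: store one page address per descriptor with the
 * read/write enable bits set, failing once the descriptor list is full.
 */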
2766 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2767 {
2768 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2769 __be64 *descs;
2770
2771 if (unlikely(mr->mmkey.ndescs == mr->max_descs))
2772 return -ENOMEM;
2773
2774 descs = mr->descs;
2775 descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2776
2777 return 0;
2778 }
2779
2780 static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2781 {
2782 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2783 __be64 *descs;
2784
2785 if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
2786 return -ENOMEM;
2787
2788 descs = mr->descs;
2789 descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
2790 cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2791
2792 return 0;
2793 }
2794
2795 static int
2796 mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2797 int data_sg_nents, unsigned int *data_sg_offset,
2798 struct scatterlist *meta_sg, int meta_sg_nents,
2799 unsigned int *meta_sg_offset)
2800 {
2801 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2802 struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2803 int n;
2804
2805 pi_mr->mmkey.ndescs = 0;
2806 pi_mr->meta_ndescs = 0;
2807 pi_mr->meta_length = 0;
2808
2809 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2810 pi_mr->desc_size * pi_mr->max_descs,
2811 DMA_TO_DEVICE);
2812
2813 pi_mr->ibmr.page_size = ibmr->page_size;
2814 n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2815 mlx5_set_page);
2816 if (n != data_sg_nents)
2817 return n;
2818
2819 pi_mr->data_iova = pi_mr->ibmr.iova;
2820 pi_mr->data_length = pi_mr->ibmr.length;
2821 pi_mr->ibmr.length = pi_mr->data_length;
2822 ibmr->length = pi_mr->data_length;
2823
2824 if (meta_sg_nents) {
2825 u64 page_mask = ~((u64)ibmr->page_size - 1);
2826 u64 iova = pi_mr->data_iova;
2827
2828 n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2829 meta_sg_offset, mlx5_set_page_pi);
2830
2831 pi_mr->meta_length = pi_mr->ibmr.length;
2832 /*
2833 * PI address for the HW is the offset of the metadata address
2834 * relative to the first data page address.
2835 		 * It equals the first data page address + size of data pages +
2836 * metadata offset at the first metadata page
2837 */
2838 pi_mr->pi_iova = (iova & page_mask) +
2839 pi_mr->mmkey.ndescs * ibmr->page_size +
2840 (pi_mr->ibmr.iova & ~page_mask);
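		/*
		 * For example, with a 4K page size, data starting at a
		 * page-aligned iova of 0x10000 and spanning 8 descriptors,
		 * and metadata beginning 0x100 bytes into its first page:
		 * pi_iova = 0x10000 + 8 * 0x1000 + 0x100 = 0x18100.
		 */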
2841 /*
2842 		 * In order to use one MTT MR for data and metadata, we also register
2843 		 * the gaps between the end of the data and the start of
2844 		 * the metadata (the sig MR will verify that the HW accesses
2845 		 * the right addresses). This mapping is safe because we use an
2846 		 * internal mkey for the registration.
2847 */
2848 pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2849 pi_mr->ibmr.iova = iova;
2850 ibmr->length += pi_mr->meta_length;
2851 }
2852
2853 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2854 pi_mr->desc_size * pi_mr->max_descs,
2855 DMA_TO_DEVICE);
2856
2857 return n;
2858 }
2859
2860 static int
2861 mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2862 int data_sg_nents, unsigned int *data_sg_offset,
2863 struct scatterlist *meta_sg, int meta_sg_nents,
2864 unsigned int *meta_sg_offset)
2865 {
2866 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2867 struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2868 int n;
2869
2870 pi_mr->mmkey.ndescs = 0;
2871 pi_mr->meta_ndescs = 0;
2872 pi_mr->meta_length = 0;
2873
2874 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2875 pi_mr->desc_size * pi_mr->max_descs,
2876 DMA_TO_DEVICE);
2877
2878 n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2879 meta_sg, meta_sg_nents, meta_sg_offset);
2880
2881 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2882 pi_mr->desc_size * pi_mr->max_descs,
2883 DMA_TO_DEVICE);
2884
2885 	/* This is a zero-based memory region */
2886 pi_mr->data_iova = 0;
2887 pi_mr->ibmr.iova = 0;
2888 pi_mr->pi_iova = pi_mr->data_length;
2889 ibmr->length = pi_mr->ibmr.length;
2890
2891 return n;
2892 }
2893
2894 int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2895 int data_sg_nents, unsigned int *data_sg_offset,
2896 struct scatterlist *meta_sg, int meta_sg_nents,
2897 unsigned int *meta_sg_offset)
2898 {
2899 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2900 struct mlx5_ib_mr *pi_mr = NULL;
2901 int n;
2902
2903 WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2904
2905 mr->mmkey.ndescs = 0;
2906 mr->data_length = 0;
2907 mr->data_iova = 0;
2908 mr->meta_ndescs = 0;
2909 mr->pi_iova = 0;
2910 /*
2911 * As a performance optimization, if possible, there is no need to
2912 	 * perform a UMR operation to register the data/metadata buffers.
2913 	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
2914 	 * Fall back to UMR only in case of a failure.
2915 */
2916 n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2917 data_sg_offset, meta_sg, meta_sg_nents,
2918 meta_sg_offset);
2919 if (n == data_sg_nents + meta_sg_nents)
2920 goto out;
2921 /*
2922 * As a performance optimization, if possible, there is no need to map
2923 * the sg lists to KLM descriptors. First try to map the sg lists to MTT
2924 	 * descriptors and fall back to KLM only in case of a failure.
2925 	 * It's more efficient for the HW to work with MTT descriptors
2926 	 * (especially under high load).
2927 * Use KLM (indirect access) only if it's mandatory.
2928 */
2929 pi_mr = mr->mtt_mr;
2930 n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2931 data_sg_offset, meta_sg, meta_sg_nents,
2932 meta_sg_offset);
2933 if (n == data_sg_nents + meta_sg_nents)
2934 goto out;
2935
2936 pi_mr = mr->klm_mr;
2937 n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2938 data_sg_offset, meta_sg, meta_sg_nents,
2939 meta_sg_offset);
2940 if (unlikely(n != data_sg_nents + meta_sg_nents))
2941 return -ENOMEM;
2942
2943 out:
2944 	/* This is a zero-based memory region */
2945 ibmr->iova = 0;
2946 mr->pi_mr = pi_mr;
2947 if (pi_mr)
2948 ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2949 else
2950 ibmr->sig_attrs->meta_length = mr->meta_length;
2951
2952 return 0;
2953 }
2954
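/*
 * Standard map_mr_sg: fill the MR's descriptor buffer from the SG list,
 * using KLMs when the MR is in KLM access mode (e.g. SG_GAPS) and page
 * lists otherwise, with DMA syncs for CPU access around the update.
 */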
2955 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2956 unsigned int *sg_offset)
2957 {
2958 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2959 int n;
2960
2961 mr->mmkey.ndescs = 0;
2962
2963 ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2964 mr->desc_size * mr->max_descs,
2965 DMA_TO_DEVICE);
2966
2967 if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2968 n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2969 NULL);
2970 else
2971 n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2972 mlx5_set_page);
2973
2974 ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2975 mr->desc_size * mr->max_descs,
2976 DMA_TO_DEVICE);
2977
2978 return n;
2979 }
2980