1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4 */
5
6 #include "cmd.h"
7
8 enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };
9
/*
 * Check whether the function identified by @func_id exposes the
 * 'migratable' HCA capability (general caps, group 2).
 *
 * Return: 0 if migratable, -EOPNOTSUPP if the cap bit is clear, -ENOMEM
 * on allocation failure, or a negative errno from the capability query.
 */
static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
{
	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
	void *query_cap = NULL, *cap;
	int ret;

	query_cap = kzalloc(query_sz, GFP_KERNEL);
	if (!query_cap)
		return -ENOMEM;

	/* Query the *other* function's caps on its behalf (PF context) */
	ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
					    MLX5_CAP_GENERAL_2);
	if (ret)
		goto out;

	cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
	if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
		ret = -EOPNOTSUPP;
out:
	kfree(query_cap);
	return ret;
}
32
33 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
34 u16 *vhca_id);
35 static void
36 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
37
/*
 * Issue the SUSPEND_VHCA command for this VF. @op_mod selects the
 * suspend flavour as defined by the device interface.
 *
 * Caller must hold mvdev->state_mutex; returns -ENOTCONN once the PF
 * driver has detached from the VF.
 */
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while the device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel, might end-up with a failure in the save
	 * command once it will try to turn on 'tracking' on a suspended device.
	 */
	if (migf) {
		/* save_comp acts as the "no SAVE in flight" gate */
		err = wait_for_completion_interruptible(&migf->save_comp);
		if (err)
			return err;
	}

	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);

	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
	if (migf)
		complete(&migf->save_comp);	/* re-open the gate */

	return err;
}
71
/*
 * Issue the RESUME_VHCA command for this VF; @op_mod selects the resume
 * flavour. Caller must hold mvdev->state_mutex; -ENOTCONN once the PF
 * driver has detached.
 */
int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);

	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
}
87
/*
 * Query the size of the device migration state.
 *
 * @state_size:  out - umem size required for the next SAVE image.
 * @total_size:  optional out - in chunk mode, the remaining total size
 *               reported by the device; otherwise mirrors *state_size.
 * @query_flags: MLX5VF_QUERY_INC for an incremental (dirty) query,
 *               plus FINAL/CLEANUP modifiers for the PRE_COPY flow.
 *
 * Caller must hold mvdev->state_mutex. For incremental queries the
 * saving migration file must exist (PRE_COPY active), since its
 * save_comp is used to serialize against an in-flight SAVE command.
 */
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
					  size_t *state_size, u64 *total_size,
					  u8 query_flags)
{
	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
	bool inc = query_flags & MLX5VF_QUERY_INC;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel, might end-up with a failure in the
	 * incremental query command on un-tracked vhca.
	 */
	if (inc) {
		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
		if (ret)
			return ret;
		/* Upon cleanup, ignore previous pre_copy error state */
		if (mvdev->saving_migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR &&
		    !(query_flags & MLX5VF_QUERY_CLEANUP)) {
			/*
			 * In case we had a PRE_COPY error, only query full
			 * image for final image
			 */
			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
				*state_size = 0;
				complete(&mvdev->saving_migf->save_comp);
				return 0;
			}
			/* Fall through to a full (non-incremental) query */
			query_flags &= ~MLX5VF_QUERY_INC;
		}
		/* Block incremental query which is state-dependent */
		if (mvdev->saving_migf->state == MLX5_MIGF_STATE_ERROR) {
			complete(&mvdev->saving_migf->save_comp);
			return -ENODEV;
		}
	}

	MLX5_SET(query_vhca_migration_state_in, in, opcode,
		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
	MLX5_SET(query_vhca_migration_state_in, in, incremental,
		 query_flags & MLX5VF_QUERY_INC);
	MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode);

	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
				  out);
	if (inc)
		complete(&mvdev->saving_migf->save_comp);

	if (ret)
		return ret;

	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
			       required_umem_size);
	if (total_size)
		*total_size = mvdev->chunk_mode ?
			MLX5_GET64(query_vhca_migration_state_out, out,
				   remaining_total_size) : *state_size;

	return 0;
}
157
/* Flag that the tracker object changed and wake up its event waiter. */
static void set_tracker_change_event(struct mlx5vf_pci_core_device *mvdev)
{
	mvdev->tracker.object_changed = true;
	complete(&mvdev->tracker_comp);
}
163
static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
{
	/* Mark the tracker under an error and wake it up if it's running */
	mvdev->tracker.is_err = true;
	complete(&mvdev->tracker_comp);
}
170
/*
 * SR-IOV blocking-notifier callback: track PF-side enable/disable of
 * this VF so commands are not issued once the mdev is gone
 * (mdev_detach is consulted under state_mutex by all command paths).
 */
static int mlx5fv_vf_event(struct notifier_block *nb,
			   unsigned long event, void *data)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(nb, struct mlx5vf_pci_core_device, nb);

	switch (event) {
	case MLX5_PF_NOTIFY_ENABLE_VF:
		mutex_lock(&mvdev->state_mutex);
		mvdev->mdev_detach = false;
		mlx5vf_state_mutex_unlock(mvdev);
		break;
	case MLX5_PF_NOTIFY_DISABLE_VF:
		/* Tear down migration state before marking mdev detached */
		mlx5vf_cmd_close_migratable(mvdev);
		mutex_lock(&mvdev->state_mutex);
		mvdev->mdev_detach = true;
		mlx5vf_state_mutex_unlock(mvdev);
		break;
	default:
		break;
	}

	return 0;
}
195
/*
 * Quiesce all migration activity for this device: abort the dirty-page
 * tracker, close any open migration files and free tracker resources.
 * Safe to call when migration was never enabled (no-op).
 */
void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	/* Must be done outside the lock to let it progress */
	set_tracker_error(mvdev);
	mutex_lock(&mvdev->state_mutex);
	mlx5vf_disable_fds(mvdev, NULL);
	_mlx5vf_free_page_tracker_resources(mvdev);
	mlx5vf_state_mutex_unlock(mvdev);
}
208
/*
 * Undo mlx5vf_cmd_set_migratable(): unregister the SR-IOV notifier and
 * destroy the callback workqueue. No-op when migration was not enabled.
 */
void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
						&mvdev->nb);
	destroy_workqueue(mvdev->cb_wq);
}
218
/*
 * Probe-time setup of migration support for a VF.
 *
 * Checks the required firmware capabilities, resolves the VF's vhca_id,
 * allocates the async-callback workqueue, and registers for PF
 * enable/disable notifications. On success sets migrate_cap and wires
 * the vfio migration (and optionally dirty-logging) ops; on any failure
 * it silently leaves the device non-migratable.
 *
 * The core-dev reference taken via mlx5_vf_get_core_dev() is always
 * dropped at the end - command paths re-validate via mdev_detach.
 */
void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
			       const struct vfio_migration_ops *mig_ops,
			       const struct vfio_log_ops *log_ops)
{
	struct pci_dev *pdev = mvdev->core_device.pdev;
	int ret;

	if (!pdev->is_virtfn)
		return;

	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
	if (!mvdev->mdev)
		return;

	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
		goto end;

	if (!(MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
	      MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)))
		goto end;

	mvdev->vf_id = pci_iov_vf_id(pdev);
	if (mvdev->vf_id < 0)
		goto end;

	/* Firmware function ids are 1-based for VFs (vf_id + 1) */
	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
	if (ret)
		goto end;

	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
				   &mvdev->vhca_id))
		goto end;

	/* Ordered: async save callbacks must run one at a time */
	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
	if (!mvdev->cb_wq)
		goto end;

	mutex_init(&mvdev->state_mutex);
	spin_lock_init(&mvdev->reset_lock);
	mvdev->nb.notifier_call = mlx5fv_vf_event;
	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
						    &mvdev->nb);
	if (ret) {
		destroy_workqueue(mvdev->cb_wq);
		goto end;
	}

	mvdev->migrate_cap = 1;
	mvdev->core_device.vdev.migration_flags =
		VFIO_MIGRATION_STOP_COPY |
		VFIO_MIGRATION_P2P |
		VFIO_MIGRATION_PRE_COPY;

	mvdev->core_device.vdev.mig_ops = mig_ops;
	init_completion(&mvdev->tracker_comp);
	/* Dirty-page tracking requires the adv_virtualization cap */
	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
		mvdev->core_device.vdev.log_ops = log_ops;

	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
		mvdev->chunk_mode = 1;

end:
	mlx5_vf_put_core_dev(mvdev->mdev);
}
283
/*
 * Read the vhca_id of another function (@function_id) via
 * QUERY_HCA_CAP with other_function set.
 *
 * Return: 0 on success with *vhca_id filled, negative errno otherwise.
 */
static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id)
{
	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
	int out_size;
	void *out;
	int ret;

	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
	out = kzalloc(out_size, GFP_KERNEL);
	if (!out)
		return -ENOMEM;

	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
	MLX5_SET(query_hca_cap_in, in, other_function, 1);
	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
	/* op_mod: general device caps, current (not max) values */
	MLX5_SET(query_hca_cap_in, in, op_mod,
		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
		 HCA_CAP_OPMOD_GET_CUR);

	ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
	if (ret)
		goto err_exec;

	*vhca_id = MLX5_GET(query_hca_cap_out, out,
			    capability.cmd_hca_cap.vhca_id);

err_exec:
	kfree(out);
	return ret;
}
315
/*
 * Create an MTT-based mkey covering either a migration data buffer
 * (@buf, DMA addresses taken from its sg table) or a page-tracker
 * receive buffer (@recv_buf, pre-mapped dma_addrs array). Exactly one
 * of @buf / @recv_buf is expected to be non-NULL.
 *
 * The mkey grants local read/write on @pdn; *mkey receives the new key.
 */
static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
			struct mlx5_vhca_data_buffer *buf,
			struct mlx5_vhca_recv_buf *recv_buf,
			u32 *mkey)
{
	size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
				recv_buf->npages;
	int err = 0, inlen;
	__be64 *mtt;
	void *mkc;
	u32 *in;

	/* MTT entries are written in pairs; round up to an even count */
	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
		sizeof(*mtt) * round_up(npages, 2);

	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
		 DIV_ROUND_UP(npages, 2));
	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);

	if (buf) {
		struct sg_dma_page_iter dma_iter;

		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
	} else {
		int i;

		for (i = 0; i < npages; i++)
			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);	/* no QP association */
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
	kvfree(in);
	return err;
}
367
/*
 * DMA-map a data buffer's sg table and create its mkey. Marks the
 * buffer 'dmaed' on success; a buffer may be mapped only once and must
 * have pages allocated. Caller must hold the state mutex.
 */
static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
	struct mlx5_core_dev *mdev = mvdev->mdev;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	if (buf->dmaed || !buf->allocated_length)
		return -EINVAL;

	ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
	if (ret)
		return ret;

	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
	if (ret)
		goto err;

	buf->dmaed = true;

	return 0;
err:
	/* Unwind the mapping if mkey creation failed */
	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
	return ret;
}
396
/*
 * Fully release a data buffer: destroy its mkey and DMA mapping (if it
 * was mapped), free all backing pages and the buffer itself. Caller
 * must hold the state mutex and the mdev must still be attached.
 */
void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5_vf_migration_file *migf = buf->migf;
	struct sg_page_iter sg_iter;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	if (buf->dmaed) {
		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
				  buf->dma_dir, 0);
	}

	/* Undo alloc_pages_bulk_array() */
	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
		__free_page(sg_page_iter_page(&sg_iter));
	sg_free_append_table(&buf->table);
	kfree(buf);
}
417
/*
 * Grow @buf by @npages pages, allocating in bulk batches (at most one
 * page's worth of pointers per batch) and appending them to the
 * buffer's sg append-table. allocated_length is advanced per batch.
 *
 * On failure, pages of the current failed batch are freed here; pages
 * already appended in earlier batches remain owned by the sg table and
 * are released by mlx5vf_free_data_buffer().
 */
static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
				      unsigned int npages)
{
	unsigned int to_alloc = npages;
	struct page **page_list;
	unsigned long filled;
	unsigned int to_fill;
	int ret;
	int i;

	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
	if (!page_list)
		return -ENOMEM;

	do {
		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
						page_list);
		if (!filled) {
			ret = -ENOMEM;
			goto err;
		}
		to_alloc -= filled;
		ret = sg_alloc_append_table_from_pages(
			&buf->table, page_list, filled, 0,
			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
			GFP_KERNEL_ACCOUNT);

		if (ret)
			goto err_append;
		buf->allocated_length += filled * PAGE_SIZE;
		/* clean input for another bulk allocation */
		memset(page_list, 0, filled * sizeof(*page_list));
		to_fill = min_t(unsigned int, to_alloc,
				PAGE_SIZE / sizeof(*page_list));
	} while (to_alloc > 0);

	kvfree(page_list);
	return 0;

err_append:
	/* These pages were never added to the sg table - free them here */
	for (i = filled - 1; i >= 0; i--)
		__free_page(page_list[i]);
err:
	kvfree(page_list);
	return ret;
}
465
/*
 * Allocate a migration data buffer of at least @length bytes.
 * Pages are allocated immediately; the buffer is also DMA-mapped and
 * given an mkey unless @dma_dir is DMA_NONE (e.g. header buffers that
 * are only CPU-touched). A zero @length yields an empty shell buffer.
 *
 * Return: the buffer, or ERR_PTR on failure (partial state is freed).
 */
struct mlx5_vhca_data_buffer *
mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
			 size_t length,
			 enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
	if (!buf)
		return ERR_PTR(-ENOMEM);

	buf->dma_dir = dma_dir;
	buf->migf = migf;
	if (length) {
		ret = mlx5vf_add_migration_pages(buf,
				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
		if (ret)
			goto end;

		if (dma_dir != DMA_NONE) {
			ret = mlx5vf_dma_data_buffer(buf);
			if (ret)
				goto end;
		}
	}

	return buf;
end:
	mlx5vf_free_data_buffer(buf);
	return ERR_PTR(ret);
}
498
/*
 * Return a buffer to the migration file's avail_list for reuse by
 * mlx5vf_get_data_buffer(), resetting its chunk number.
 */
void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	spin_lock_irq(&buf->migf->list_lock);
	buf->stop_copy_chunk_num = 0;
	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
	spin_unlock_irq(&buf->migf->list_lock);
}
506
/*
 * Get a data buffer of at least @length bytes with DMA direction
 * @dma_dir: first try to reuse one from the migration file's
 * avail_list, otherwise allocate a fresh one. Undersized same-direction
 * buffers encountered while searching are freed (outside the spinlock,
 * since freeing may sleep) rather than kept around.
 *
 * Caller must hold the state mutex. Return: buffer or ERR_PTR.
 */
struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
		       size_t length, enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf, *temp_buf;
	struct list_head free_list;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return ERR_PTR(-ENOTCONN);

	INIT_LIST_HEAD(&free_list);

	spin_lock_irq(&migf->list_lock);
	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
		if (buf->dma_dir == dma_dir) {
			list_del_init(&buf->buf_elm);
			if (buf->allocated_length >= length) {
				spin_unlock_irq(&migf->list_lock);
				goto found;
			}
			/*
			 * Prevent holding redundant buffers. Put in a free
			 * list and call at the end not under the spin lock
			 * (&migf->list_lock) to mlx5vf_free_data_buffer which
			 * might sleep.
			 */
			list_add(&buf->buf_elm, &free_list);
		}
	}
	spin_unlock_irq(&migf->list_lock);
	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);

found:
	/* free_list is private here - no lock needed */
	while ((temp_buf = list_first_entry_or_null(&free_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&temp_buf->buf_elm);
		mlx5vf_free_data_buffer(temp_buf);
	}

	return buf;
}
549
/*
 * Common tail of a SAVE command completion: release the command output
 * buffer, re-open the save_comp gate for the next SAVE/suspend/query,
 * and drop the file reference taken when the command was issued.
 */
static void
mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
			      struct mlx5vf_async_data *async_data)
{
	kvfree(async_data->out);
	complete(&migf->save_comp);
	fput(migf->filp);
}
558
/*
 * Workqueue handler for a failed SAVE command (queued from
 * mlx5vf_save_callback, which cannot run this from its own context).
 * Returns the data/header buffers to the avail pool, records the
 * resulting migf state and wakes any poll() waiters.
 */
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
	struct mlx5vf_async_data *async_data = container_of(_work,
		struct mlx5vf_async_data, work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
		struct mlx5_vf_migration_file, async_data);

	mutex_lock(&migf->lock);
	if (async_data->status) {
		mlx5vf_put_data_buffer(async_data->buf);
		if (async_data->header_buf)
			mlx5vf_put_data_buffer(async_data->header_buf);
		/*
		 * BAD_RES_STATE during pre-copy is recoverable (final image
		 * can still be saved); anything else is a hard error.
		 */
		if (!async_data->stop_copy_chunk &&
		    async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
		else
			migf->state = MLX5_MIGF_STATE_ERROR;
		wake_up_interruptible(&migf->poll_wait);
	}
	mutex_unlock(&migf->lock);
	mlx5vf_save_callback_complete(migf, async_data);
}
581
/*
 * Fill @header_buf with a migration stream record header describing an
 * FW_DATA record of @image_size bytes, then publish it on the migf
 * buf_list at the current stream position. The header is written
 * through a CPU mapping of the buffer's first page (header_buf is
 * DMA_NONE).
 *
 * Return: 0 on success, -EINVAL if the buffer has no first page.
 */
static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
			  size_t image_size, bool initial_pre_copy)
{
	struct mlx5_vf_migration_file *migf = header_buf->migf;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;

	header.record_size = cpu_to_le64(image_size);
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	kunmap_local(to_buff);
	header_buf->length = sizeof(header);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	if (initial_pre_copy)
		migf->pre_copy_initial_bytes += sizeof(header);
	return 0;
}
610
/*
 * Async completion handler for SAVE_VHCA_STATE. On success it publishes
 * the header and data buffers on the migration stream, updates chunk
 * accounting (possibly scheduling the next chunk's SAVE, or deferring
 * it until a ready chunk is consumed), advances the migf state and
 * wakes poll() waiters. On failure the heavier cleanup is deferred to
 * mlx5vf_mig_file_cleanup_cb via the ordered workqueue, since the
 * error flow cannot run from this (interrupt-like) context.
 */
static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5vf_async_data *async_data = container_of(context,
			struct mlx5vf_async_data, cb_work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
			struct mlx5_vf_migration_file, async_data);

	if (!status) {
		size_t next_required_umem_size = 0;
		bool stop_copy_last_chunk;
		size_t image_size;
		unsigned long flags;
		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
			!async_data->stop_copy_chunk;

		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
				      actual_image_size);
		if (async_data->buf->stop_copy_chunk_num)
			next_required_umem_size = MLX5_GET(save_vhca_state_out,
					async_data->out, next_required_umem_size);
		/* Last chunk: device reports no further data to save */
		stop_copy_last_chunk = async_data->stop_copy_chunk &&
				!next_required_umem_size;
		if (async_data->header_buf) {
			status = add_buf_header(async_data->header_buf, image_size,
						initial_pre_copy);
			if (status)
				goto err;
		}
		async_data->buf->length = image_size;
		async_data->buf->start_pos = migf->max_pos;
		migf->max_pos += async_data->buf->length;
		spin_lock_irqsave(&migf->list_lock, flags);
		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
		if (async_data->buf->stop_copy_chunk_num) {
			migf->num_ready_chunks++;
			if (next_required_umem_size &&
			    migf->num_ready_chunks >= MAX_NUM_CHUNKS) {
				/* Delay the next SAVE till one chunk be consumed */
				migf->next_required_umem_size = next_required_umem_size;
				next_required_umem_size = 0;
			}
		}
		spin_unlock_irqrestore(&migf->list_lock, flags);
		if (initial_pre_copy) {
			migf->pre_copy_initial_bytes += image_size;
			migf->state = MLX5_MIGF_STATE_PRE_COPY;
		}
		if (stop_copy_last_chunk)
			migf->state = MLX5_MIGF_STATE_COMPLETE;
		wake_up_interruptible(&migf->poll_wait);
		if (next_required_umem_size)
			mlx5vf_mig_file_set_save_work(migf,
				/* Picking up the next chunk num */
				(async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1,
				next_required_umem_size);
		mlx5vf_save_callback_complete(migf, async_data);
		return;
	}

err:
	/* The error flow can't run from an interrupt context */
	if (status == -EREMOTEIO) {
		/* Failed in FW, read the command-level status instead */
		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
		/* Failed in FW, print cmd out failure details */
		mlx5_cmd_out_err(migf->mvdev->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE, 0,
				 async_data->out);
	}

	async_data->status = status;
	queue_work(migf->mvdev->cb_wq, &async_data->work);
}
682
/*
 * Kick off an asynchronous SAVE_VHCA_STATE into @buf.
 *
 * @inc:   incremental save (dirty data since the previous save).
 * @track: also enable device dirty-tracking as part of the save.
 *
 * Serializes on migf->save_comp (released by the completion path) and
 * takes a file reference for the duration of the async command. A
 * header buffer is taken from the pre-allocated per-chunk set when
 * available, otherwise freshly obtained. Completion is handled in
 * mlx5vf_save_callback(). Caller must hold the state mutex.
 */
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf, bool inc,
			       bool track)
{
	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5vf_async_data *async_data;
	bool pre_copy_cleanup = false;
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/* Only one SAVE may be in flight at a time */
	err = wait_for_completion_interruptible(&migf->save_comp);
	if (err)
		return err;

	/* Non-tracking, non-incremental save while in PRE_COPY == cleanup */
	if ((migf->state == MLX5_MIGF_STATE_PRE_COPY ||
	     migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) && !track && !inc)
		pre_copy_cleanup = true;

	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
		/*
		 * In case we had a PRE_COPY error, SAVE is triggered only for
		 * the final image, read device full image.
		 */
		inc = false;

	MLX5_SET(save_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_SAVE_VHCA_STATE);
	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
	MLX5_SET(save_vhca_state_in, in, incremental, inc);
	MLX5_SET(save_vhca_state_in, in, set_track, track);

	async_data = &migf->async_data;
	async_data->buf = buf;
	async_data->stop_copy_chunk = (!track && !pre_copy_cleanup);
	async_data->out = kvzalloc(out_size, GFP_KERNEL);
	if (!async_data->out) {
		err = -ENOMEM;
		goto err_out;
	}

	if (async_data->stop_copy_chunk) {
		/* chunk numbers are 1-based; 0 means non-chunk mode */
		u8 header_idx = buf->stop_copy_chunk_num ?
			buf->stop_copy_chunk_num - 1 : 0;

		header_buf = migf->buf_header[header_idx];
		migf->buf_header[header_idx] = NULL;
	}

	if (!header_buf) {
		header_buf = mlx5vf_get_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(header_buf)) {
			err = PTR_ERR(header_buf);
			goto err_free;
		}
	}

	if (async_data->stop_copy_chunk)
		migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK;

	async_data->header_buf = header_buf;
	/* Reference dropped by mlx5vf_save_callback_complete() */
	get_file(migf->filp);
	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
			       async_data->out,
			       out_size, mlx5vf_save_callback,
			       &async_data->cb_work);
	if (err)
		goto err_exec;

	return 0;

err_exec:
	if (header_buf)
		mlx5vf_put_data_buffer(header_buf);
	fput(migf->filp);
err_free:
	kvfree(async_data->out);
err_out:
	complete(&migf->save_comp);
	return err;
}
773
/*
 * Synchronously load device state from @buf (length buf->length) via
 * LOAD_VHCA_STATE. Lazily DMA-maps the buffer on first use. Caller
 * must hold the state mutex; -ENOTCONN once the PF has detached.
 */
int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf)
{
	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	if (!buf->dmaed) {
		err = mlx5vf_dma_data_buffer(buf);
		if (err)
			return err;
	}

	MLX5_SET(load_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_LOAD_VHCA_STATE);
	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(load_vhca_state_in, in, size, buf->length);
	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
}
800
mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file * migf)801 int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
802 {
803 int err;
804
805 lockdep_assert_held(&migf->mvdev->state_mutex);
806 if (migf->mvdev->mdev_detach)
807 return -ENOTCONN;
808
809 err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
810 return err;
811 }
812
/*
 * Release the migration protection domain. Silently skipped when the
 * PF driver has detached (the PD is gone with the function anyway).
 */
void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
{
	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return;

	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
}
821
/*
 * Free every resource attached to a migration file: the per-chunk
 * data/header buffers, all buffers on the published and available
 * lists, and finally the protection domain. Caller must hold the state
 * mutex with the mdev still attached.
 */
void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
{
	struct mlx5_vhca_data_buffer *entry;
	int i;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	for (i = 0; i < MAX_NUM_CHUNKS; i++) {
		if (migf->buf[i]) {
			mlx5vf_free_data_buffer(migf->buf[i]);
			migf->buf[i] = NULL;
		}

		if (migf->buf_header[i]) {
			mlx5vf_free_data_buffer(migf->buf_header[i]);
			migf->buf_header[i] = NULL;
		}
	}

	/* Drain avail_list through buf_list so one loop frees everything */
	list_splice(&migf->avail_list, &migf->buf_list);

	while ((entry = list_first_entry_or_null(&migf->buf_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&entry->buf_elm);
		mlx5vf_free_data_buffer(entry);
	}

	mlx5vf_cmd_dealloc_pd(migf);
}
852
/*
 * Create the firmware page-tracker object for dirty-page logging.
 *
 * @ranges/@nnodes describe the IOVA ranges to track; if there are more
 * ranges than the device supports they are first combined down to the
 * device maximum. Each range is encoded into the command's trailing
 * track_range array, and the total tracked address-space size must fall
 * within the device's log_min/log_max limits. On success the new
 * object id is stored in tracker->id.
 */
static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
				 struct mlx5vf_pci_core_device *mvdev,
				 struct rb_root_cached *ranges, u32 nnodes)
{
	int max_num_range =
		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	struct interval_tree_node *node = NULL;
	u64 total_ranges_len = 0;
	u32 num_ranges = nnodes;
	u8 log_addr_space_size;
	void *range_list_ptr;
	void *obj_context;
	void *cmd_hdr;
	int inlen;
	void *in;
	int err;
	int i;

	if (num_ranges > max_num_range) {
		/* Merge adjacent/close ranges down to the device limit */
		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
		num_ranges = max_num_range;
	}

	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
		record_size * num_ranges;
	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
			       general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
	MLX5_SET(page_track, obj_context, track_type, 1);
	MLX5_SET(page_track, obj_context, log_page_size,
		 ilog2(tracker->host_qp->tracked_page_size));
	MLX5_SET(page_track, obj_context, log_msg_size,
		 ilog2(tracker->host_qp->max_msg_size));
	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);

	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	for (i = 0; i < num_ranges; i++) {
		void *addr_range_i_base = range_list_ptr + record_size * i;
		unsigned long length = node->last - node->start + 1;

		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
			   node->start);
		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
		total_ranges_len += length;
		node = interval_tree_iter_next(node, 0, ULONG_MAX);
	}

	/* After combining, exactly num_ranges nodes must have existed */
	WARN_ON(node);
	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
	if (log_addr_space_size <
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
	    log_addr_space_size >
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
		err = -EOPNOTSUPP;
		goto out;
	}

	MLX5_SET(page_track, obj_context, log_addr_space_size,
		 log_addr_space_size);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	if (err)
		goto out;

	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
out:
	kfree(in);
	return err;
}
935
/* Destroy the firmware page-tracker general object @tracker_id. */
static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
				      u32 tracker_id)
{
	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};

	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);

	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}
948
/*
 * Modify the page-tracker object: restrict reporting to the range
 * [@iova, @iova + @length) and set its state to @tracker_state.
 * modify_field_select 0x3 selects both the range and state fields.
 */
static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
				     u32 tracker_id, unsigned long iova,
				     unsigned long length, u32 tracker_state)
{
	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	void *obj_context;
	void *cmd_hdr;

	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);

	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
	MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
	MLX5_SET64(page_track, obj_context, range_start_address, iova);
	MLX5_SET64(page_track, obj_context, length, length);
	MLX5_SET(page_track, obj_context, state, tracker_state);

	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}
971
mlx5vf_cmd_query_tracker(struct mlx5_core_dev * mdev,struct mlx5_vhca_page_tracker * tracker)972 static int mlx5vf_cmd_query_tracker(struct mlx5_core_dev *mdev,
973 struct mlx5_vhca_page_tracker *tracker)
974 {
975 u32 out[MLX5_ST_SZ_DW(query_page_track_obj_out)] = {};
976 u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
977 void *obj_context;
978 void *cmd_hdr;
979 int err;
980
981 cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
982 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
983 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
984 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker->id);
985
986 err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
987 if (err)
988 return err;
989
990 obj_context = MLX5_ADDR_OF(query_page_track_obj_out, out, obj_context);
991 tracker->status = MLX5_GET(page_track, obj_context, state);
992 return 0;
993 }
994
alloc_cq_frag_buf(struct mlx5_core_dev * mdev,struct mlx5_vhca_cq_buf * buf,int nent,int cqe_size)995 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
996 struct mlx5_vhca_cq_buf *buf, int nent,
997 int cqe_size)
998 {
999 struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
1000 u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
1001 u8 log_wq_sz = ilog2(cqe_size);
1002 int err;
1003
1004 err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
1005 mdev->priv.numa_node);
1006 if (err)
1007 return err;
1008
1009 mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
1010 buf->cqe_size = cqe_size;
1011 buf->nent = nent;
1012 return 0;
1013 }
1014
init_cq_frag_buf(struct mlx5_vhca_cq_buf * buf)1015 static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
1016 {
1017 struct mlx5_cqe64 *cqe64;
1018 void *cqe;
1019 int i;
1020
1021 for (i = 0; i < buf->nent; i++) {
1022 cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
1023 cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
1024 cqe64->op_own = MLX5_CQE_INVALID << 4;
1025 }
1026 }
1027
mlx5vf_destroy_cq(struct mlx5_core_dev * mdev,struct mlx5_vhca_cq * cq)1028 static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
1029 struct mlx5_vhca_cq *cq)
1030 {
1031 mlx5_core_destroy_cq(mdev, &cq->mcq);
1032 mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
1033 mlx5_db_free(mdev, &cq->db);
1034 }
1035
mlx5vf_cq_event(struct mlx5_core_cq * mcq,enum mlx5_event type)1036 static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
1037 {
1038 if (type != MLX5_EVENT_TYPE_CQ_ERROR)
1039 return;
1040
1041 set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
1042 tracker.cq.mcq));
1043 }
1044
mlx5vf_event_notifier(struct notifier_block * nb,unsigned long type,void * data)1045 static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
1046 void *data)
1047 {
1048 struct mlx5_vhca_page_tracker *tracker =
1049 mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
1050 struct mlx5vf_pci_core_device *mvdev = container_of(
1051 tracker, struct mlx5vf_pci_core_device, tracker);
1052 struct mlx5_eqe_obj_change *object;
1053 struct mlx5_eqe *eqe = data;
1054 u8 event_type = (u8)type;
1055 u8 queue_type;
1056 u32 obj_id;
1057 int qp_num;
1058
1059 switch (event_type) {
1060 case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
1061 case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
1062 case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
1063 queue_type = eqe->data.qp_srq.type;
1064 if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
1065 break;
1066 qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
1067 if (qp_num != tracker->host_qp->qpn &&
1068 qp_num != tracker->fw_qp->qpn)
1069 break;
1070 set_tracker_error(mvdev);
1071 break;
1072 case MLX5_EVENT_TYPE_OBJECT_CHANGE:
1073 object = &eqe->data.obj_change;
1074 obj_id = be32_to_cpu(object->obj_id);
1075 if (obj_id == tracker->id)
1076 set_tracker_change_event(mvdev);
1077 break;
1078 default:
1079 break;
1080 }
1081
1082 return NOTIFY_OK;
1083 }
1084
mlx5vf_cq_complete(struct mlx5_core_cq * mcq,struct mlx5_eqe * eqe)1085 static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
1086 struct mlx5_eqe *eqe)
1087 {
1088 struct mlx5vf_pci_core_device *mvdev =
1089 container_of(mcq, struct mlx5vf_pci_core_device,
1090 tracker.cq.mcq);
1091
1092 complete(&mvdev->tracker_comp);
1093 }
1094
mlx5vf_create_cq(struct mlx5_core_dev * mdev,struct mlx5_vhca_page_tracker * tracker,size_t ncqe)1095 static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
1096 struct mlx5_vhca_page_tracker *tracker,
1097 size_t ncqe)
1098 {
1099 int cqe_size = cache_line_size() == 128 ? 128 : 64;
1100 u32 out[MLX5_ST_SZ_DW(create_cq_out)];
1101 struct mlx5_vhca_cq *cq;
1102 int inlen, err, eqn;
1103 void *cqc, *in;
1104 __be64 *pas;
1105 int vector;
1106
1107 cq = &tracker->cq;
1108 ncqe = roundup_pow_of_two(ncqe);
1109 err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
1110 if (err)
1111 return err;
1112
1113 cq->ncqe = ncqe;
1114 cq->mcq.set_ci_db = cq->db.db;
1115 cq->mcq.arm_db = cq->db.db + 1;
1116 cq->mcq.cqe_sz = cqe_size;
1117 err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
1118 if (err)
1119 goto err_db_free;
1120
1121 init_cq_frag_buf(&cq->buf);
1122 inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
1123 MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
1124 cq->buf.frag_buf.npages;
1125 in = kvzalloc(inlen, GFP_KERNEL);
1126 if (!in) {
1127 err = -ENOMEM;
1128 goto err_buff;
1129 }
1130
1131 vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
1132 err = mlx5_comp_eqn_get(mdev, vector, &eqn);
1133 if (err)
1134 goto err_vec;
1135
1136 cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
1137 MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
1138 MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
1139 MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
1140 MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
1141 MLX5_ADAPTER_PAGE_SHIFT);
1142 MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
1143 pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
1144 mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
1145 cq->mcq.comp = mlx5vf_cq_complete;
1146 cq->mcq.event = mlx5vf_cq_event;
1147 err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
1148 if (err)
1149 goto err_vec;
1150
1151 mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1152 cq->mcq.cons_index);
1153 kvfree(in);
1154 return 0;
1155
1156 err_vec:
1157 kvfree(in);
1158 err_buff:
1159 mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
1160 err_db_free:
1161 mlx5_db_free(mdev, &cq->db);
1162 return err;
1163 }
1164
/*
 * Create an RC QP for the page tracker.
 * @max_recv_wr > 0 creates the host-side QP with a receive queue sized
 * for that many WQEs; @max_recv_wr == 0 creates the FW-side peer with a
 * zero-length RQ. Returns the new QP or an ERR_PTR on failure; the
 * caller frees it with mlx5vf_destroy_qp().
 */
static struct mlx5_vhca_qp *
mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
{
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	struct mlx5_vhca_qp *qp;
	u8 log_rq_stride;
	u8 log_rq_sz;
	void *qpc;
	int inlen;
	void *in;
	int err;

	qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
	if (!qp)
		return ERR_PTR(-ENOMEM);

	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
	if (err)
		goto err_free;

	if (max_recv_wr) {
		/* Host QP: allocate a real RQ sized to a power of two. */
		qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
		log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
		log_rq_sz = ilog2(qp->rq.wqe_cnt);
		err = mlx5_frag_buf_alloc_node(mdev,
			wq_get_byte_sz(log_rq_sz, log_rq_stride),
			&qp->buf, mdev->priv.numa_node);
		if (err)
			goto err_db_free;
		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
	}

	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
	/* One PAS entry per backing page (npages is 0 for the FW QP). */
	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		qp->buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		/* err_in also frees the frag buf allocated above */
		goto err_in;
	}

	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, pd, tracker->pdn);
	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	/* Neither QP ever posts sends from the host side. */
	MLX5_SET(qpc, qpc, no_sq, 1);
	if (max_recv_wr) {
		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
		mlx5_fill_page_frag_array(&qp->buf,
					  (__be64 *)MLX5_ADDR_OF(create_qp_in,
								 in, pas));
	} else {
		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
	}

	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	kvfree(in);
	if (err)
		goto err_in;

	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
	return qp;

err_in:
	if (max_recv_wr)
		mlx5_frag_buf_free(mdev, &qp->buf);
err_db_free:
	mlx5_db_free(mdev, &qp->db);
err_free:
	kfree(qp);
	return ERR_PTR(err);
}
1250
mlx5vf_post_recv(struct mlx5_vhca_qp * qp)1251 static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
1252 {
1253 struct mlx5_wqe_data_seg *data;
1254 unsigned int ix;
1255
1256 WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
1257 ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
1258 data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
1259 data->byte_count = cpu_to_be32(qp->max_msg_size);
1260 data->lkey = cpu_to_be32(qp->recv_buf.mkey);
1261 data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
1262 qp->rq.pc++;
1263 /* Make sure that descriptors are written before doorbell record. */
1264 dma_wmb();
1265 *qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
1266 }
1267
mlx5vf_activate_qp(struct mlx5_core_dev * mdev,struct mlx5_vhca_qp * qp,u32 remote_qpn,bool host_qp)1268 static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
1269 struct mlx5_vhca_qp *qp, u32 remote_qpn,
1270 bool host_qp)
1271 {
1272 u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
1273 u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
1274 u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
1275 void *qpc;
1276 int ret;
1277
1278 /* Init */
1279 qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
1280 MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1281 MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
1282 MLX5_SET(qpc, qpc, rre, 1);
1283 MLX5_SET(qpc, qpc, rwe, 1);
1284 MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
1285 MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
1286 ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
1287 if (ret)
1288 return ret;
1289
1290 if (host_qp) {
1291 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1292 int i;
1293
1294 for (i = 0; i < qp->rq.wqe_cnt; i++) {
1295 mlx5vf_post_recv(qp);
1296 recv_buf->next_rq_offset += qp->max_msg_size;
1297 }
1298 }
1299
1300 /* RTR */
1301 qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
1302 MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1303 MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
1304 MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
1305 MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
1306 MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1307 MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
1308 MLX5_SET(qpc, qpc, min_rnr_nak, 1);
1309 MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
1310 MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1311 ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
1312 if (ret || host_qp)
1313 return ret;
1314
1315 /* RTS */
1316 qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
1317 MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1318 MLX5_SET(qpc, qpc, retry_count, 7);
1319 MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
1320 MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
1321 MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
1322 MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1323
1324 return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
1325 }
1326
mlx5vf_destroy_qp(struct mlx5_core_dev * mdev,struct mlx5_vhca_qp * qp)1327 static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
1328 struct mlx5_vhca_qp *qp)
1329 {
1330 u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
1331
1332 MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
1333 MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
1334 mlx5_cmd_exec_in(mdev, destroy_qp, in);
1335
1336 mlx5_frag_buf_free(mdev, &qp->buf);
1337 mlx5_db_free(mdev, &qp->db);
1338 kfree(qp);
1339 }
1340
free_recv_pages(struct mlx5_vhca_recv_buf * recv_buf)1341 static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
1342 {
1343 int i;
1344
1345 /* Undo alloc_pages_bulk_array() */
1346 for (i = 0; i < recv_buf->npages; i++)
1347 __free_page(recv_buf->page_list[i]);
1348
1349 kvfree(recv_buf->page_list);
1350 }
1351
alloc_recv_pages(struct mlx5_vhca_recv_buf * recv_buf,unsigned int npages)1352 static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
1353 unsigned int npages)
1354 {
1355 unsigned int filled = 0, done = 0;
1356 int i;
1357
1358 recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
1359 GFP_KERNEL_ACCOUNT);
1360 if (!recv_buf->page_list)
1361 return -ENOMEM;
1362
1363 for (;;) {
1364 filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
1365 npages - done,
1366 recv_buf->page_list + done);
1367 if (!filled)
1368 goto err;
1369
1370 done += filled;
1371 if (done == npages)
1372 break;
1373 }
1374
1375 recv_buf->npages = npages;
1376 return 0;
1377
1378 err:
1379 for (i = 0; i < npages; i++) {
1380 if (recv_buf->page_list[i])
1381 __free_page(recv_buf->page_list[i]);
1382 }
1383
1384 kvfree(recv_buf->page_list);
1385 return -ENOMEM;
1386 }
1387
register_dma_recv_pages(struct mlx5_core_dev * mdev,struct mlx5_vhca_recv_buf * recv_buf)1388 static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
1389 struct mlx5_vhca_recv_buf *recv_buf)
1390 {
1391 int i, j;
1392
1393 recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
1394 sizeof(*recv_buf->dma_addrs),
1395 GFP_KERNEL_ACCOUNT);
1396 if (!recv_buf->dma_addrs)
1397 return -ENOMEM;
1398
1399 for (i = 0; i < recv_buf->npages; i++) {
1400 recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
1401 recv_buf->page_list[i],
1402 0, PAGE_SIZE,
1403 DMA_FROM_DEVICE);
1404 if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
1405 goto error;
1406 }
1407 return 0;
1408
1409 error:
1410 for (j = 0; j < i; j++)
1411 dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
1412 PAGE_SIZE, DMA_FROM_DEVICE);
1413
1414 kvfree(recv_buf->dma_addrs);
1415 return -ENOMEM;
1416 }
1417
unregister_dma_recv_pages(struct mlx5_core_dev * mdev,struct mlx5_vhca_recv_buf * recv_buf)1418 static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
1419 struct mlx5_vhca_recv_buf *recv_buf)
1420 {
1421 int i;
1422
1423 for (i = 0; i < recv_buf->npages; i++)
1424 dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
1425 PAGE_SIZE, DMA_FROM_DEVICE);
1426
1427 kvfree(recv_buf->dma_addrs);
1428 }
1429
mlx5vf_free_qp_recv_resources(struct mlx5_core_dev * mdev,struct mlx5_vhca_qp * qp)1430 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
1431 struct mlx5_vhca_qp *qp)
1432 {
1433 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1434
1435 mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
1436 unregister_dma_recv_pages(mdev, recv_buf);
1437 free_recv_pages(&qp->recv_buf);
1438 }
1439
mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev * mdev,struct mlx5_vhca_qp * qp,u32 pdn,u64 rq_size)1440 static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
1441 struct mlx5_vhca_qp *qp, u32 pdn,
1442 u64 rq_size)
1443 {
1444 unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
1445 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1446 int err;
1447
1448 err = alloc_recv_pages(recv_buf, npages);
1449 if (err < 0)
1450 return err;
1451
1452 err = register_dma_recv_pages(mdev, recv_buf);
1453 if (err)
1454 goto end;
1455
1456 err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
1457 if (err)
1458 goto err_create_mkey;
1459
1460 return 0;
1461
1462 err_create_mkey:
1463 unregister_dma_recv_pages(mdev, recv_buf);
1464 end:
1465 free_recv_pages(recv_buf);
1466 return err;
1467 }
1468
/*
 * Tear down every page-tracker resource in reverse creation order and
 * mark logging inactive. Caller must hold the state mutex; this is a
 * no-op when tracking was never started.
 */
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_core_dev *mdev = mvdev->mdev;

	lockdep_assert_held(&mvdev->state_mutex);

	if (!mvdev->log_active)
		return;

	/* Resources must not be freed after the parent mdev detached. */
	WARN_ON(mvdev->mdev_detach);

	mlx5_eq_notifier_unregister(mdev, &tracker->nb);
	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
	mlx5vf_destroy_qp(mdev, tracker->host_qp);
	mlx5vf_destroy_cq(mdev, &tracker->cq);
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
	mlx5_put_uars_page(mdev, tracker->uar);
	mvdev->log_active = false;
}
1492
mlx5vf_stop_page_tracker(struct vfio_device * vdev)1493 int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
1494 {
1495 struct mlx5vf_pci_core_device *mvdev = container_of(
1496 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1497
1498 mutex_lock(&mvdev->state_mutex);
1499 if (!mvdev->log_active)
1500 goto end;
1501
1502 _mlx5vf_free_page_tracker_resources(mvdev);
1503 mvdev->log_active = false;
1504 end:
1505 mlx5vf_state_mutex_unlock(mvdev);
1506 return 0;
1507 }
1508
mlx5vf_start_page_tracker(struct vfio_device * vdev,struct rb_root_cached * ranges,u32 nnodes,u64 * page_size)1509 int mlx5vf_start_page_tracker(struct vfio_device *vdev,
1510 struct rb_root_cached *ranges, u32 nnodes,
1511 u64 *page_size)
1512 {
1513 struct mlx5vf_pci_core_device *mvdev = container_of(
1514 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1515 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1516 u8 log_tracked_page = ilog2(*page_size);
1517 struct mlx5_vhca_qp *host_qp;
1518 struct mlx5_vhca_qp *fw_qp;
1519 struct mlx5_core_dev *mdev;
1520 u32 log_max_msg_size;
1521 u32 max_msg_size;
1522 u64 rq_size = SZ_2M;
1523 u32 max_recv_wr;
1524 int err;
1525
1526 mutex_lock(&mvdev->state_mutex);
1527 if (mvdev->mdev_detach) {
1528 err = -ENOTCONN;
1529 goto end;
1530 }
1531
1532 if (mvdev->log_active) {
1533 err = -EINVAL;
1534 goto end;
1535 }
1536
1537 mdev = mvdev->mdev;
1538 log_max_msg_size = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_msg_size);
1539 max_msg_size = (1ULL << log_max_msg_size);
1540 /* The RQ must hold at least 4 WQEs/messages for successful QP creation */
1541 if (rq_size < 4 * max_msg_size)
1542 rq_size = 4 * max_msg_size;
1543
1544 memset(tracker, 0, sizeof(*tracker));
1545 tracker->uar = mlx5_get_uars_page(mdev);
1546 if (IS_ERR(tracker->uar)) {
1547 err = PTR_ERR(tracker->uar);
1548 goto end;
1549 }
1550
1551 err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
1552 if (err)
1553 goto err_uar;
1554
1555 max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
1556 err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
1557 if (err)
1558 goto err_dealloc_pd;
1559
1560 host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
1561 if (IS_ERR(host_qp)) {
1562 err = PTR_ERR(host_qp);
1563 goto err_cq;
1564 }
1565
1566 host_qp->max_msg_size = max_msg_size;
1567 if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1568 pg_track_log_min_page_size)) {
1569 log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1570 pg_track_log_min_page_size);
1571 } else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1572 pg_track_log_max_page_size)) {
1573 log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1574 pg_track_log_max_page_size);
1575 }
1576
1577 host_qp->tracked_page_size = (1ULL << log_tracked_page);
1578 err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
1579 rq_size);
1580 if (err)
1581 goto err_host_qp;
1582
1583 fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
1584 if (IS_ERR(fw_qp)) {
1585 err = PTR_ERR(fw_qp);
1586 goto err_recv_resources;
1587 }
1588
1589 err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
1590 if (err)
1591 goto err_activate;
1592
1593 err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
1594 if (err)
1595 goto err_activate;
1596
1597 tracker->host_qp = host_qp;
1598 tracker->fw_qp = fw_qp;
1599 err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
1600 if (err)
1601 goto err_activate;
1602
1603 MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
1604 mlx5_eq_notifier_register(mdev, &tracker->nb);
1605 *page_size = host_qp->tracked_page_size;
1606 mvdev->log_active = true;
1607 mlx5vf_state_mutex_unlock(mvdev);
1608 return 0;
1609
1610 err_activate:
1611 mlx5vf_destroy_qp(mdev, fw_qp);
1612 err_recv_resources:
1613 mlx5vf_free_qp_recv_resources(mdev, host_qp);
1614 err_host_qp:
1615 mlx5vf_destroy_qp(mdev, host_qp);
1616 err_cq:
1617 mlx5vf_destroy_cq(mdev, &tracker->cq);
1618 err_dealloc_pd:
1619 mlx5_core_dealloc_pd(mdev, tracker->pdn);
1620 err_uar:
1621 mlx5_put_uars_page(mdev, tracker->uar);
1622 end:
1623 mlx5vf_state_mutex_unlock(mvdev);
1624 return err;
1625 }
1626
1627 static void
set_report_output(u32 size,int index,struct mlx5_vhca_qp * qp,struct iova_bitmap * dirty)1628 set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
1629 struct iova_bitmap *dirty)
1630 {
1631 u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
1632 u32 nent = size / entry_size;
1633 u32 nent_in_page;
1634 u32 nent_to_set;
1635 struct page *page;
1636 u32 page_offset;
1637 u32 page_index;
1638 u32 buf_offset;
1639 void *kaddr;
1640 u64 addr;
1641 u64 *buf;
1642 int i;
1643
1644 buf_offset = index * qp->max_msg_size;
1645 if (WARN_ON(buf_offset + size >= qp->recv_buf.npages * PAGE_SIZE ||
1646 (nent > qp->max_msg_size / entry_size)))
1647 return;
1648
1649 do {
1650 page_index = buf_offset / PAGE_SIZE;
1651 page_offset = buf_offset % PAGE_SIZE;
1652 nent_in_page = (PAGE_SIZE - page_offset) / entry_size;
1653 page = qp->recv_buf.page_list[page_index];
1654 kaddr = kmap_local_page(page);
1655 buf = kaddr + page_offset;
1656 nent_to_set = min(nent, nent_in_page);
1657 for (i = 0; i < nent_to_set; i++) {
1658 addr = MLX5_GET(page_track_report_entry, buf + i,
1659 dirty_address_low);
1660 addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
1661 dirty_address_high) << 32;
1662 iova_bitmap_set(dirty, addr, qp->tracked_page_size);
1663 }
1664 kunmap_local(kaddr);
1665 buf_offset += (nent_to_set * entry_size);
1666 nent -= nent_to_set;
1667 } while (nent);
1668 }
1669
/*
 * Consume one receive completion: decode the tracker status from the
 * immediate data, feed the report into the dirty bitmap, then repost the
 * WQE so the slot can receive the next report.
 */
static void
mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
	      struct iova_bitmap *dirty, int *tracker_status)
{
	u32 size;
	int ix;

	qp->rq.cc++;
	/* Tracker state travels in the top nibble of the immediate data. */
	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
	size = be32_to_cpu(cqe->byte_cnt);
	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);

	/* zero length CQE, no data */
	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
	if (size)
		set_report_output(size, ix, qp, dirty);

	/* Repost the same buffer slot for the next report. */
	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
	mlx5vf_post_recv(qp);
}
1690
get_cqe(struct mlx5_vhca_cq * cq,int n)1691 static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
1692 {
1693 return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
1694 }
1695
get_sw_cqe(struct mlx5_vhca_cq * cq,int n)1696 static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
1697 {
1698 void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
1699 struct mlx5_cqe64 *cqe64;
1700
1701 cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
1702
1703 if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
1704 !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
1705 return cqe64;
1706 } else {
1707 return NULL;
1708 }
1709 }
1710
1711 static int
mlx5vf_cq_poll_one(struct mlx5_vhca_cq * cq,struct mlx5_vhca_qp * qp,struct iova_bitmap * dirty,int * tracker_status)1712 mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
1713 struct iova_bitmap *dirty, int *tracker_status)
1714 {
1715 struct mlx5_cqe64 *cqe;
1716 u8 opcode;
1717
1718 cqe = get_sw_cqe(cq, cq->mcq.cons_index);
1719 if (!cqe)
1720 return CQ_EMPTY;
1721
1722 ++cq->mcq.cons_index;
1723 /*
1724 * Make sure we read CQ entry contents after we've checked the
1725 * ownership bit.
1726 */
1727 rmb();
1728 opcode = get_cqe_opcode(cqe);
1729 switch (opcode) {
1730 case MLX5_CQE_RESP_SEND_IMM:
1731 mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
1732 return CQ_OK;
1733 default:
1734 return CQ_POLL_ERR;
1735 }
1736 }
1737
mlx5vf_tracker_read_and_clear(struct vfio_device * vdev,unsigned long iova,unsigned long length,struct iova_bitmap * dirty)1738 int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
1739 unsigned long length,
1740 struct iova_bitmap *dirty)
1741 {
1742 struct mlx5vf_pci_core_device *mvdev = container_of(
1743 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1744 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1745 struct mlx5_vhca_cq *cq = &tracker->cq;
1746 struct mlx5_core_dev *mdev;
1747 int poll_err, err;
1748
1749 mutex_lock(&mvdev->state_mutex);
1750 if (!mvdev->log_active) {
1751 err = -EINVAL;
1752 goto end;
1753 }
1754
1755 if (mvdev->mdev_detach) {
1756 err = -ENOTCONN;
1757 goto end;
1758 }
1759
1760 if (tracker->is_err) {
1761 err = -EIO;
1762 goto end;
1763 }
1764
1765 mdev = mvdev->mdev;
1766 err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
1767 MLX5_PAGE_TRACK_STATE_REPORTING);
1768 if (err)
1769 goto end;
1770
1771 tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
1772 while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
1773 !tracker->is_err) {
1774 poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
1775 &tracker->status);
1776 if (poll_err == CQ_EMPTY) {
1777 mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1778 cq->mcq.cons_index);
1779 poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
1780 dirty, &tracker->status);
1781 if (poll_err == CQ_EMPTY) {
1782 wait_for_completion(&mvdev->tracker_comp);
1783 if (tracker->object_changed) {
1784 tracker->object_changed = false;
1785 err = mlx5vf_cmd_query_tracker(mdev, tracker);
1786 if (err)
1787 goto end;
1788 }
1789 continue;
1790 }
1791 }
1792 if (poll_err == CQ_POLL_ERR) {
1793 err = -EIO;
1794 goto end;
1795 }
1796 mlx5_cq_set_ci(&cq->mcq);
1797 }
1798
1799 if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
1800 tracker->is_err = true;
1801
1802 if (tracker->is_err)
1803 err = -EIO;
1804 end:
1805 mlx5vf_state_mutex_unlock(mvdev);
1806 return err;
1807 }
1808