xref: /linux/drivers/vfio/pci/mlx5/cmd.c (revision c5dbf04160005e07e8ca7232a7faa77ab1547ae0)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5 
6 #include "cmd.h"
7 
/*
 * Status codes with fixed numeric values. Judging by the names these are
 * CQ polling results; the code that uses them is not part of this excerpt.
 */
enum {
	CQ_OK = 0,
	CQ_EMPTY = -1,
	CQ_POLL_ERR = -2,
};
9 
10 static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
11 {
12 	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
13 	void *query_cap = NULL, *cap;
14 	int ret;
15 
16 	query_cap = kzalloc(query_sz, GFP_KERNEL);
17 	if (!query_cap)
18 		return -ENOMEM;
19 
20 	ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
21 					    MLX5_CAP_GENERAL_2);
22 	if (ret)
23 		goto out;
24 
25 	cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
26 	if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
27 		ret = -EOPNOTSUPP;
28 out:
29 	kfree(query_cap);
30 	return ret;
31 }
32 
33 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
34 				  u16 *vhca_id);
35 static void
36 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
37 
38 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
39 {
40 	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
41 	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
42 	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
43 	int err;
44 
45 	lockdep_assert_held(&mvdev->state_mutex);
46 	if (mvdev->mdev_detach)
47 		return -ENOTCONN;
48 
49 	/*
50 	 * In case PRE_COPY is used, saving_migf is exposed while the device is
51 	 * running. Make sure to run only once there is no active save command.
52 	 * Running both in parallel, might end-up with a failure in the save
53 	 * command once it will try to turn on 'tracking' on a suspended device.
54 	 */
55 	if (migf) {
56 		err = wait_for_completion_interruptible(&migf->save_comp);
57 		if (err)
58 			return err;
59 	}
60 
61 	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
62 	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
63 	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
64 
65 	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
66 	if (migf)
67 		complete(&migf->save_comp);
68 
69 	return err;
70 }
71 
72 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
73 {
74 	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
75 	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
76 
77 	lockdep_assert_held(&mvdev->state_mutex);
78 	if (mvdev->mdev_detach)
79 		return -ENOTCONN;
80 
81 	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
82 	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
83 	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);
84 
85 	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
86 }
87 
88 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
89 					  size_t *state_size, u64 *total_size,
90 					  u8 query_flags)
91 {
92 	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
93 	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
94 	bool inc = query_flags & MLX5VF_QUERY_INC;
95 	int ret;
96 
97 	lockdep_assert_held(&mvdev->state_mutex);
98 	if (mvdev->mdev_detach)
99 		return -ENOTCONN;
100 
101 	/*
102 	 * In case PRE_COPY is used, saving_migf is exposed while device is
103 	 * running. Make sure to run only once there is no active save command.
104 	 * Running both in parallel, might end-up with a failure in the
105 	 * incremental query command on un-tracked vhca.
106 	 */
107 	if (inc) {
108 		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
109 		if (ret)
110 			return ret;
111 		if (mvdev->saving_migf->state ==
112 		    MLX5_MIGF_STATE_PRE_COPY_ERROR) {
113 			/*
114 			 * In case we had a PRE_COPY error, only query full
115 			 * image for final image
116 			 */
117 			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
118 				*state_size = 0;
119 				complete(&mvdev->saving_migf->save_comp);
120 				return 0;
121 			}
122 			query_flags &= ~MLX5VF_QUERY_INC;
123 		}
124 	}
125 
126 	MLX5_SET(query_vhca_migration_state_in, in, opcode,
127 		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
128 	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
129 	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
130 	MLX5_SET(query_vhca_migration_state_in, in, incremental,
131 		 query_flags & MLX5VF_QUERY_INC);
132 	MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode);
133 
134 	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
135 				  out);
136 	if (inc)
137 		complete(&mvdev->saving_migf->save_comp);
138 
139 	if (ret)
140 		return ret;
141 
142 	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
143 			       required_umem_size);
144 	if (total_size)
145 		*total_size = mvdev->chunk_mode ?
146 			MLX5_GET64(query_vhca_migration_state_out, out,
147 				   remaining_total_size) : *state_size;
148 
149 	return 0;
150 }
151 
152 static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
153 {
154 	/* Mark the tracker under an error and wake it up if it's running */
155 	mvdev->tracker.is_err = true;
156 	complete(&mvdev->tracker_comp);
157 }
158 
159 static int mlx5fv_vf_event(struct notifier_block *nb,
160 			   unsigned long event, void *data)
161 {
162 	struct mlx5vf_pci_core_device *mvdev =
163 		container_of(nb, struct mlx5vf_pci_core_device, nb);
164 
165 	switch (event) {
166 	case MLX5_PF_NOTIFY_ENABLE_VF:
167 		mutex_lock(&mvdev->state_mutex);
168 		mvdev->mdev_detach = false;
169 		mlx5vf_state_mutex_unlock(mvdev);
170 		break;
171 	case MLX5_PF_NOTIFY_DISABLE_VF:
172 		mlx5vf_cmd_close_migratable(mvdev);
173 		mutex_lock(&mvdev->state_mutex);
174 		mvdev->mdev_detach = true;
175 		mlx5vf_state_mutex_unlock(mvdev);
176 		break;
177 	default:
178 		break;
179 	}
180 
181 	return 0;
182 }
183 
184 void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
185 {
186 	if (!mvdev->migrate_cap)
187 		return;
188 
189 	/* Must be done outside the lock to let it progress */
190 	set_tracker_error(mvdev);
191 	mutex_lock(&mvdev->state_mutex);
192 	mlx5vf_disable_fds(mvdev);
193 	_mlx5vf_free_page_tracker_resources(mvdev);
194 	mlx5vf_state_mutex_unlock(mvdev);
195 }
196 
197 void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
198 {
199 	if (!mvdev->migrate_cap)
200 		return;
201 
202 	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
203 						&mvdev->nb);
204 	destroy_workqueue(mvdev->cb_wq);
205 }
206 
207 void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
208 			       const struct vfio_migration_ops *mig_ops,
209 			       const struct vfio_log_ops *log_ops)
210 {
211 	struct pci_dev *pdev = mvdev->core_device.pdev;
212 	int ret;
213 
214 	if (!pdev->is_virtfn)
215 		return;
216 
217 	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
218 	if (!mvdev->mdev)
219 		return;
220 
221 	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
222 		goto end;
223 
224 	mvdev->vf_id = pci_iov_vf_id(pdev);
225 	if (mvdev->vf_id < 0)
226 		goto end;
227 
228 	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
229 	if (ret)
230 		goto end;
231 
232 	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
233 				   &mvdev->vhca_id))
234 		goto end;
235 
236 	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
237 	if (!mvdev->cb_wq)
238 		goto end;
239 
240 	mutex_init(&mvdev->state_mutex);
241 	spin_lock_init(&mvdev->reset_lock);
242 	mvdev->nb.notifier_call = mlx5fv_vf_event;
243 	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
244 						    &mvdev->nb);
245 	if (ret) {
246 		destroy_workqueue(mvdev->cb_wq);
247 		goto end;
248 	}
249 
250 	mvdev->migrate_cap = 1;
251 	mvdev->core_device.vdev.migration_flags =
252 		VFIO_MIGRATION_STOP_COPY |
253 		VFIO_MIGRATION_P2P;
254 	mvdev->core_device.vdev.mig_ops = mig_ops;
255 	init_completion(&mvdev->tracker_comp);
256 	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
257 		mvdev->core_device.vdev.log_ops = log_ops;
258 
259 	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
260 	    MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
261 		mvdev->core_device.vdev.migration_flags |=
262 			VFIO_MIGRATION_PRE_COPY;
263 
264 	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
265 		mvdev->chunk_mode = 1;
266 
267 end:
268 	mlx5_vf_put_core_dev(mvdev->mdev);
269 }
270 
271 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
272 				  u16 *vhca_id)
273 {
274 	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
275 	int out_size;
276 	void *out;
277 	int ret;
278 
279 	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
280 	out = kzalloc(out_size, GFP_KERNEL);
281 	if (!out)
282 		return -ENOMEM;
283 
284 	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
285 	MLX5_SET(query_hca_cap_in, in, other_function, 1);
286 	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
287 	MLX5_SET(query_hca_cap_in, in, op_mod,
288 		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
289 		 HCA_CAP_OPMOD_GET_CUR);
290 
291 	ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
292 	if (ret)
293 		goto err_exec;
294 
295 	*vhca_id = MLX5_GET(query_hca_cap_out, out,
296 			    capability.cmd_hca_cap.vhca_id);
297 
298 err_exec:
299 	kfree(out);
300 	return ret;
301 }
302 
303 static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
304 			struct mlx5_vhca_data_buffer *buf,
305 			struct mlx5_vhca_recv_buf *recv_buf,
306 			u32 *mkey)
307 {
308 	size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
309 				recv_buf->npages;
310 	int err = 0, inlen;
311 	__be64 *mtt;
312 	void *mkc;
313 	u32 *in;
314 
315 	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
316 		sizeof(*mtt) * round_up(npages, 2);
317 
318 	in = kvzalloc(inlen, GFP_KERNEL);
319 	if (!in)
320 		return -ENOMEM;
321 
322 	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
323 		 DIV_ROUND_UP(npages, 2));
324 	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
325 
326 	if (buf) {
327 		struct sg_dma_page_iter dma_iter;
328 
329 		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
330 			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
331 	} else {
332 		int i;
333 
334 		for (i = 0; i < npages; i++)
335 			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
336 	}
337 
338 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
339 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
340 	MLX5_SET(mkc, mkc, lr, 1);
341 	MLX5_SET(mkc, mkc, lw, 1);
342 	MLX5_SET(mkc, mkc, rr, 1);
343 	MLX5_SET(mkc, mkc, rw, 1);
344 	MLX5_SET(mkc, mkc, pd, pdn);
345 	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
346 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
347 	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
348 	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
349 	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
350 	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
351 	kvfree(in);
352 	return err;
353 }
354 
355 static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
356 {
357 	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
358 	struct mlx5_core_dev *mdev = mvdev->mdev;
359 	int ret;
360 
361 	lockdep_assert_held(&mvdev->state_mutex);
362 	if (mvdev->mdev_detach)
363 		return -ENOTCONN;
364 
365 	if (buf->dmaed || !buf->allocated_length)
366 		return -EINVAL;
367 
368 	ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
369 	if (ret)
370 		return ret;
371 
372 	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
373 	if (ret)
374 		goto err;
375 
376 	buf->dmaed = true;
377 
378 	return 0;
379 err:
380 	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
381 	return ret;
382 }
383 
384 void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
385 {
386 	struct mlx5_vf_migration_file *migf = buf->migf;
387 	struct sg_page_iter sg_iter;
388 
389 	lockdep_assert_held(&migf->mvdev->state_mutex);
390 	WARN_ON(migf->mvdev->mdev_detach);
391 
392 	if (buf->dmaed) {
393 		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
394 		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
395 				  buf->dma_dir, 0);
396 	}
397 
398 	/* Undo alloc_pages_bulk_array() */
399 	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
400 		__free_page(sg_page_iter_page(&sg_iter));
401 	sg_free_append_table(&buf->table);
402 	kfree(buf);
403 }
404 
405 struct mlx5_vhca_data_buffer *
406 mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
407 			 size_t length,
408 			 enum dma_data_direction dma_dir)
409 {
410 	struct mlx5_vhca_data_buffer *buf;
411 	int ret;
412 
413 	buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
414 	if (!buf)
415 		return ERR_PTR(-ENOMEM);
416 
417 	buf->dma_dir = dma_dir;
418 	buf->migf = migf;
419 	if (length) {
420 		ret = mlx5vf_add_migration_pages(buf,
421 				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
422 		if (ret)
423 			goto end;
424 
425 		if (dma_dir != DMA_NONE) {
426 			ret = mlx5vf_dma_data_buffer(buf);
427 			if (ret)
428 				goto end;
429 		}
430 	}
431 
432 	return buf;
433 end:
434 	mlx5vf_free_data_buffer(buf);
435 	return ERR_PTR(ret);
436 }
437 
438 void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
439 {
440 	spin_lock_irq(&buf->migf->list_lock);
441 	buf->stop_copy_chunk_num = 0;
442 	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
443 	spin_unlock_irq(&buf->migf->list_lock);
444 }
445 
446 struct mlx5_vhca_data_buffer *
447 mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
448 		       size_t length, enum dma_data_direction dma_dir)
449 {
450 	struct mlx5_vhca_data_buffer *buf, *temp_buf;
451 	struct list_head free_list;
452 
453 	lockdep_assert_held(&migf->mvdev->state_mutex);
454 	if (migf->mvdev->mdev_detach)
455 		return ERR_PTR(-ENOTCONN);
456 
457 	INIT_LIST_HEAD(&free_list);
458 
459 	spin_lock_irq(&migf->list_lock);
460 	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
461 		if (buf->dma_dir == dma_dir) {
462 			list_del_init(&buf->buf_elm);
463 			if (buf->allocated_length >= length) {
464 				spin_unlock_irq(&migf->list_lock);
465 				goto found;
466 			}
467 			/*
468 			 * Prevent holding redundant buffers. Put in a free
469 			 * list and call at the end not under the spin lock
470 			 * (&migf->list_lock) to mlx5vf_free_data_buffer which
471 			 * might sleep.
472 			 */
473 			list_add(&buf->buf_elm, &free_list);
474 		}
475 	}
476 	spin_unlock_irq(&migf->list_lock);
477 	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
478 
479 found:
480 	while ((temp_buf = list_first_entry_or_null(&free_list,
481 				struct mlx5_vhca_data_buffer, buf_elm))) {
482 		list_del(&temp_buf->buf_elm);
483 		mlx5vf_free_data_buffer(temp_buf);
484 	}
485 
486 	return buf;
487 }
488 
489 static void
490 mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
491 			      struct mlx5vf_async_data *async_data)
492 {
493 	kvfree(async_data->out);
494 	complete(&migf->save_comp);
495 	fput(migf->filp);
496 }
497 
498 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
499 {
500 	struct mlx5vf_async_data *async_data = container_of(_work,
501 		struct mlx5vf_async_data, work);
502 	struct mlx5_vf_migration_file *migf = container_of(async_data,
503 		struct mlx5_vf_migration_file, async_data);
504 
505 	mutex_lock(&migf->lock);
506 	if (async_data->status) {
507 		mlx5vf_put_data_buffer(async_data->buf);
508 		if (async_data->header_buf)
509 			mlx5vf_put_data_buffer(async_data->header_buf);
510 		if (!async_data->stop_copy_chunk &&
511 		    async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
512 			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
513 		else
514 			migf->state = MLX5_MIGF_STATE_ERROR;
515 		wake_up_interruptible(&migf->poll_wait);
516 	}
517 	mutex_unlock(&migf->lock);
518 	mlx5vf_save_callback_complete(migf, async_data);
519 }
520 
521 static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
522 			  size_t image_size, bool initial_pre_copy)
523 {
524 	struct mlx5_vf_migration_file *migf = header_buf->migf;
525 	struct mlx5_vf_migration_header header = {};
526 	unsigned long flags;
527 	struct page *page;
528 	u8 *to_buff;
529 
530 	header.record_size = cpu_to_le64(image_size);
531 	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
532 	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
533 	page = mlx5vf_get_migration_page(header_buf, 0);
534 	if (!page)
535 		return -EINVAL;
536 	to_buff = kmap_local_page(page);
537 	memcpy(to_buff, &header, sizeof(header));
538 	kunmap_local(to_buff);
539 	header_buf->length = sizeof(header);
540 	header_buf->start_pos = header_buf->migf->max_pos;
541 	migf->max_pos += header_buf->length;
542 	spin_lock_irqsave(&migf->list_lock, flags);
543 	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
544 	spin_unlock_irqrestore(&migf->list_lock, flags);
545 	if (initial_pre_copy)
546 		migf->pre_copy_initial_bytes += sizeof(header);
547 	return 0;
548 }
549 
550 static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
551 {
552 	struct mlx5vf_async_data *async_data = container_of(context,
553 			struct mlx5vf_async_data, cb_work);
554 	struct mlx5_vf_migration_file *migf = container_of(async_data,
555 			struct mlx5_vf_migration_file, async_data);
556 
557 	if (!status) {
558 		size_t next_required_umem_size = 0;
559 		bool stop_copy_last_chunk;
560 		size_t image_size;
561 		unsigned long flags;
562 		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
563 				!async_data->stop_copy_chunk;
564 
565 		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
566 				      actual_image_size);
567 		if (async_data->buf->stop_copy_chunk_num)
568 			next_required_umem_size = MLX5_GET(save_vhca_state_out,
569 					async_data->out, next_required_umem_size);
570 		stop_copy_last_chunk = async_data->stop_copy_chunk &&
571 				!next_required_umem_size;
572 		if (async_data->header_buf) {
573 			status = add_buf_header(async_data->header_buf, image_size,
574 						initial_pre_copy);
575 			if (status)
576 				goto err;
577 		}
578 		async_data->buf->length = image_size;
579 		async_data->buf->start_pos = migf->max_pos;
580 		migf->max_pos += async_data->buf->length;
581 		spin_lock_irqsave(&migf->list_lock, flags);
582 		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
583 		if (async_data->buf->stop_copy_chunk_num) {
584 			migf->num_ready_chunks++;
585 			if (next_required_umem_size &&
586 			    migf->num_ready_chunks >= MAX_NUM_CHUNKS) {
587 				/* Delay the next SAVE till one chunk be consumed */
588 				migf->next_required_umem_size = next_required_umem_size;
589 				next_required_umem_size = 0;
590 			}
591 		}
592 		spin_unlock_irqrestore(&migf->list_lock, flags);
593 		if (initial_pre_copy) {
594 			migf->pre_copy_initial_bytes += image_size;
595 			migf->state = MLX5_MIGF_STATE_PRE_COPY;
596 		}
597 		if (stop_copy_last_chunk)
598 			migf->state = MLX5_MIGF_STATE_COMPLETE;
599 		wake_up_interruptible(&migf->poll_wait);
600 		if (next_required_umem_size)
601 			mlx5vf_mig_file_set_save_work(migf,
602 				/* Picking up the next chunk num */
603 				(async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1,
604 				next_required_umem_size);
605 		mlx5vf_save_callback_complete(migf, async_data);
606 		return;
607 	}
608 
609 err:
610 	/* The error flow can't run from an interrupt context */
611 	if (status == -EREMOTEIO)
612 		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
613 	async_data->status = status;
614 	queue_work(migf->mvdev->cb_wq, &async_data->work);
615 }
616 
617 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
618 			       struct mlx5_vf_migration_file *migf,
619 			       struct mlx5_vhca_data_buffer *buf, bool inc,
620 			       bool track)
621 {
622 	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
623 	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
624 	struct mlx5_vhca_data_buffer *header_buf = NULL;
625 	struct mlx5vf_async_data *async_data;
626 	int err;
627 
628 	lockdep_assert_held(&mvdev->state_mutex);
629 	if (mvdev->mdev_detach)
630 		return -ENOTCONN;
631 
632 	err = wait_for_completion_interruptible(&migf->save_comp);
633 	if (err)
634 		return err;
635 
636 	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
637 		/*
638 		 * In case we had a PRE_COPY error, SAVE is triggered only for
639 		 * the final image, read device full image.
640 		 */
641 		inc = false;
642 
643 	MLX5_SET(save_vhca_state_in, in, opcode,
644 		 MLX5_CMD_OP_SAVE_VHCA_STATE);
645 	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
646 	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
647 	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
648 	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
649 	MLX5_SET(save_vhca_state_in, in, incremental, inc);
650 	MLX5_SET(save_vhca_state_in, in, set_track, track);
651 
652 	async_data = &migf->async_data;
653 	async_data->buf = buf;
654 	async_data->stop_copy_chunk = !track;
655 	async_data->out = kvzalloc(out_size, GFP_KERNEL);
656 	if (!async_data->out) {
657 		err = -ENOMEM;
658 		goto err_out;
659 	}
660 
661 	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
662 		if (async_data->stop_copy_chunk) {
663 			u8 header_idx = buf->stop_copy_chunk_num ?
664 				buf->stop_copy_chunk_num - 1 : 0;
665 
666 			header_buf = migf->buf_header[header_idx];
667 			migf->buf_header[header_idx] = NULL;
668 		}
669 
670 		if (!header_buf) {
671 			header_buf = mlx5vf_get_data_buffer(migf,
672 				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
673 			if (IS_ERR(header_buf)) {
674 				err = PTR_ERR(header_buf);
675 				goto err_free;
676 			}
677 		}
678 	}
679 
680 	if (async_data->stop_copy_chunk)
681 		migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK;
682 
683 	async_data->header_buf = header_buf;
684 	get_file(migf->filp);
685 	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
686 			       async_data->out,
687 			       out_size, mlx5vf_save_callback,
688 			       &async_data->cb_work);
689 	if (err)
690 		goto err_exec;
691 
692 	return 0;
693 
694 err_exec:
695 	if (header_buf)
696 		mlx5vf_put_data_buffer(header_buf);
697 	fput(migf->filp);
698 err_free:
699 	kvfree(async_data->out);
700 err_out:
701 	complete(&migf->save_comp);
702 	return err;
703 }
704 
705 int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
706 			       struct mlx5_vf_migration_file *migf,
707 			       struct mlx5_vhca_data_buffer *buf)
708 {
709 	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
710 	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
711 	int err;
712 
713 	lockdep_assert_held(&mvdev->state_mutex);
714 	if (mvdev->mdev_detach)
715 		return -ENOTCONN;
716 
717 	if (!buf->dmaed) {
718 		err = mlx5vf_dma_data_buffer(buf);
719 		if (err)
720 			return err;
721 	}
722 
723 	MLX5_SET(load_vhca_state_in, in, opcode,
724 		 MLX5_CMD_OP_LOAD_VHCA_STATE);
725 	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
726 	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
727 	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
728 	MLX5_SET(load_vhca_state_in, in, size, buf->length);
729 	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
730 }
731 
732 int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
733 {
734 	int err;
735 
736 	lockdep_assert_held(&migf->mvdev->state_mutex);
737 	if (migf->mvdev->mdev_detach)
738 		return -ENOTCONN;
739 
740 	err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
741 	return err;
742 }
743 
744 void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
745 {
746 	lockdep_assert_held(&migf->mvdev->state_mutex);
747 	if (migf->mvdev->mdev_detach)
748 		return;
749 
750 	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
751 }
752 
753 void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
754 {
755 	struct mlx5_vhca_data_buffer *entry;
756 	int i;
757 
758 	lockdep_assert_held(&migf->mvdev->state_mutex);
759 	WARN_ON(migf->mvdev->mdev_detach);
760 
761 	for (i = 0; i < MAX_NUM_CHUNKS; i++) {
762 		if (migf->buf[i]) {
763 			mlx5vf_free_data_buffer(migf->buf[i]);
764 			migf->buf[i] = NULL;
765 		}
766 
767 		if (migf->buf_header[i]) {
768 			mlx5vf_free_data_buffer(migf->buf_header[i]);
769 			migf->buf_header[i] = NULL;
770 		}
771 	}
772 
773 	list_splice(&migf->avail_list, &migf->buf_list);
774 
775 	while ((entry = list_first_entry_or_null(&migf->buf_list,
776 				struct mlx5_vhca_data_buffer, buf_elm))) {
777 		list_del(&entry->buf_elm);
778 		mlx5vf_free_data_buffer(entry);
779 	}
780 
781 	mlx5vf_cmd_dealloc_pd(migf);
782 }
783 
784 static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
785 				 struct mlx5vf_pci_core_device *mvdev,
786 				 struct rb_root_cached *ranges, u32 nnodes)
787 {
788 	int max_num_range =
789 		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
790 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
791 	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
792 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
793 	struct interval_tree_node *node = NULL;
794 	u64 total_ranges_len = 0;
795 	u32 num_ranges = nnodes;
796 	u8 log_addr_space_size;
797 	void *range_list_ptr;
798 	void *obj_context;
799 	void *cmd_hdr;
800 	int inlen;
801 	void *in;
802 	int err;
803 	int i;
804 
805 	if (num_ranges > max_num_range) {
806 		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
807 		num_ranges = max_num_range;
808 	}
809 
810 	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
811 				 record_size * num_ranges;
812 	in = kzalloc(inlen, GFP_KERNEL);
813 	if (!in)
814 		return -ENOMEM;
815 
816 	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
817 			       general_obj_in_cmd_hdr);
818 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
819 		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
820 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
821 		 MLX5_OBJ_TYPE_PAGE_TRACK);
822 	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
823 	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
824 	MLX5_SET(page_track, obj_context, track_type, 1);
825 	MLX5_SET(page_track, obj_context, log_page_size,
826 		 ilog2(tracker->host_qp->tracked_page_size));
827 	MLX5_SET(page_track, obj_context, log_msg_size,
828 		 ilog2(tracker->host_qp->max_msg_size));
829 	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
830 	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);
831 
832 	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
833 	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
834 	for (i = 0; i < num_ranges; i++) {
835 		void *addr_range_i_base = range_list_ptr + record_size * i;
836 		unsigned long length = node->last - node->start + 1;
837 
838 		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
839 			   node->start);
840 		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
841 		total_ranges_len += length;
842 		node = interval_tree_iter_next(node, 0, ULONG_MAX);
843 	}
844 
845 	WARN_ON(node);
846 	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
847 	if (log_addr_space_size <
848 	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
849 	    log_addr_space_size >
850 	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
851 		err = -EOPNOTSUPP;
852 		goto out;
853 	}
854 
855 	MLX5_SET(page_track, obj_context, log_addr_space_size,
856 		 log_addr_space_size);
857 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
858 	if (err)
859 		goto out;
860 
861 	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
862 out:
863 	kfree(in);
864 	return err;
865 }
866 
867 static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
868 				      u32 tracker_id)
869 {
870 	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
871 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
872 
873 	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
874 	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
875 	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
876 
877 	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
878 }
879 
880 static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
881 				     u32 tracker_id, unsigned long iova,
882 				     unsigned long length, u32 tracker_state)
883 {
884 	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
885 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
886 	void *obj_context;
887 	void *cmd_hdr;
888 
889 	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
890 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
891 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
892 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);
893 
894 	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
895 	MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
896 	MLX5_SET64(page_track, obj_context, range_start_address, iova);
897 	MLX5_SET64(page_track, obj_context, length, length);
898 	MLX5_SET(page_track, obj_context, state, tracker_state);
899 
900 	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
901 }
902 
903 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
904 			     struct mlx5_vhca_cq_buf *buf, int nent,
905 			     int cqe_size)
906 {
907 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
908 	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
909 	u8 log_wq_sz = ilog2(cqe_size);
910 	int err;
911 
912 	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
913 				       mdev->priv.numa_node);
914 	if (err)
915 		return err;
916 
917 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
918 	buf->cqe_size = cqe_size;
919 	buf->nent = nent;
920 	return 0;
921 }
922 
923 static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
924 {
925 	struct mlx5_cqe64 *cqe64;
926 	void *cqe;
927 	int i;
928 
929 	for (i = 0; i < buf->nent; i++) {
930 		cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
931 		cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
932 		cqe64->op_own = MLX5_CQE_INVALID << 4;
933 	}
934 }
935 
936 static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
937 			      struct mlx5_vhca_cq *cq)
938 {
939 	mlx5_core_destroy_cq(mdev, &cq->mcq);
940 	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
941 	mlx5_db_free(mdev, &cq->db);
942 }
943 
944 static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
945 {
946 	if (type != MLX5_EVENT_TYPE_CQ_ERROR)
947 		return;
948 
949 	set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
950 				       tracker.cq.mcq));
951 }
952 
953 static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
954 				 void *data)
955 {
956 	struct mlx5_vhca_page_tracker *tracker =
957 		mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
958 	struct mlx5vf_pci_core_device *mvdev = container_of(
959 		tracker, struct mlx5vf_pci_core_device, tracker);
960 	struct mlx5_eqe *eqe = data;
961 	u8 event_type = (u8)type;
962 	u8 queue_type;
963 	int qp_num;
964 
965 	switch (event_type) {
966 	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
967 	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
968 	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
969 		queue_type = eqe->data.qp_srq.type;
970 		if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
971 			break;
972 		qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
973 		if (qp_num != tracker->host_qp->qpn &&
974 		    qp_num != tracker->fw_qp->qpn)
975 			break;
976 		set_tracker_error(mvdev);
977 		break;
978 	default:
979 		break;
980 	}
981 
982 	return NOTIFY_OK;
983 }
984 
985 static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
986 			       struct mlx5_eqe *eqe)
987 {
988 	struct mlx5vf_pci_core_device *mvdev =
989 		container_of(mcq, struct mlx5vf_pci_core_device,
990 			     tracker.cq.mcq);
991 
992 	complete(&mvdev->tracker_comp);
993 }
994 
995 static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
996 			    struct mlx5_vhca_page_tracker *tracker,
997 			    size_t ncqe)
998 {
999 	int cqe_size = cache_line_size() == 128 ? 128 : 64;
1000 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
1001 	struct mlx5_vhca_cq *cq;
1002 	int inlen, err, eqn;
1003 	void *cqc, *in;
1004 	__be64 *pas;
1005 	int vector;
1006 
1007 	cq = &tracker->cq;
1008 	ncqe = roundup_pow_of_two(ncqe);
1009 	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
1010 	if (err)
1011 		return err;
1012 
1013 	cq->ncqe = ncqe;
1014 	cq->mcq.set_ci_db = cq->db.db;
1015 	cq->mcq.arm_db = cq->db.db + 1;
1016 	cq->mcq.cqe_sz = cqe_size;
1017 	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
1018 	if (err)
1019 		goto err_db_free;
1020 
1021 	init_cq_frag_buf(&cq->buf);
1022 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
1023 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
1024 		cq->buf.frag_buf.npages;
1025 	in = kvzalloc(inlen, GFP_KERNEL);
1026 	if (!in) {
1027 		err = -ENOMEM;
1028 		goto err_buff;
1029 	}
1030 
1031 	vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
1032 	err = mlx5_comp_eqn_get(mdev, vector, &eqn);
1033 	if (err)
1034 		goto err_vec;
1035 
1036 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
1037 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
1038 	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
1039 	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
1040 	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
1041 		 MLX5_ADAPTER_PAGE_SHIFT);
1042 	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
1043 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
1044 	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
1045 	cq->mcq.comp = mlx5vf_cq_complete;
1046 	cq->mcq.event = mlx5vf_cq_event;
1047 	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
1048 	if (err)
1049 		goto err_vec;
1050 
1051 	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1052 		    cq->mcq.cons_index);
1053 	kvfree(in);
1054 	return 0;
1055 
1056 err_vec:
1057 	kvfree(in);
1058 err_buff:
1059 	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
1060 err_db_free:
1061 	mlx5_db_free(mdev, &cq->db);
1062 	return err;
1063 }
1064 
1065 static struct mlx5_vhca_qp *
1066 mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
1067 		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
1068 {
1069 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
1070 	struct mlx5_vhca_qp *qp;
1071 	u8 log_rq_stride;
1072 	u8 log_rq_sz;
1073 	void *qpc;
1074 	int inlen;
1075 	void *in;
1076 	int err;
1077 
1078 	qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
1079 	if (!qp)
1080 		return ERR_PTR(-ENOMEM);
1081 
1082 	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
1083 	if (err)
1084 		goto err_free;
1085 
1086 	if (max_recv_wr) {
1087 		qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
1088 		log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
1089 		log_rq_sz = ilog2(qp->rq.wqe_cnt);
1090 		err = mlx5_frag_buf_alloc_node(mdev,
1091 			wq_get_byte_sz(log_rq_sz, log_rq_stride),
1092 			&qp->buf, mdev->priv.numa_node);
1093 		if (err)
1094 			goto err_db_free;
1095 		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
1096 	}
1097 
1098 	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
1099 	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
1100 		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
1101 		qp->buf.npages;
1102 	in = kvzalloc(inlen, GFP_KERNEL);
1103 	if (!in) {
1104 		err = -ENOMEM;
1105 		goto err_in;
1106 	}
1107 
1108 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
1109 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
1110 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
1111 	MLX5_SET(qpc, qpc, pd, tracker->pdn);
1112 	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
1113 	MLX5_SET(qpc, qpc, log_page_size,
1114 		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
1115 	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
1116 	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
1117 		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
1118 	MLX5_SET(qpc, qpc, no_sq, 1);
1119 	if (max_recv_wr) {
1120 		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
1121 		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
1122 		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
1123 		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
1124 		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
1125 		mlx5_fill_page_frag_array(&qp->buf,
1126 					  (__be64 *)MLX5_ADDR_OF(create_qp_in,
1127 								 in, pas));
1128 	} else {
1129 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
1130 	}
1131 
1132 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
1133 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
1134 	kvfree(in);
1135 	if (err)
1136 		goto err_in;
1137 
1138 	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
1139 	return qp;
1140 
1141 err_in:
1142 	if (max_recv_wr)
1143 		mlx5_frag_buf_free(mdev, &qp->buf);
1144 err_db_free:
1145 	mlx5_db_free(mdev, &qp->db);
1146 err_free:
1147 	kfree(qp);
1148 	return ERR_PTR(err);
1149 }
1150 
1151 static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
1152 {
1153 	struct mlx5_wqe_data_seg *data;
1154 	unsigned int ix;
1155 
1156 	WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
1157 	ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
1158 	data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
1159 	data->byte_count = cpu_to_be32(qp->max_msg_size);
1160 	data->lkey = cpu_to_be32(qp->recv_buf.mkey);
1161 	data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
1162 	qp->rq.pc++;
1163 	/* Make sure that descriptors are written before doorbell record. */
1164 	dma_wmb();
1165 	*qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
1166 }
1167 
1168 static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
1169 			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
1170 			      bool host_qp)
1171 {
1172 	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
1173 	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
1174 	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
1175 	void *qpc;
1176 	int ret;
1177 
1178 	/* Init */
1179 	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
1180 	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1181 	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
1182 	MLX5_SET(qpc, qpc, rre, 1);
1183 	MLX5_SET(qpc, qpc, rwe, 1);
1184 	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
1185 	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
1186 	ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
1187 	if (ret)
1188 		return ret;
1189 
1190 	if (host_qp) {
1191 		struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1192 		int i;
1193 
1194 		for (i = 0; i < qp->rq.wqe_cnt; i++) {
1195 			mlx5vf_post_recv(qp);
1196 			recv_buf->next_rq_offset += qp->max_msg_size;
1197 		}
1198 	}
1199 
1200 	/* RTR */
1201 	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
1202 	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1203 	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
1204 	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
1205 	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
1206 	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1207 	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
1208 	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
1209 	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
1210 	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1211 	ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
1212 	if (ret || host_qp)
1213 		return ret;
1214 
1215 	/* RTS */
1216 	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
1217 	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1218 	MLX5_SET(qpc, qpc, retry_count, 7);
1219 	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
1220 	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
1221 	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
1222 	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1223 
1224 	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
1225 }
1226 
1227 static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
1228 			      struct mlx5_vhca_qp *qp)
1229 {
1230 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
1231 
1232 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
1233 	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
1234 	mlx5_cmd_exec_in(mdev, destroy_qp, in);
1235 
1236 	mlx5_frag_buf_free(mdev, &qp->buf);
1237 	mlx5_db_free(mdev, &qp->db);
1238 	kfree(qp);
1239 }
1240 
1241 static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
1242 {
1243 	int i;
1244 
1245 	/* Undo alloc_pages_bulk_array() */
1246 	for (i = 0; i < recv_buf->npages; i++)
1247 		__free_page(recv_buf->page_list[i]);
1248 
1249 	kvfree(recv_buf->page_list);
1250 }
1251 
1252 static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
1253 			    unsigned int npages)
1254 {
1255 	unsigned int filled = 0, done = 0;
1256 	int i;
1257 
1258 	recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
1259 				       GFP_KERNEL_ACCOUNT);
1260 	if (!recv_buf->page_list)
1261 		return -ENOMEM;
1262 
1263 	for (;;) {
1264 		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
1265 						npages - done,
1266 						recv_buf->page_list + done);
1267 		if (!filled)
1268 			goto err;
1269 
1270 		done += filled;
1271 		if (done == npages)
1272 			break;
1273 	}
1274 
1275 	recv_buf->npages = npages;
1276 	return 0;
1277 
1278 err:
1279 	for (i = 0; i < npages; i++) {
1280 		if (recv_buf->page_list[i])
1281 			__free_page(recv_buf->page_list[i]);
1282 	}
1283 
1284 	kvfree(recv_buf->page_list);
1285 	return -ENOMEM;
1286 }
1287 
1288 static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
1289 				   struct mlx5_vhca_recv_buf *recv_buf)
1290 {
1291 	int i, j;
1292 
1293 	recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
1294 				       sizeof(*recv_buf->dma_addrs),
1295 				       GFP_KERNEL_ACCOUNT);
1296 	if (!recv_buf->dma_addrs)
1297 		return -ENOMEM;
1298 
1299 	for (i = 0; i < recv_buf->npages; i++) {
1300 		recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
1301 						      recv_buf->page_list[i],
1302 						      0, PAGE_SIZE,
1303 						      DMA_FROM_DEVICE);
1304 		if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
1305 			goto error;
1306 	}
1307 	return 0;
1308 
1309 error:
1310 	for (j = 0; j < i; j++)
1311 		dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
1312 				 PAGE_SIZE, DMA_FROM_DEVICE);
1313 
1314 	kvfree(recv_buf->dma_addrs);
1315 	return -ENOMEM;
1316 }
1317 
1318 static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
1319 				      struct mlx5_vhca_recv_buf *recv_buf)
1320 {
1321 	int i;
1322 
1323 	for (i = 0; i < recv_buf->npages; i++)
1324 		dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
1325 				 PAGE_SIZE, DMA_FROM_DEVICE);
1326 
1327 	kvfree(recv_buf->dma_addrs);
1328 }
1329 
1330 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
1331 					  struct mlx5_vhca_qp *qp)
1332 {
1333 	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1334 
1335 	mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
1336 	unregister_dma_recv_pages(mdev, recv_buf);
1337 	free_recv_pages(&qp->recv_buf);
1338 }
1339 
1340 static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
1341 					  struct mlx5_vhca_qp *qp, u32 pdn,
1342 					  u64 rq_size)
1343 {
1344 	unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
1345 	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1346 	int err;
1347 
1348 	err = alloc_recv_pages(recv_buf, npages);
1349 	if (err < 0)
1350 		return err;
1351 
1352 	err = register_dma_recv_pages(mdev, recv_buf);
1353 	if (err)
1354 		goto end;
1355 
1356 	err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
1357 	if (err)
1358 		goto err_create_mkey;
1359 
1360 	return 0;
1361 
1362 err_create_mkey:
1363 	unregister_dma_recv_pages(mdev, recv_buf);
1364 end:
1365 	free_recv_pages(recv_buf);
1366 	return err;
1367 }
1368 
1369 static void
1370 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
1371 {
1372 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1373 	struct mlx5_core_dev *mdev = mvdev->mdev;
1374 
1375 	lockdep_assert_held(&mvdev->state_mutex);
1376 
1377 	if (!mvdev->log_active)
1378 		return;
1379 
1380 	WARN_ON(mvdev->mdev_detach);
1381 
1382 	mlx5_eq_notifier_unregister(mdev, &tracker->nb);
1383 	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
1384 	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
1385 	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
1386 	mlx5vf_destroy_qp(mdev, tracker->host_qp);
1387 	mlx5vf_destroy_cq(mdev, &tracker->cq);
1388 	mlx5_core_dealloc_pd(mdev, tracker->pdn);
1389 	mlx5_put_uars_page(mdev, tracker->uar);
1390 	mvdev->log_active = false;
1391 }
1392 
1393 int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
1394 {
1395 	struct mlx5vf_pci_core_device *mvdev = container_of(
1396 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1397 
1398 	mutex_lock(&mvdev->state_mutex);
1399 	if (!mvdev->log_active)
1400 		goto end;
1401 
1402 	_mlx5vf_free_page_tracker_resources(mvdev);
1403 	mvdev->log_active = false;
1404 end:
1405 	mlx5vf_state_mutex_unlock(mvdev);
1406 	return 0;
1407 }
1408 
1409 int mlx5vf_start_page_tracker(struct vfio_device *vdev,
1410 			      struct rb_root_cached *ranges, u32 nnodes,
1411 			      u64 *page_size)
1412 {
1413 	struct mlx5vf_pci_core_device *mvdev = container_of(
1414 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1415 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1416 	u8 log_tracked_page = ilog2(*page_size);
1417 	struct mlx5_vhca_qp *host_qp;
1418 	struct mlx5_vhca_qp *fw_qp;
1419 	struct mlx5_core_dev *mdev;
1420 	u32 max_msg_size = PAGE_SIZE;
1421 	u64 rq_size = SZ_2M;
1422 	u32 max_recv_wr;
1423 	int err;
1424 
1425 	mutex_lock(&mvdev->state_mutex);
1426 	if (mvdev->mdev_detach) {
1427 		err = -ENOTCONN;
1428 		goto end;
1429 	}
1430 
1431 	if (mvdev->log_active) {
1432 		err = -EINVAL;
1433 		goto end;
1434 	}
1435 
1436 	mdev = mvdev->mdev;
1437 	memset(tracker, 0, sizeof(*tracker));
1438 	tracker->uar = mlx5_get_uars_page(mdev);
1439 	if (IS_ERR(tracker->uar)) {
1440 		err = PTR_ERR(tracker->uar);
1441 		goto end;
1442 	}
1443 
1444 	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
1445 	if (err)
1446 		goto err_uar;
1447 
1448 	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
1449 	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
1450 	if (err)
1451 		goto err_dealloc_pd;
1452 
1453 	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
1454 	if (IS_ERR(host_qp)) {
1455 		err = PTR_ERR(host_qp);
1456 		goto err_cq;
1457 	}
1458 
1459 	host_qp->max_msg_size = max_msg_size;
1460 	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1461 				pg_track_log_min_page_size)) {
1462 		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1463 				pg_track_log_min_page_size);
1464 	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1465 				pg_track_log_max_page_size)) {
1466 		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1467 				pg_track_log_max_page_size);
1468 	}
1469 
1470 	host_qp->tracked_page_size = (1ULL << log_tracked_page);
1471 	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
1472 					     rq_size);
1473 	if (err)
1474 		goto err_host_qp;
1475 
1476 	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
1477 	if (IS_ERR(fw_qp)) {
1478 		err = PTR_ERR(fw_qp);
1479 		goto err_recv_resources;
1480 	}
1481 
1482 	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
1483 	if (err)
1484 		goto err_activate;
1485 
1486 	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
1487 	if (err)
1488 		goto err_activate;
1489 
1490 	tracker->host_qp = host_qp;
1491 	tracker->fw_qp = fw_qp;
1492 	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
1493 	if (err)
1494 		goto err_activate;
1495 
1496 	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
1497 	mlx5_eq_notifier_register(mdev, &tracker->nb);
1498 	*page_size = host_qp->tracked_page_size;
1499 	mvdev->log_active = true;
1500 	mlx5vf_state_mutex_unlock(mvdev);
1501 	return 0;
1502 
1503 err_activate:
1504 	mlx5vf_destroy_qp(mdev, fw_qp);
1505 err_recv_resources:
1506 	mlx5vf_free_qp_recv_resources(mdev, host_qp);
1507 err_host_qp:
1508 	mlx5vf_destroy_qp(mdev, host_qp);
1509 err_cq:
1510 	mlx5vf_destroy_cq(mdev, &tracker->cq);
1511 err_dealloc_pd:
1512 	mlx5_core_dealloc_pd(mdev, tracker->pdn);
1513 err_uar:
1514 	mlx5_put_uars_page(mdev, tracker->uar);
1515 end:
1516 	mlx5vf_state_mutex_unlock(mvdev);
1517 	return err;
1518 }
1519 
1520 static void
1521 set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
1522 		  struct iova_bitmap *dirty)
1523 {
1524 	u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
1525 	u32 nent = size / entry_size;
1526 	struct page *page;
1527 	u64 addr;
1528 	u64 *buf;
1529 	int i;
1530 
1531 	if (WARN_ON(index >= qp->recv_buf.npages ||
1532 		    (nent > qp->max_msg_size / entry_size)))
1533 		return;
1534 
1535 	page = qp->recv_buf.page_list[index];
1536 	buf = kmap_local_page(page);
1537 	for (i = 0; i < nent; i++) {
1538 		addr = MLX5_GET(page_track_report_entry, buf + i,
1539 				dirty_address_low);
1540 		addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
1541 				      dirty_address_high) << 32;
1542 		iova_bitmap_set(dirty, addr, qp->tracked_page_size);
1543 	}
1544 	kunmap_local(buf);
1545 }
1546 
1547 static void
1548 mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
1549 	      struct iova_bitmap *dirty, int *tracker_status)
1550 {
1551 	u32 size;
1552 	int ix;
1553 
1554 	qp->rq.cc++;
1555 	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
1556 	size = be32_to_cpu(cqe->byte_cnt);
1557 	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);
1558 
1559 	/* zero length CQE, no data */
1560 	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
1561 	if (size)
1562 		set_report_output(size, ix, qp, dirty);
1563 
1564 	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
1565 	mlx5vf_post_recv(qp);
1566 }
1567 
1568 static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
1569 {
1570 	return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
1571 }
1572 
1573 static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
1574 {
1575 	void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
1576 	struct mlx5_cqe64 *cqe64;
1577 
1578 	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
1579 
1580 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
1581 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
1582 		return cqe64;
1583 	} else {
1584 		return NULL;
1585 	}
1586 }
1587 
1588 static int
1589 mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
1590 		   struct iova_bitmap *dirty, int *tracker_status)
1591 {
1592 	struct mlx5_cqe64 *cqe;
1593 	u8 opcode;
1594 
1595 	cqe = get_sw_cqe(cq, cq->mcq.cons_index);
1596 	if (!cqe)
1597 		return CQ_EMPTY;
1598 
1599 	++cq->mcq.cons_index;
1600 	/*
1601 	 * Make sure we read CQ entry contents after we've checked the
1602 	 * ownership bit.
1603 	 */
1604 	rmb();
1605 	opcode = get_cqe_opcode(cqe);
1606 	switch (opcode) {
1607 	case MLX5_CQE_RESP_SEND_IMM:
1608 		mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
1609 		return CQ_OK;
1610 	default:
1611 		return CQ_POLL_ERR;
1612 	}
1613 }
1614 
1615 int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
1616 				  unsigned long length,
1617 				  struct iova_bitmap *dirty)
1618 {
1619 	struct mlx5vf_pci_core_device *mvdev = container_of(
1620 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1621 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1622 	struct mlx5_vhca_cq *cq = &tracker->cq;
1623 	struct mlx5_core_dev *mdev;
1624 	int poll_err, err;
1625 
1626 	mutex_lock(&mvdev->state_mutex);
1627 	if (!mvdev->log_active) {
1628 		err = -EINVAL;
1629 		goto end;
1630 	}
1631 
1632 	if (mvdev->mdev_detach) {
1633 		err = -ENOTCONN;
1634 		goto end;
1635 	}
1636 
1637 	mdev = mvdev->mdev;
1638 	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
1639 					MLX5_PAGE_TRACK_STATE_REPORTING);
1640 	if (err)
1641 		goto end;
1642 
1643 	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
1644 	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
1645 	       !tracker->is_err) {
1646 		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
1647 					      &tracker->status);
1648 		if (poll_err == CQ_EMPTY) {
1649 			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1650 				    cq->mcq.cons_index);
1651 			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
1652 						      dirty, &tracker->status);
1653 			if (poll_err == CQ_EMPTY) {
1654 				wait_for_completion(&mvdev->tracker_comp);
1655 				continue;
1656 			}
1657 		}
1658 		if (poll_err == CQ_POLL_ERR) {
1659 			err = -EIO;
1660 			goto end;
1661 		}
1662 		mlx5_cq_set_ci(&cq->mcq);
1663 	}
1664 
1665 	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
1666 		tracker->is_err = true;
1667 
1668 	if (tracker->is_err)
1669 		err = -EIO;
1670 end:
1671 	mlx5vf_state_mutex_unlock(mvdev);
1672 	return err;
1673 }
1674