xref: /linux/drivers/vfio/pci/mlx5/cmd.c (revision 56f90177573e334532c0f039684e9c759830027a)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5 
6 #include "cmd.h"
7 
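/* Return codes used when polling the page-tracker completion queue */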
8 enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };
9 
10 static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
11 {
12 	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
13 	void *query_cap = NULL, *cap;
14 	int ret;
15 
16 	query_cap = kzalloc(query_sz, GFP_KERNEL);
17 	if (!query_cap)
18 		return -ENOMEM;
19 
20 	ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
21 					    MLX5_CAP_GENERAL_2);
22 	if (ret)
23 		goto out;
24 
25 	cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
26 	if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
27 		ret = -EOPNOTSUPP;
28 out:
29 	kfree(query_cap);
30 	return ret;
31 }
32 
33 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
34 				  u16 *vhca_id);
35 static void
36 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
37 
38 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
39 {
40 	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
41 	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
42 	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
43 	int err;
44 
45 	lockdep_assert_held(&mvdev->state_mutex);
46 	if (mvdev->mdev_detach)
47 		return -ENOTCONN;
48 
49 	/*
50 	 * In case PRE_COPY is used, saving_migf is exposed while the device is
51 	 * running. Make sure to run only when there is no active save command.
52 	 * Running both in parallel might end up with the save command failing
53 	 * once it tries to turn on 'tracking' on a suspended device.
54 	 */
55 	if (migf) {
56 		err = wait_for_completion_interruptible(&migf->save_comp);
57 		if (err)
58 			return err;
59 	}
60 
61 	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
62 	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
63 	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
64 
65 	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
66 	if (migf)
67 		complete(&migf->save_comp);
68 
69 	return err;
70 }
71 
72 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
73 {
74 	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
75 	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
76 
77 	lockdep_assert_held(&mvdev->state_mutex);
78 	if (mvdev->mdev_detach)
79 		return -ENOTCONN;
80 
81 	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
82 	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
83 	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);
84 
85 	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
86 }
87 
88 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
89 					  size_t *state_size, u64 *total_size,
90 					  u8 *mig_state, u8 query_flags)
91 {
92 	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
93 	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
94 	bool inc = query_flags & MLX5VF_QUERY_INC;
95 	int ret;
96 
97 	lockdep_assert_held(&mvdev->state_mutex);
98 	if (mvdev->mdev_detach)
99 		return -ENOTCONN;
100 
101 	/*
102 	 * In case PRE_COPY is used, saving_migf is exposed while the device is
103 	 * running. Make sure to run only when there is no active save command.
104 	 * Running both in parallel might end up with the incremental query
105 	 * command failing on an untracked vhca.
106 	 */
107 	if (inc) {
108 		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
109 		if (ret)
110 			return ret;
111 		/* Upon cleanup, ignore previous pre_copy error state */
112 		if (mvdev->saving_migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR &&
113 		    !(query_flags & MLX5VF_QUERY_CLEANUP)) {
114 			/*
115 			 * In case we had a PRE_COPY error, query the full
116 			 * image only for the final image.
117 			 */
118 			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
119 				*state_size = 0;
120 				complete(&mvdev->saving_migf->save_comp);
121 				return 0;
122 			}
123 			query_flags &= ~MLX5VF_QUERY_INC;
124 		}
125 		/* Block incremental query which is state-dependent */
126 		if (mvdev->saving_migf->state == MLX5_MIGF_STATE_ERROR) {
127 			complete(&mvdev->saving_migf->save_comp);
128 			return -ENODEV;
129 		}
130 	}
131 
132 	MLX5_SET(query_vhca_migration_state_in, in, opcode,
133 		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
134 	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
135 	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
136 	MLX5_SET(query_vhca_migration_state_in, in, incremental,
137 		 query_flags & MLX5VF_QUERY_INC);
138 	MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode);
139 
140 	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
141 				  out);
142 	if (inc)
143 		complete(&mvdev->saving_migf->save_comp);
144 
145 	if (ret)
146 		return ret;
147 
148 	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
149 			       required_umem_size);
150 	if (total_size)
151 		*total_size = mvdev->chunk_mode ?
152 			MLX5_GET64(query_vhca_migration_state_out, out,
153 				   remaining_total_size) : *state_size;
154 
155 	if (mig_state && mvdev->mig_state_cap)
156 		*mig_state = MLX5_GET(query_vhca_migration_state_out, out,
157 				      migration_state);
158 
159 	return 0;
160 }
161 
162 static void set_tracker_change_event(struct mlx5vf_pci_core_device *mvdev)
163 {
164 	mvdev->tracker.object_changed = true;
165 	complete(&mvdev->tracker_comp);
166 }
167 
168 static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
169 {
170 	/* Mark the tracker as being in error and wake it up if it's running */
171 	mvdev->tracker.is_err = true;
172 	complete(&mvdev->tracker_comp);
173 }
174 
175 static int mlx5fv_vf_event(struct notifier_block *nb,
176 			   unsigned long event, void *data)
177 {
178 	struct mlx5vf_pci_core_device *mvdev =
179 		container_of(nb, struct mlx5vf_pci_core_device, nb);
180 
181 	switch (event) {
182 	case MLX5_PF_NOTIFY_ENABLE_VF:
183 		mutex_lock(&mvdev->state_mutex);
184 		mvdev->mdev_detach = false;
185 		mlx5vf_state_mutex_unlock(mvdev);
186 		break;
187 	case MLX5_PF_NOTIFY_DISABLE_VF:
188 		mlx5vf_cmd_close_migratable(mvdev);
189 		mutex_lock(&mvdev->state_mutex);
190 		mvdev->mdev_detach = true;
191 		mlx5vf_state_mutex_unlock(mvdev);
192 		break;
193 	default:
194 		break;
195 	}
196 
197 	return 0;
198 }
199 
200 void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
201 {
202 	if (!mvdev->migrate_cap)
203 		return;
204 
205 	/* Must be done outside the lock to let it progress */
206 	set_tracker_error(mvdev);
207 	mutex_lock(&mvdev->state_mutex);
208 	mlx5vf_disable_fds(mvdev, NULL);
209 	_mlx5vf_free_page_tracker_resources(mvdev);
210 	mlx5vf_state_mutex_unlock(mvdev);
211 }
212 
213 void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
214 {
215 	if (!mvdev->migrate_cap)
216 		return;
217 
218 	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
219 						&mvdev->nb);
220 	destroy_workqueue(mvdev->cb_wq);
221 }
222 
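/*
 * Detect whether this VF supports live migration and, if so, set up the
 * SR-IOV notifier, the callback workqueue and the VFIO migration/logging ops
 * that expose it.
 */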
223 void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
224 			       const struct vfio_migration_ops *mig_ops,
225 			       const struct vfio_log_ops *log_ops)
226 {
227 	struct pci_dev *pdev = mvdev->core_device.pdev;
228 	int ret;
229 
230 	if (!pdev->is_virtfn)
231 		return;
232 
233 	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
234 	if (!mvdev->mdev)
235 		return;
236 
237 	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
238 		goto end;
239 
240 	if (!(MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
241 	      MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)))
242 		goto end;
243 
244 	mvdev->vf_id = pci_iov_vf_id(pdev);
245 	if (mvdev->vf_id < 0)
246 		goto end;
247 
248 	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
249 	if (ret)
250 		goto end;
251 
252 	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
253 				   &mvdev->vhca_id))
254 		goto end;
255 
256 	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
257 	if (!mvdev->cb_wq)
258 		goto end;
259 
260 	mutex_init(&mvdev->state_mutex);
261 	spin_lock_init(&mvdev->reset_lock);
262 	mvdev->nb.notifier_call = mlx5fv_vf_event;
263 	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
264 						    &mvdev->nb);
265 	if (ret) {
266 		destroy_workqueue(mvdev->cb_wq);
267 		goto end;
268 	}
269 
270 	mvdev->migrate_cap = 1;
271 	mvdev->core_device.vdev.migration_flags =
272 		VFIO_MIGRATION_STOP_COPY |
273 		VFIO_MIGRATION_P2P |
274 		VFIO_MIGRATION_PRE_COPY;
275 
276 	mvdev->core_device.vdev.mig_ops = mig_ops;
277 	init_completion(&mvdev->tracker_comp);
278 	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
279 		mvdev->core_device.vdev.log_ops = log_ops;
280 
281 	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
282 		mvdev->chunk_mode = 1;
283 
284 	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_state))
285 		mvdev->mig_state_cap = 1;
286 
287 end:
288 	mlx5_vf_put_core_dev(mvdev->mdev);
289 }
290 
291 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
292 				  u16 *vhca_id)
293 {
294 	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
295 	int out_size;
296 	void *out;
297 	int ret;
298 
299 	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
300 	out = kzalloc(out_size, GFP_KERNEL);
301 	if (!out)
302 		return -ENOMEM;
303 
304 	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
305 	MLX5_SET(query_hca_cap_in, in, other_function, 1);
306 	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
307 	MLX5_SET(query_hca_cap_in, in, op_mod,
308 		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
309 		 HCA_CAP_OPMOD_GET_CUR);
310 
311 	ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
312 	if (ret)
313 		goto err_exec;
314 
315 	*vhca_id = MLX5_GET(query_hca_cap_out, out,
316 			    capability.cmd_hca_cap.vhca_id);
317 
318 err_exec:
319 	kfree(out);
320 	return ret;
321 }
322 
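/*
 * Build the CREATE_MKEY command input for an MTT-based mkey covering npages.
 * The MTT array follows the base struct (two 8-byte entries per octword,
 * hence the round up) and is filled in by register_dma_pages().
 */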
323 static u32 *alloc_mkey_in(u32 npages, u32 pdn)
324 {
325 	int inlen;
326 	void *mkc;
327 	u32 *in;
328 
329 	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
330 		sizeof(__be64) * round_up(npages, 2);
331 
332 	in = kvzalloc(inlen, GFP_KERNEL_ACCOUNT);
333 	if (!in)
334 		return NULL;
335 
336 	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
337 		 DIV_ROUND_UP(npages, 2));
338 
339 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
340 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
341 	MLX5_SET(mkc, mkc, lr, 1);
342 	MLX5_SET(mkc, mkc, lw, 1);
343 	MLX5_SET(mkc, mkc, rr, 1);
344 	MLX5_SET(mkc, mkc, rw, 1);
345 	MLX5_SET(mkc, mkc, pd, pdn);
346 	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
347 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
348 	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
349 	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
350 	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
351 
352 	return in;
353 }
354 
355 static int create_mkey(struct mlx5_core_dev *mdev, u32 npages, u32 *mkey_in,
356 		       u32 *mkey)
357 {
358 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
359 		sizeof(__be64) * round_up(npages, 2);
360 
361 	return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen);
362 }
363 
364 static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
365 				 u32 *mkey_in, struct dma_iova_state *state,
366 				 enum dma_data_direction dir)
367 {
368 	dma_addr_t addr;
369 	__be64 *mtt;
370 	int i;
371 
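	/*
	 * Undo whichever mapping register_dma_pages() set up: either tear
	 * down the contiguous IOVA range or unmap each page recorded in
	 * the MTT.
	 */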
372 	if (dma_use_iova(state)) {
373 		dma_iova_destroy(mdev->device, state, npages * PAGE_SIZE, dir,
374 				 0);
375 	} else {
376 		mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in,
377 					     klm_pas_mtt);
378 		for (i = npages - 1; i >= 0; i--) {
379 			addr = be64_to_cpu(mtt[i]);
380 			dma_unmap_page(mdev->device, addr, PAGE_SIZE, dir);
381 		}
382 	}
383 }
384 
385 static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
386 			      struct page **page_list, u32 *mkey_in,
387 			      struct dma_iova_state *state,
388 			      enum dma_data_direction dir)
389 {
390 	dma_addr_t addr;
391 	size_t mapped = 0;
392 	__be64 *mtt;
393 	int i, err;
394 
395 	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
396 
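	/*
	 * Prefer a single contiguous IOVA range and link every page into it,
	 * recording consecutive DMA addresses in the MTT. If that is not
	 * possible, fall back to mapping each page individually.
	 */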
397 	if (dma_iova_try_alloc(mdev->device, state, 0, npages * PAGE_SIZE)) {
398 		addr = state->addr;
399 		for (i = 0; i < npages; i++) {
400 			err = dma_iova_link(mdev->device, state,
401 					    page_to_phys(page_list[i]), mapped,
402 					    PAGE_SIZE, dir, 0);
403 			if (err)
404 				goto error;
405 			*mtt++ = cpu_to_be64(addr);
406 			addr += PAGE_SIZE;
407 			mapped += PAGE_SIZE;
408 		}
409 		err = dma_iova_sync(mdev->device, state, 0, mapped);
410 		if (err)
411 			goto error;
412 	} else {
413 		for (i = 0; i < npages; i++) {
414 			addr = dma_map_page(mdev->device, page_list[i], 0,
415 					    PAGE_SIZE, dir);
416 			err = dma_mapping_error(mdev->device, addr);
417 			if (err)
418 				goto error;
419 			*mtt++ = cpu_to_be64(addr);
420 		}
421 	}
422 	return 0;
423 
424 error:
425 	unregister_dma_pages(mdev, i, mkey_in, state, dir);
426 	return err;
427 }
428 
429 static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
430 {
431 	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
432 	struct mlx5_core_dev *mdev = mvdev->mdev;
433 	int ret;
434 
435 	lockdep_assert_held(&mvdev->state_mutex);
436 	if (mvdev->mdev_detach)
437 		return -ENOTCONN;
438 
439 	if (buf->mkey_in || !buf->npages)
440 		return -EINVAL;
441 
442 	buf->mkey_in = alloc_mkey_in(buf->npages, buf->migf->pdn);
443 	if (!buf->mkey_in)
444 		return -ENOMEM;
445 
446 	ret = register_dma_pages(mdev, buf->npages, buf->page_list,
447 				 buf->mkey_in, &buf->state, buf->dma_dir);
448 	if (ret)
449 		goto err_register_dma;
450 
451 	ret = create_mkey(mdev, buf->npages, buf->mkey_in, &buf->mkey);
452 	if (ret)
453 		goto err_create_mkey;
454 
455 	return 0;
456 
457 err_create_mkey:
458 	unregister_dma_pages(mdev, buf->npages, buf->mkey_in, &buf->state,
459 			     buf->dma_dir);
460 err_register_dma:
461 	kvfree(buf->mkey_in);
462 	buf->mkey_in = NULL;
463 	return ret;
464 }
465 
466 static void free_page_list(u32 npages, struct page **page_list)
467 {
468 	int i;
469 
470 	/* Undo alloc_pages_bulk() */
471 	for (i = npages - 1; i >= 0; i--)
472 		__free_page(page_list[i]);
473 
474 	kvfree(page_list);
475 }
476 
477 void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
478 {
479 	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
480 	struct mlx5_core_dev *mdev = mvdev->mdev;
481 
482 	lockdep_assert_held(&mvdev->state_mutex);
483 	WARN_ON(mvdev->mdev_detach);
484 
485 	if (buf->mkey_in) {
486 		mlx5_core_destroy_mkey(mdev, buf->mkey);
487 		unregister_dma_pages(mdev, buf->npages, buf->mkey_in,
488 				     &buf->state, buf->dma_dir);
489 		kvfree(buf->mkey_in);
490 	}
491 
492 	free_page_list(buf->npages, buf->page_list);
493 	kfree(buf);
494 }
495 
496 static int mlx5vf_add_pages(struct page ***page_list, unsigned int npages)
497 {
498 	unsigned int filled, done = 0;
499 	int i;
500 
501 	*page_list =
502 		kvzalloc_objs(struct page *, npages, GFP_KERNEL_ACCOUNT);
503 	if (!*page_list)
504 		return -ENOMEM;
505 
506 	for (;;) {
507 		filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, npages - done,
508 					  *page_list + done);
509 		if (!filled)
510 			goto err;
511 
512 		done += filled;
513 		if (done == npages)
514 			break;
515 	}
516 
517 	return 0;
518 
519 err:
520 	for (i = 0; i < done; i++)
521 		__free_page((*page_list)[i]);
522 
523 	kvfree(*page_list);
524 	*page_list = NULL;
525 	return -ENOMEM;
526 }
527 
528 struct mlx5_vhca_data_buffer *
529 mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
530 			 enum dma_data_direction dma_dir)
531 {
532 	struct mlx5_vhca_data_buffer *buf;
533 	int ret;
534 
535 	buf = kzalloc_obj(*buf, GFP_KERNEL_ACCOUNT);
536 	if (!buf)
537 		return ERR_PTR(-ENOMEM);
538 
539 	buf->dma_dir = dma_dir;
540 	buf->migf = migf;
541 	if (npages) {
542 		ret = mlx5vf_add_pages(&buf->page_list, npages);
543 		if (ret)
544 			goto end;
545 
546 		buf->npages = npages;
547 
548 		if (dma_dir != DMA_NONE) {
549 			ret = mlx5vf_dma_data_buffer(buf);
550 			if (ret)
551 				goto end;
552 		}
553 	}
554 
555 	return buf;
556 end:
557 	mlx5vf_free_data_buffer(buf);
558 	return ERR_PTR(ret);
559 }
560 
561 void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
562 {
563 	spin_lock_irq(&buf->migf->list_lock);
564 	buf->stop_copy_chunk_num = 0;
565 	buf->pre_copy_init_bytes_chunk = false;
566 	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
567 	spin_unlock_irq(&buf->migf->list_lock);
568 }
569 
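/*
 * Reuse a cached buffer from the avail list if one matches the DMA direction
 * and is large enough; otherwise allocate a new one. Smaller buffers taken
 * off the list are freed outside the spin lock since freeing might sleep.
 */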
570 struct mlx5_vhca_data_buffer *
571 mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
572 		       enum dma_data_direction dma_dir)
573 {
574 	struct mlx5_vhca_data_buffer *buf, *temp_buf;
575 	struct list_head free_list;
576 
577 	lockdep_assert_held(&migf->mvdev->state_mutex);
578 	if (migf->mvdev->mdev_detach)
579 		return ERR_PTR(-ENOTCONN);
580 
581 	INIT_LIST_HEAD(&free_list);
582 
583 	spin_lock_irq(&migf->list_lock);
584 	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
585 		if (buf->dma_dir == dma_dir) {
586 			list_del_init(&buf->buf_elm);
587 			if (buf->npages >= npages) {
588 				spin_unlock_irq(&migf->list_lock);
589 				goto found;
590 			}
591 			/*
592 			 * Prevent holding redundant buffers. Put them on a
593 			 * free list and free them at the end, after dropping
594 			 * the spin lock (&migf->list_lock), since
595 			 * mlx5vf_free_data_buffer() might sleep.
596 			 */
597 			list_add(&buf->buf_elm, &free_list);
598 		}
599 	}
600 	spin_unlock_irq(&migf->list_lock);
601 	buf = mlx5vf_alloc_data_buffer(migf, npages, dma_dir);
602 
603 found:
604 	while ((temp_buf = list_first_entry_or_null(&free_list,
605 				struct mlx5_vhca_data_buffer, buf_elm))) {
606 		list_del(&temp_buf->buf_elm);
607 		mlx5vf_free_data_buffer(temp_buf);
608 	}
609 
610 	return buf;
611 }
612 
613 static void
614 mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
615 			      struct mlx5vf_async_data *async_data)
616 {
617 	migf->inflight_save = 0;
618 	wake_up_interruptible(&migf->poll_wait);
619 	kvfree(async_data->out);
620 	complete(&migf->save_comp);
621 	fput(migf->filp);
622 }
623 
624 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
625 {
626 	struct mlx5vf_async_data *async_data = container_of(_work,
627 		struct mlx5vf_async_data, work);
628 	struct mlx5_vf_migration_file *migf = container_of(async_data,
629 		struct mlx5_vf_migration_file, async_data);
630 
631 	mutex_lock(&migf->lock);
632 	if (async_data->status) {
633 		mlx5vf_put_data_buffer(async_data->buf);
634 		if (async_data->header_buf)
635 			mlx5vf_put_data_buffer(async_data->header_buf);
636 		if (!async_data->stop_copy_chunk &&
637 		    async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
638 			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
639 		else
640 			migf->state = MLX5_MIGF_STATE_ERROR;
641 		wake_up_interruptible(&migf->poll_wait);
642 	}
643 	mutex_unlock(&migf->lock);
644 	mlx5vf_save_callback_complete(migf, async_data);
645 }
646 
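/*
 * Write a migration stream header describing the image that follows into the
 * header buffer and queue it on the migration file ahead of the data.
 */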
647 static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
648 			  size_t image_size, bool initial_pre_copy)
649 {
650 	struct mlx5_vf_migration_file *migf = header_buf->migf;
651 	struct mlx5_vf_migration_header header = {};
652 	unsigned long flags;
653 	struct page *page;
654 	u8 *to_buff;
655 
656 	header.record_size = cpu_to_le64(image_size);
657 	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
658 	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
659 	page = mlx5vf_get_migration_page(header_buf, 0);
660 	if (!page)
661 		return -EINVAL;
662 	to_buff = kmap_local_page(page);
663 	memcpy(to_buff, &header, sizeof(header));
664 	kunmap_local(to_buff);
665 	header_buf->length = sizeof(header);
666 	header_buf->start_pos = header_buf->migf->max_pos;
667 	migf->max_pos += header_buf->length;
668 	spin_lock_irqsave(&migf->list_lock, flags);
669 	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
670 	spin_unlock_irqrestore(&migf->list_lock, flags);
671 	if (initial_pre_copy)
672 		migf->pre_copy_initial_bytes += sizeof(header);
673 	return 0;
674 }
675 
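/*
 * Completion callback of the asynchronous SAVE command: on success queue the
 * produced header and data on the migration file, update the pre-copy /
 * stop-copy bookkeeping and, in chunk mode, schedule the next SAVE if the
 * device reported more data. Errors are deferred to the cleanup work.
 */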
676 static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
677 {
678 	struct mlx5vf_async_data *async_data = container_of(context,
679 			struct mlx5vf_async_data, cb_work);
680 	struct mlx5_vf_migration_file *migf = container_of(async_data,
681 			struct mlx5_vf_migration_file, async_data);
682 
683 	if (!status) {
684 		size_t next_required_umem_size = 0;
685 		bool stop_copy_last_chunk;
686 		size_t image_size;
687 		unsigned long flags;
688 		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
689 				!async_data->stop_copy_chunk;
690 
691 		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
692 				      actual_image_size);
693 		if (async_data->buf->stop_copy_chunk_num)
694 			next_required_umem_size = MLX5_GET(save_vhca_state_out,
695 					async_data->out, next_required_umem_size);
696 		stop_copy_last_chunk = async_data->stop_copy_chunk &&
697 				!next_required_umem_size;
698 		if (async_data->header_buf) {
699 			status = add_buf_header(async_data->header_buf, image_size,
700 						initial_pre_copy ||
701 						async_data->buf->pre_copy_init_bytes_chunk);
702 			if (status)
703 				goto err;
704 		}
705 		async_data->buf->length = image_size;
706 		async_data->buf->start_pos = migf->max_pos;
707 		migf->max_pos += async_data->buf->length;
708 		spin_lock_irqsave(&migf->list_lock, flags);
709 		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
710 		if (async_data->buf->stop_copy_chunk_num) {
711 			migf->num_ready_chunks++;
712 			if (next_required_umem_size &&
713 			    migf->num_ready_chunks >= MAX_NUM_CHUNKS) {
714 				/* Delay the next SAVE until one chunk is consumed */
715 				migf->next_required_umem_size = next_required_umem_size;
716 				next_required_umem_size = 0;
717 			}
718 		}
719 		spin_unlock_irqrestore(&migf->list_lock, flags);
720 		if (initial_pre_copy || async_data->buf->pre_copy_init_bytes_chunk) {
721 			migf->pre_copy_initial_bytes += image_size;
722 			if (initial_pre_copy)
723 				migf->state = MLX5_MIGF_STATE_PRE_COPY;
724 			if (async_data->buf->pre_copy_init_bytes_chunk)
725 				async_data->buf->pre_copy_init_bytes_chunk = false;
726 		}
727 		if (stop_copy_last_chunk)
728 			migf->state = MLX5_MIGF_STATE_COMPLETE;
729 		wake_up_interruptible(&migf->poll_wait);
730 		if (next_required_umem_size)
731 			mlx5vf_mig_file_set_save_work(migf,
732 				/* Picking up the next chunk num */
733 				(async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1,
734 				next_required_umem_size);
735 		mlx5vf_save_callback_complete(migf, async_data);
736 		return;
737 	}
738 
739 err:
740 	/* The error flow can't run from an interrupt context */
741 	if (status == -EREMOTEIO) {
742 		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
743 		/* Failed in FW, print cmd out failure details */
744 		mlx5_cmd_out_err(migf->mvdev->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE, 0,
745 				 async_data->out);
746 	}
747 
748 	async_data->status = status;
749 	queue_work(migf->mvdev->cb_wq, &async_data->work);
750 }
751 
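/*
 * Trigger an asynchronous SAVE_VHCA_STATE command. The device writes the
 * (optionally incremental) image into buf through its mkey; completion is
 * handled by mlx5vf_save_callback().
 */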
752 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
753 			       struct mlx5_vf_migration_file *migf,
754 			       struct mlx5_vhca_data_buffer *buf, bool inc,
755 			       bool track)
756 {
757 	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
758 	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
759 	struct mlx5_vhca_data_buffer *header_buf = NULL;
760 	struct mlx5vf_async_data *async_data;
761 	bool pre_copy_cleanup = false;
762 	int err;
763 
764 	lockdep_assert_held(&mvdev->state_mutex);
765 	if (mvdev->mdev_detach)
766 		return -ENOTCONN;
767 
768 	err = wait_for_completion_interruptible(&migf->save_comp);
769 	if (err)
770 		return err;
771 
772 	if ((migf->state == MLX5_MIGF_STATE_PRE_COPY ||
773 	     migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) && !track && !inc)
774 		pre_copy_cleanup = true;
775 
776 	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
777 		/*
778 		 * In case we had a PRE_COPY error, SAVE is triggered only for
779 		 * the final image, so read the device's full image.
780 		 */
781 		inc = false;
782 
783 	MLX5_SET(save_vhca_state_in, in, opcode,
784 		 MLX5_CMD_OP_SAVE_VHCA_STATE);
785 	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
786 	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
787 	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
788 	MLX5_SET(save_vhca_state_in, in, size, buf->npages * PAGE_SIZE);
789 	MLX5_SET(save_vhca_state_in, in, incremental, inc);
790 	MLX5_SET(save_vhca_state_in, in, set_track, track);
791 
792 	async_data = &migf->async_data;
793 	async_data->buf = buf;
794 	async_data->stop_copy_chunk = (!track && !pre_copy_cleanup);
795 	async_data->out = kvzalloc(out_size, GFP_KERNEL);
796 	if (!async_data->out) {
797 		err = -ENOMEM;
798 		goto err_out;
799 	}
800 
801 	if (async_data->stop_copy_chunk) {
802 		u8 header_idx = buf->stop_copy_chunk_num ?
803 			buf->stop_copy_chunk_num - 1 : 0;
804 
805 		header_buf = migf->buf_header[header_idx];
806 		migf->buf_header[header_idx] = NULL;
807 	}
808 
809 	if (!header_buf) {
810 		header_buf = mlx5vf_get_data_buffer(
811 			migf,
812 			DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
813 				     PAGE_SIZE),
814 			DMA_NONE);
815 		if (IS_ERR(header_buf)) {
816 			err = PTR_ERR(header_buf);
817 			goto err_free;
818 		}
819 	}
820 
821 	if (async_data->stop_copy_chunk)
822 		migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK;
823 
824 	async_data->header_buf = header_buf;
825 	get_file(migf->filp);
826 	migf->inflight_save = 1;
827 	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
828 			       async_data->out,
829 			       out_size, mlx5vf_save_callback,
830 			       &async_data->cb_work);
831 	if (err)
832 		goto err_exec;
833 
834 	return 0;
835 
836 err_exec:
837 	migf->inflight_save = 0;
838 	wake_up_interruptible(&migf->poll_wait);
839 	if (header_buf)
840 		mlx5vf_put_data_buffer(header_buf);
841 	fput(migf->filp);
842 err_free:
843 	kvfree(async_data->out);
844 err_out:
845 	complete(&migf->save_comp);
846 	return err;
847 }
848 
849 int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
850 			       struct mlx5_vf_migration_file *migf,
851 			       struct mlx5_vhca_data_buffer *buf)
852 {
853 	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
854 	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
855 	int err;
856 
857 	lockdep_assert_held(&mvdev->state_mutex);
858 	if (mvdev->mdev_detach)
859 		return -ENOTCONN;
860 
861 	if (!buf->mkey_in) {
862 		err = mlx5vf_dma_data_buffer(buf);
863 		if (err)
864 			return err;
865 	}
866 
867 	MLX5_SET(load_vhca_state_in, in, opcode,
868 		 MLX5_CMD_OP_LOAD_VHCA_STATE);
869 	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
870 	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
871 	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
872 	MLX5_SET(load_vhca_state_in, in, size, buf->length);
873 	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
874 }
875 
876 int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
877 {
878 	int err;
879 
880 	lockdep_assert_held(&migf->mvdev->state_mutex);
881 	if (migf->mvdev->mdev_detach)
882 		return -ENOTCONN;
883 
884 	err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
885 	return err;
886 }
887 
888 void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
889 {
890 	lockdep_assert_held(&migf->mvdev->state_mutex);
891 	if (migf->mvdev->mdev_detach)
892 		return;
893 
894 	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
895 }
896 
897 void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
898 {
899 	struct mlx5_vhca_data_buffer *entry;
900 	int i;
901 
902 	lockdep_assert_held(&migf->mvdev->state_mutex);
903 	WARN_ON(migf->mvdev->mdev_detach);
904 
905 	for (i = 0; i < MAX_NUM_CHUNKS; i++) {
906 		if (migf->buf[i]) {
907 			mlx5vf_free_data_buffer(migf->buf[i]);
908 			migf->buf[i] = NULL;
909 		}
910 
911 		if (migf->buf_header[i]) {
912 			mlx5vf_free_data_buffer(migf->buf_header[i]);
913 			migf->buf_header[i] = NULL;
914 		}
915 	}
916 
917 	list_splice(&migf->avail_list, &migf->buf_list);
918 
919 	while ((entry = list_first_entry_or_null(&migf->buf_list,
920 				struct mlx5_vhca_data_buffer, buf_elm))) {
921 		list_del(&entry->buf_elm);
922 		mlx5vf_free_data_buffer(entry);
923 	}
924 
925 	mlx5vf_cmd_dealloc_pd(migf);
926 }
927 
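/*
 * Create the firmware dirty-page tracker object over the given IOVA ranges.
 * Ranges are combined first if they exceed the device limit; dirty-page
 * reports are then delivered through the QP pair set up by the caller.
 */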
928 static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
929 				 struct mlx5vf_pci_core_device *mvdev,
930 				 struct rb_root_cached *ranges, u32 nnodes)
931 {
932 	int max_num_range =
933 		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
934 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
935 	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
936 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
937 	struct interval_tree_node *node = NULL;
938 	u64 total_ranges_len = 0;
939 	u32 num_ranges = nnodes;
940 	u8 log_addr_space_size;
941 	void *range_list_ptr;
942 	void *obj_context;
943 	void *cmd_hdr;
944 	int inlen;
945 	void *in;
946 	int err;
947 	int i;
948 
949 	if (num_ranges > max_num_range) {
950 		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
951 		num_ranges = max_num_range;
952 	}
953 
954 	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
955 				 record_size * num_ranges;
956 	in = kzalloc(inlen, GFP_KERNEL);
957 	if (!in)
958 		return -ENOMEM;
959 
960 	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
961 			       general_obj_in_cmd_hdr);
962 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
963 		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
964 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
965 		 MLX5_OBJ_TYPE_PAGE_TRACK);
966 	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
967 	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
968 	MLX5_SET(page_track, obj_context, track_type, 1);
969 	MLX5_SET(page_track, obj_context, log_page_size,
970 		 ilog2(tracker->host_qp->tracked_page_size));
971 	MLX5_SET(page_track, obj_context, log_msg_size,
972 		 ilog2(tracker->host_qp->max_msg_size));
973 	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
974 	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);
975 
976 	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
977 	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
978 	for (i = 0; i < num_ranges; i++) {
979 		void *addr_range_i_base = range_list_ptr + record_size * i;
980 		unsigned long length = node->last - node->start + 1;
981 
982 		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
983 			   node->start);
984 		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
985 		total_ranges_len += length;
986 		node = interval_tree_iter_next(node, 0, ULONG_MAX);
987 	}
988 
989 	WARN_ON(node);
990 	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
991 	if (log_addr_space_size <
992 	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
993 	    log_addr_space_size >
994 	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
995 		err = -EOPNOTSUPP;
996 		goto out;
997 	}
998 
999 	MLX5_SET(page_track, obj_context, log_addr_space_size,
1000 		 log_addr_space_size);
1001 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
1002 	if (err)
1003 		goto out;
1004 
1005 	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
1006 out:
1007 	kfree(in);
1008 	return err;
1009 }
1010 
1011 static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
1012 				      u32 tracker_id)
1013 {
1014 	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
1015 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
1016 
1017 	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
1018 	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
1019 	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
1020 
1021 	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
1022 }
1023 
1024 static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
1025 				     u32 tracker_id, unsigned long iova,
1026 				     unsigned long length, u32 tracker_state)
1027 {
1028 	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
1029 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
1030 	void *obj_context;
1031 	void *cmd_hdr;
1032 
1033 	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
1034 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1035 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
1036 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);
1037 
1038 	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
1039 	MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
1040 	MLX5_SET64(page_track, obj_context, range_start_address, iova);
1041 	MLX5_SET64(page_track, obj_context, length, length);
1042 	MLX5_SET(page_track, obj_context, state, tracker_state);
1043 
1044 	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
1045 }
1046 
1047 static int mlx5vf_cmd_query_tracker(struct mlx5_core_dev *mdev,
1048 				    struct mlx5_vhca_page_tracker *tracker)
1049 {
1050 	u32 out[MLX5_ST_SZ_DW(query_page_track_obj_out)] = {};
1051 	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
1052 	void *obj_context;
1053 	void *cmd_hdr;
1054 	int err;
1055 
1056 	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
1057 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1058 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
1059 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker->id);
1060 
1061 	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
1062 	if (err)
1063 		return err;
1064 
1065 	obj_context = MLX5_ADDR_OF(query_page_track_obj_out, out, obj_context);
1066 	tracker->status = MLX5_GET(page_track, obj_context, state);
1067 	return 0;
1068 }
1069 
1070 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
1071 			     struct mlx5_vhca_cq_buf *buf, int nent,
1072 			     int cqe_size)
1073 {
1074 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
1075 	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
1076 	u8 log_wq_sz = ilog2(cqe_size);
1077 	int err;
1078 
1079 	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
1080 				       mdev->priv.numa_node);
1081 	if (err)
1082 		return err;
1083 
1084 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
1085 	buf->cqe_size = cqe_size;
1086 	buf->nent = nent;
1087 	return 0;
1088 }
1089 
1090 static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
1091 {
1092 	struct mlx5_cqe64 *cqe64;
1093 	void *cqe;
1094 	int i;
1095 
1096 	for (i = 0; i < buf->nent; i++) {
1097 		cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
1098 		cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
1099 		cqe64->op_own = MLX5_CQE_INVALID << 4;
1100 	}
1101 }
1102 
1103 static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
1104 			      struct mlx5_vhca_cq *cq)
1105 {
1106 	mlx5_core_destroy_cq(mdev, &cq->mcq);
1107 	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
1108 	mlx5_db_free(mdev, &cq->db);
1109 }
1110 
1111 static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
1112 {
1113 	if (type != MLX5_EVENT_TYPE_CQ_ERROR)
1114 		return;
1115 
1116 	set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
1117 				       tracker.cq.mcq));
1118 }
1119 
1120 static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
1121 				 void *data)
1122 {
1123 	struct mlx5_vhca_page_tracker *tracker =
1124 		mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
1125 	struct mlx5vf_pci_core_device *mvdev = container_of(
1126 		tracker, struct mlx5vf_pci_core_device, tracker);
1127 	struct mlx5_eqe_obj_change *object;
1128 	struct mlx5_eqe *eqe = data;
1129 	u8 event_type = (u8)type;
1130 	u8 queue_type;
1131 	u32 obj_id;
1132 	int qp_num;
1133 
1134 	switch (event_type) {
1135 	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
1136 	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
1137 	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
1138 		queue_type = eqe->data.qp_srq.type;
1139 		if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
1140 			break;
1141 		qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
1142 		if (qp_num != tracker->host_qp->qpn &&
1143 		    qp_num != tracker->fw_qp->qpn)
1144 			break;
1145 		set_tracker_error(mvdev);
1146 		break;
1147 	case MLX5_EVENT_TYPE_OBJECT_CHANGE:
1148 		object = &eqe->data.obj_change;
1149 		obj_id = be32_to_cpu(object->obj_id);
1150 		if (obj_id == tracker->id)
1151 			set_tracker_change_event(mvdev);
1152 		break;
1153 	default:
1154 		break;
1155 	}
1156 
1157 	return NOTIFY_OK;
1158 }
1159 
1160 static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
1161 			       struct mlx5_eqe *eqe)
1162 {
1163 	struct mlx5vf_pci_core_device *mvdev =
1164 		container_of(mcq, struct mlx5vf_pci_core_device,
1165 			     tracker.cq.mcq);
1166 
1167 	complete(&mvdev->tracker_comp);
1168 }
1169 
1170 static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
1171 			    struct mlx5_vhca_page_tracker *tracker,
1172 			    size_t ncqe)
1173 {
1174 	int cqe_size = cache_line_size() == 128 ? 128 : 64;
1175 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
1176 	struct mlx5_vhca_cq *cq;
1177 	int inlen, err, eqn;
1178 	void *cqc, *in;
1179 	__be64 *pas;
1180 	int vector;
1181 
1182 	cq = &tracker->cq;
1183 	ncqe = roundup_pow_of_two(ncqe);
1184 	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
1185 	if (err)
1186 		return err;
1187 
1188 	cq->ncqe = ncqe;
1189 	cq->mcq.set_ci_db = cq->db.db;
1190 	cq->mcq.arm_db = cq->db.db + 1;
1191 	cq->mcq.cqe_sz = cqe_size;
1192 	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
1193 	if (err)
1194 		goto err_db_free;
1195 
1196 	init_cq_frag_buf(&cq->buf);
1197 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
1198 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
1199 		cq->buf.frag_buf.npages;
1200 	in = kvzalloc(inlen, GFP_KERNEL);
1201 	if (!in) {
1202 		err = -ENOMEM;
1203 		goto err_buff;
1204 	}
1205 
1206 	vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
1207 	err = mlx5_comp_eqn_get(mdev, vector, &eqn);
1208 	if (err)
1209 		goto err_vec;
1210 
1211 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
1212 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
1213 	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
1214 	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
1215 	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
1216 		 MLX5_ADAPTER_PAGE_SHIFT);
1217 	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
1218 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
1219 	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
1220 	cq->mcq.comp = mlx5vf_cq_complete;
1221 	cq->mcq.event = mlx5vf_cq_event;
1222 	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
1223 	if (err)
1224 		goto err_vec;
1225 
1226 	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1227 		    cq->mcq.cons_index);
1228 	kvfree(in);
1229 	return 0;
1230 
1231 err_vec:
1232 	kvfree(in);
1233 err_buff:
1234 	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
1235 err_db_free:
1236 	mlx5_db_free(mdev, &cq->db);
1237 	return err;
1238 }
1239 
1240 static struct mlx5_vhca_qp *
1241 mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
1242 		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
1243 {
1244 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
1245 	struct mlx5_vhca_qp *qp;
1246 	u8 log_rq_stride;
1247 	u8 log_rq_sz;
1248 	void *qpc;
1249 	int inlen;
1250 	void *in;
1251 	int err;
1252 
1253 	qp = kzalloc_obj(*qp, GFP_KERNEL_ACCOUNT);
1254 	if (!qp)
1255 		return ERR_PTR(-ENOMEM);
1256 
1257 	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
1258 	if (err)
1259 		goto err_free;
1260 
1261 	if (max_recv_wr) {
1262 		qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
1263 		log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
1264 		log_rq_sz = ilog2(qp->rq.wqe_cnt);
1265 		err = mlx5_frag_buf_alloc_node(mdev,
1266 			wq_get_byte_sz(log_rq_sz, log_rq_stride),
1267 			&qp->buf, mdev->priv.numa_node);
1268 		if (err)
1269 			goto err_db_free;
1270 		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
1271 	}
1272 
1273 	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
1274 	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
1275 		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
1276 		qp->buf.npages;
1277 	in = kvzalloc(inlen, GFP_KERNEL);
1278 	if (!in) {
1279 		err = -ENOMEM;
1280 		goto err_in;
1281 	}
1282 
1283 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
1284 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
1285 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
1286 	MLX5_SET(qpc, qpc, pd, tracker->pdn);
1287 	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
1288 	MLX5_SET(qpc, qpc, log_page_size,
1289 		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
1290 	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
1291 	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
1292 		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
1293 	MLX5_SET(qpc, qpc, no_sq, 1);
1294 	if (max_recv_wr) {
1295 		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
1296 		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
1297 		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
1298 		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
1299 		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
1300 		mlx5_fill_page_frag_array(&qp->buf,
1301 					  (__be64 *)MLX5_ADDR_OF(create_qp_in,
1302 								 in, pas));
1303 	} else {
1304 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
1305 	}
1306 
1307 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
1308 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
1309 	kvfree(in);
1310 	if (err)
1311 		goto err_in;
1312 
1313 	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
1314 	return qp;
1315 
1316 err_in:
1317 	if (max_recv_wr)
1318 		mlx5_frag_buf_free(mdev, &qp->buf);
1319 err_db_free:
1320 	mlx5_db_free(mdev, &qp->db);
1321 err_free:
1322 	kfree(qp);
1323 	return ERR_PTR(err);
1324 }
1325 
1326 static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
1327 {
1328 	struct mlx5_wqe_data_seg *data;
1329 	unsigned int ix;
1330 
1331 	WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
1332 	ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
1333 	data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
1334 	data->byte_count = cpu_to_be32(qp->max_msg_size);
1335 	data->lkey = cpu_to_be32(qp->recv_buf.mkey);
1336 	data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
1337 	qp->rq.pc++;
1338 	/* Make sure that descriptors are written before doorbell record. */
1339 	dma_wmb();
1340 	*qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
1341 }
1342 
1343 static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
1344 			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
1345 			      bool host_qp)
1346 {
1347 	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
1348 	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
1349 	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
1350 	void *qpc;
1351 	int ret;
1352 
1353 	/* Init */
1354 	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
1355 	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1356 	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
1357 	MLX5_SET(qpc, qpc, rre, 1);
1358 	MLX5_SET(qpc, qpc, rwe, 1);
1359 	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
1360 	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
1361 	ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
1362 	if (ret)
1363 		return ret;
1364 
1365 	if (host_qp) {
1366 		struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1367 		int i;
1368 
1369 		for (i = 0; i < qp->rq.wqe_cnt; i++) {
1370 			mlx5vf_post_recv(qp);
1371 			recv_buf->next_rq_offset += qp->max_msg_size;
1372 		}
1373 	}
1374 
1375 	/* RTR */
1376 	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
1377 	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1378 	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
1379 	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
1380 	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
1381 	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1382 	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
1383 	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
1384 	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
1385 	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1386 	ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
1387 	if (ret || host_qp)
1388 		return ret;
1389 
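	/*
	 * The host QP only receives, so it is left in RTR; only the FW QP is
	 * moved to RTS below so that it can send dirty-page reports.
	 */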
1390 	/* RTS */
1391 	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
1392 	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1393 	MLX5_SET(qpc, qpc, retry_count, 7);
1394 	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
1395 	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
1396 	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
1397 	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1398 
1399 	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
1400 }
1401 
1402 static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
1403 			      struct mlx5_vhca_qp *qp)
1404 {
1405 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
1406 
1407 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
1408 	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
1409 	mlx5_cmd_exec_in(mdev, destroy_qp, in);
1410 
1411 	mlx5_frag_buf_free(mdev, &qp->buf);
1412 	mlx5_db_free(mdev, &qp->db);
1413 	kfree(qp);
1414 }
1415 
1416 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
1417 					  struct mlx5_vhca_qp *qp)
1418 {
1419 	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1420 
1421 	mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
1422 	unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in,
1423 			     &recv_buf->state, DMA_FROM_DEVICE);
1424 	kvfree(recv_buf->mkey_in);
1425 	free_page_list(recv_buf->npages, recv_buf->page_list);
1426 }
1427 
1428 static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
1429 					  struct mlx5_vhca_qp *qp, u32 pdn,
1430 					  u64 rq_size)
1431 {
1432 	unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
1433 	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1434 	int err;
1435 
1436 	err = mlx5vf_add_pages(&recv_buf->page_list, npages);
1437 	if (err)
1438 		return err;
1439 
1440 	recv_buf->npages = npages;
1441 
1442 	recv_buf->mkey_in = alloc_mkey_in(npages, pdn);
1443 	if (!recv_buf->mkey_in) {
1444 		err = -ENOMEM;
1445 		goto end;
1446 	}
1447 
1448 	err = register_dma_pages(mdev, npages, recv_buf->page_list,
1449 				 recv_buf->mkey_in, &recv_buf->state,
1450 				 DMA_FROM_DEVICE);
1451 	if (err)
1452 		goto err_register_dma;
1453 
1454 	err = create_mkey(mdev, npages, recv_buf->mkey_in, &recv_buf->mkey);
1455 	if (err)
1456 		goto err_create_mkey;
1457 
1458 	return 0;
1459 
1460 err_create_mkey:
1461 	unregister_dma_pages(mdev, npages, recv_buf->mkey_in, &recv_buf->state,
1462 			     DMA_FROM_DEVICE);
1463 err_register_dma:
1464 	kvfree(recv_buf->mkey_in);
1465 	recv_buf->mkey_in = NULL;
1466 end:
1467 	free_page_list(npages, recv_buf->page_list);
1468 	return err;
1469 }
1470 
1471 static void
1472 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
1473 {
1474 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1475 	struct mlx5_core_dev *mdev = mvdev->mdev;
1476 
1477 	lockdep_assert_held(&mvdev->state_mutex);
1478 
1479 	if (!mvdev->log_active)
1480 		return;
1481 
1482 	WARN_ON(mvdev->mdev_detach);
1483 
1484 	mlx5_eq_notifier_unregister(mdev, &tracker->nb);
1485 	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
1486 	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
1487 	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
1488 	mlx5vf_destroy_qp(mdev, tracker->host_qp);
1489 	mlx5vf_destroy_cq(mdev, &tracker->cq);
1490 	mlx5_core_dealloc_pd(mdev, tracker->pdn);
1491 	mlx5_put_uars_page(mdev, tracker->uar);
1492 	mvdev->log_active = false;
1493 }
1494 
1495 int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
1496 {
1497 	struct mlx5vf_pci_core_device *mvdev = container_of(
1498 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1499 
1500 	mutex_lock(&mvdev->state_mutex);
1501 	if (!mvdev->log_active)
1502 		goto end;
1503 
1504 	_mlx5vf_free_page_tracker_resources(mvdev);
1505 	mvdev->log_active = false;
1506 end:
1507 	mlx5vf_state_mutex_unlock(mvdev);
1508 	return 0;
1509 }
1510 
1511 int mlx5vf_start_page_tracker(struct vfio_device *vdev,
1512 			      struct rb_root_cached *ranges, u32 nnodes,
1513 			      u64 *page_size)
1514 {
1515 	struct mlx5vf_pci_core_device *mvdev = container_of(
1516 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1517 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1518 	u8 log_tracked_page = ilog2(*page_size);
1519 	struct mlx5_vhca_qp *host_qp;
1520 	struct mlx5_vhca_qp *fw_qp;
1521 	struct mlx5_core_dev *mdev;
1522 	u32 log_max_msg_size;
1523 	u32 max_msg_size;
1524 	u64 rq_size = SZ_2M;
1525 	u32 max_recv_wr;
1526 	int err;
1527 
1528 	mutex_lock(&mvdev->state_mutex);
1529 	if (mvdev->mdev_detach) {
1530 		err = -ENOTCONN;
1531 		goto end;
1532 	}
1533 
1534 	if (mvdev->log_active) {
1535 		err = -EINVAL;
1536 		goto end;
1537 	}
1538 
1539 	mdev = mvdev->mdev;
1540 	log_max_msg_size = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_msg_size);
1541 	max_msg_size = (1ULL << log_max_msg_size);
1542 	/* The RQ must hold at least 4 WQEs/messages for successful QP creation */
1543 	if (rq_size < 4ULL * max_msg_size)
1544 		rq_size = 4ULL * max_msg_size;
1545 
1546 	memset(tracker, 0, sizeof(*tracker));
1547 	tracker->uar = mlx5_get_uars_page(mdev);
1548 	if (IS_ERR(tracker->uar)) {
1549 		err = PTR_ERR(tracker->uar);
1550 		goto end;
1551 	}
1552 
1553 	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
1554 	if (err)
1555 		goto err_uar;
1556 
1557 	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
1558 	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
1559 	if (err)
1560 		goto err_dealloc_pd;
1561 
1562 	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
1563 	if (IS_ERR(host_qp)) {
1564 		err = PTR_ERR(host_qp);
1565 		goto err_cq;
1566 	}
1567 
1568 	host_qp->max_msg_size = max_msg_size;
1569 	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1570 				pg_track_log_min_page_size)) {
1571 		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1572 				pg_track_log_min_page_size);
1573 	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1574 				pg_track_log_max_page_size)) {
1575 		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1576 				pg_track_log_max_page_size);
1577 	}
1578 
1579 	host_qp->tracked_page_size = (1ULL << log_tracked_page);
1580 	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
1581 					     rq_size);
1582 	if (err)
1583 		goto err_host_qp;
1584 
1585 	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
1586 	if (IS_ERR(fw_qp)) {
1587 		err = PTR_ERR(fw_qp);
1588 		goto err_recv_resources;
1589 	}
1590 
1591 	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
1592 	if (err)
1593 		goto err_activate;
1594 
1595 	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
1596 	if (err)
1597 		goto err_activate;
1598 
1599 	tracker->host_qp = host_qp;
1600 	tracker->fw_qp = fw_qp;
1601 	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
1602 	if (err)
1603 		goto err_activate;
1604 
1605 	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
1606 	mlx5_eq_notifier_register(mdev, &tracker->nb);
1607 	*page_size = host_qp->tracked_page_size;
1608 	mvdev->log_active = true;
1609 	mlx5vf_state_mutex_unlock(mvdev);
1610 	return 0;
1611 
1612 err_activate:
1613 	mlx5vf_destroy_qp(mdev, fw_qp);
1614 err_recv_resources:
1615 	mlx5vf_free_qp_recv_resources(mdev, host_qp);
1616 err_host_qp:
1617 	mlx5vf_destroy_qp(mdev, host_qp);
1618 err_cq:
1619 	mlx5vf_destroy_cq(mdev, &tracker->cq);
1620 err_dealloc_pd:
1621 	mlx5_core_dealloc_pd(mdev, tracker->pdn);
1622 err_uar:
1623 	mlx5_put_uars_page(mdev, tracker->uar);
1624 end:
1625 	mlx5vf_state_mutex_unlock(mvdev);
1626 	return err;
1627 }
1628 
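/*
 * Parse one dirty-page report message out of the receive buffer (which may
 * span several pages, hence the per-page kmap) and mark every reported
 * address in the IOVA bitmap.
 */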
1629 static void
1630 set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
1631 		  struct iova_bitmap *dirty)
1632 {
1633 	u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
1634 	u32 nent = size / entry_size;
1635 	u32 nent_in_page;
1636 	u32 nent_to_set;
1637 	struct page *page;
1638 	u32 page_offset;
1639 	u32 page_index;
1640 	u32 buf_offset;
1641 	void *kaddr;
1642 	u64 addr;
1643 	u64 *buf;
1644 	int i;
1645 
1646 	buf_offset = index * qp->max_msg_size;
1647 	if (WARN_ON(buf_offset + size >= qp->recv_buf.npages * PAGE_SIZE ||
1648 		    (nent > qp->max_msg_size / entry_size)))
1649 		return;
1650 
1651 	do {
1652 		page_index = buf_offset / PAGE_SIZE;
1653 		page_offset = buf_offset % PAGE_SIZE;
1654 		nent_in_page = (PAGE_SIZE - page_offset) / entry_size;
1655 		page = qp->recv_buf.page_list[page_index];
1656 		kaddr = kmap_local_page(page);
1657 		buf = kaddr + page_offset;
1658 		nent_to_set = min(nent, nent_in_page);
1659 		for (i = 0; i < nent_to_set; i++) {
1660 			addr = MLX5_GET(page_track_report_entry, buf + i,
1661 					dirty_address_low);
1662 			addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
1663 					      dirty_address_high) << 32;
1664 			iova_bitmap_set(dirty, addr, qp->tracked_page_size);
1665 		}
1666 		kunmap_local(kaddr);
1667 		buf_offset += (nent_to_set * entry_size);
1668 		nent -= nent_to_set;
1669 	} while (nent);
1670 }
1671 
1672 static void
1673 mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
1674 	      struct iova_bitmap *dirty, int *tracker_status)
1675 {
1676 	u32 size;
1677 	int ix;
1678 
1679 	qp->rq.cc++;
1680 	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
1681 	size = be32_to_cpu(cqe->byte_cnt);
1682 	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);
1683 
1684 	/* zero length CQE, no data */
1685 	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
1686 	if (size)
1687 		set_report_output(size, ix, qp, dirty);
1688 
1689 	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
1690 	mlx5vf_post_recv(qp);
1691 }
1692 
1693 static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
1694 {
1695 	return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
1696 }
1697 
1698 static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
1699 {
1700 	void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
1701 	struct mlx5_cqe64 *cqe64;
1702 
1703 	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
1704 
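	/*
	 * A CQE belongs to software only if it has a valid opcode and its
	 * ownership bit matches the current pass over the CQ ring.
	 */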
1705 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
1706 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
1707 		return cqe64;
1708 	} else {
1709 		return NULL;
1710 	}
1711 }
1712 
1713 static int
1714 mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
1715 		   struct iova_bitmap *dirty, int *tracker_status)
1716 {
1717 	struct mlx5_cqe64 *cqe;
1718 	u8 opcode;
1719 
1720 	cqe = get_sw_cqe(cq, cq->mcq.cons_index);
1721 	if (!cqe)
1722 		return CQ_EMPTY;
1723 
1724 	++cq->mcq.cons_index;
1725 	/*
1726 	 * Make sure we read CQ entry contents after we've checked the
1727 	 * ownership bit.
1728 	 */
1729 	rmb();
1730 	opcode = get_cqe_opcode(cqe);
1731 	switch (opcode) {
1732 	case MLX5_CQE_RESP_SEND_IMM:
1733 		mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
1734 		return CQ_OK;
1735 	default:
1736 		return CQ_POLL_ERR;
1737 	}
1738 }
1739 
1740 int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
1741 				  unsigned long length,
1742 				  struct iova_bitmap *dirty)
1743 {
1744 	struct mlx5vf_pci_core_device *mvdev = container_of(
1745 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1746 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1747 	struct mlx5_vhca_cq *cq = &tracker->cq;
1748 	struct mlx5_core_dev *mdev;
1749 	int poll_err, err;
1750 
1751 	mutex_lock(&mvdev->state_mutex);
1752 	if (!mvdev->log_active) {
1753 		err = -EINVAL;
1754 		goto end;
1755 	}
1756 
1757 	if (mvdev->mdev_detach) {
1758 		err = -ENOTCONN;
1759 		goto end;
1760 	}
1761 
1762 	if (tracker->is_err) {
1763 		err = -EIO;
1764 		goto end;
1765 	}
1766 
1767 	mdev = mvdev->mdev;
1768 	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
1769 					MLX5_PAGE_TRACK_STATE_REPORTING);
1770 	if (err)
1771 		goto end;
1772 
1773 	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
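	/*
	 * Drain dirty-page report CQEs until the tracker leaves the REPORTING
	 * state. When the CQ is empty, arm it and sleep until the next
	 * completion or until a tracker state/object-change event arrives.
	 */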
1774 	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
1775 	       !tracker->is_err) {
1776 		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
1777 					      &tracker->status);
1778 		if (poll_err == CQ_EMPTY) {
1779 			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1780 				    cq->mcq.cons_index);
1781 			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
1782 						      dirty, &tracker->status);
1783 			if (poll_err == CQ_EMPTY) {
1784 				wait_for_completion(&mvdev->tracker_comp);
1785 				if (tracker->object_changed) {
1786 					tracker->object_changed = false;
1787 					err = mlx5vf_cmd_query_tracker(mdev, tracker);
1788 					if (err)
1789 						goto end;
1790 				}
1791 				continue;
1792 			}
1793 		}
1794 		if (poll_err == CQ_POLL_ERR) {
1795 			err = -EIO;
1796 			goto end;
1797 		}
1798 		mlx5_cq_set_ci(&cq->mcq);
1799 	}
1800 
1801 	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
1802 		tracker->is_err = true;
1803 
1804 	if (tracker->is_err)
1805 		err = -EIO;
1806 end:
1807 	mlx5vf_state_mutex_unlock(mvdev);
1808 	return err;
1809 }
1810