xref: /linux/drivers/vfio/pci/mlx5/cmd.c (revision 5cd2340cb6a383d04fd88e48fabc2a21a909d6a1)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5 
6 #include "cmd.h"
7 
/*
 * CQ status codes. The names suggest they are poll results; their users are
 * not in this part of the file.
 */
enum {
	CQ_OK = 0,
	CQ_EMPTY = -1,
	CQ_POLL_ERR = -2,
};
9 
10 static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
11 {
12 	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
13 	void *query_cap = NULL, *cap;
14 	int ret;
15 
16 	query_cap = kzalloc(query_sz, GFP_KERNEL);
17 	if (!query_cap)
18 		return -ENOMEM;
19 
20 	ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
21 					    MLX5_CAP_GENERAL_2);
22 	if (ret)
23 		goto out;
24 
25 	cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
26 	if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
27 		ret = -EOPNOTSUPP;
28 out:
29 	kfree(query_cap);
30 	return ret;
31 }
32 
33 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
34 				  u16 *vhca_id);
35 static void
36 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
37 
38 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
39 {
40 	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
41 	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
42 	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
43 	int err;
44 
45 	lockdep_assert_held(&mvdev->state_mutex);
46 	if (mvdev->mdev_detach)
47 		return -ENOTCONN;
48 
49 	/*
50 	 * In case PRE_COPY is used, saving_migf is exposed while the device is
51 	 * running. Make sure to run only once there is no active save command.
52 	 * Running both in parallel, might end-up with a failure in the save
53 	 * command once it will try to turn on 'tracking' on a suspended device.
54 	 */
55 	if (migf) {
56 		err = wait_for_completion_interruptible(&migf->save_comp);
57 		if (err)
58 			return err;
59 	}
60 
61 	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
62 	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
63 	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
64 
65 	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
66 	if (migf)
67 		complete(&migf->save_comp);
68 
69 	return err;
70 }
71 
72 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
73 {
74 	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
75 	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
76 
77 	lockdep_assert_held(&mvdev->state_mutex);
78 	if (mvdev->mdev_detach)
79 		return -ENOTCONN;
80 
81 	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
82 	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
83 	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);
84 
85 	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
86 }
87 
88 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
89 					  size_t *state_size, u64 *total_size,
90 					  u8 query_flags)
91 {
92 	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
93 	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
94 	bool inc = query_flags & MLX5VF_QUERY_INC;
95 	int ret;
96 
97 	lockdep_assert_held(&mvdev->state_mutex);
98 	if (mvdev->mdev_detach)
99 		return -ENOTCONN;
100 
101 	/*
102 	 * In case PRE_COPY is used, saving_migf is exposed while device is
103 	 * running. Make sure to run only once there is no active save command.
104 	 * Running both in parallel, might end-up with a failure in the
105 	 * incremental query command on un-tracked vhca.
106 	 */
107 	if (inc) {
108 		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
109 		if (ret)
110 			return ret;
111 		/* Upon cleanup, ignore previous pre_copy error state */
112 		if (mvdev->saving_migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR &&
113 		    !(query_flags & MLX5VF_QUERY_CLEANUP)) {
114 			/*
115 			 * In case we had a PRE_COPY error, only query full
116 			 * image for final image
117 			 */
118 			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
119 				*state_size = 0;
120 				complete(&mvdev->saving_migf->save_comp);
121 				return 0;
122 			}
123 			query_flags &= ~MLX5VF_QUERY_INC;
124 		}
125 		/* Block incremental query which is state-dependent */
126 		if (mvdev->saving_migf->state == MLX5_MIGF_STATE_ERROR) {
127 			complete(&mvdev->saving_migf->save_comp);
128 			return -ENODEV;
129 		}
130 	}
131 
132 	MLX5_SET(query_vhca_migration_state_in, in, opcode,
133 		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
134 	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
135 	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
136 	MLX5_SET(query_vhca_migration_state_in, in, incremental,
137 		 query_flags & MLX5VF_QUERY_INC);
138 	MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode);
139 
140 	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
141 				  out);
142 	if (inc)
143 		complete(&mvdev->saving_migf->save_comp);
144 
145 	if (ret)
146 		return ret;
147 
148 	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
149 			       required_umem_size);
150 	if (total_size)
151 		*total_size = mvdev->chunk_mode ?
152 			MLX5_GET64(query_vhca_migration_state_out, out,
153 				   remaining_total_size) : *state_size;
154 
155 	return 0;
156 }
157 
158 static void set_tracker_change_event(struct mlx5vf_pci_core_device *mvdev)
159 {
160 	mvdev->tracker.object_changed = true;
161 	complete(&mvdev->tracker_comp);
162 }
163 
164 static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
165 {
166 	/* Mark the tracker under an error and wake it up if it's running */
167 	mvdev->tracker.is_err = true;
168 	complete(&mvdev->tracker_comp);
169 }
170 
171 static int mlx5fv_vf_event(struct notifier_block *nb,
172 			   unsigned long event, void *data)
173 {
174 	struct mlx5vf_pci_core_device *mvdev =
175 		container_of(nb, struct mlx5vf_pci_core_device, nb);
176 
177 	switch (event) {
178 	case MLX5_PF_NOTIFY_ENABLE_VF:
179 		mutex_lock(&mvdev->state_mutex);
180 		mvdev->mdev_detach = false;
181 		mlx5vf_state_mutex_unlock(mvdev);
182 		break;
183 	case MLX5_PF_NOTIFY_DISABLE_VF:
184 		mlx5vf_cmd_close_migratable(mvdev);
185 		mutex_lock(&mvdev->state_mutex);
186 		mvdev->mdev_detach = true;
187 		mlx5vf_state_mutex_unlock(mvdev);
188 		break;
189 	default:
190 		break;
191 	}
192 
193 	return 0;
194 }
195 
196 void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
197 {
198 	if (!mvdev->migrate_cap)
199 		return;
200 
201 	/* Must be done outside the lock to let it progress */
202 	set_tracker_error(mvdev);
203 	mutex_lock(&mvdev->state_mutex);
204 	mlx5vf_disable_fds(mvdev, NULL);
205 	_mlx5vf_free_page_tracker_resources(mvdev);
206 	mlx5vf_state_mutex_unlock(mvdev);
207 }
208 
209 void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
210 {
211 	if (!mvdev->migrate_cap)
212 		return;
213 
214 	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
215 						&mvdev->nb);
216 	destroy_workqueue(mvdev->cb_wq);
217 }
218 
219 void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
220 			       const struct vfio_migration_ops *mig_ops,
221 			       const struct vfio_log_ops *log_ops)
222 {
223 	struct pci_dev *pdev = mvdev->core_device.pdev;
224 	int ret;
225 
226 	if (!pdev->is_virtfn)
227 		return;
228 
229 	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
230 	if (!mvdev->mdev)
231 		return;
232 
233 	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
234 		goto end;
235 
236 	if (!(MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
237 	      MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)))
238 		goto end;
239 
240 	mvdev->vf_id = pci_iov_vf_id(pdev);
241 	if (mvdev->vf_id < 0)
242 		goto end;
243 
244 	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
245 	if (ret)
246 		goto end;
247 
248 	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
249 				   &mvdev->vhca_id))
250 		goto end;
251 
252 	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
253 	if (!mvdev->cb_wq)
254 		goto end;
255 
256 	mutex_init(&mvdev->state_mutex);
257 	spin_lock_init(&mvdev->reset_lock);
258 	mvdev->nb.notifier_call = mlx5fv_vf_event;
259 	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
260 						    &mvdev->nb);
261 	if (ret) {
262 		destroy_workqueue(mvdev->cb_wq);
263 		goto end;
264 	}
265 
266 	mvdev->migrate_cap = 1;
267 	mvdev->core_device.vdev.migration_flags =
268 		VFIO_MIGRATION_STOP_COPY |
269 		VFIO_MIGRATION_P2P |
270 		VFIO_MIGRATION_PRE_COPY;
271 
272 	mvdev->core_device.vdev.mig_ops = mig_ops;
273 	init_completion(&mvdev->tracker_comp);
274 	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
275 		mvdev->core_device.vdev.log_ops = log_ops;
276 
277 	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
278 		mvdev->chunk_mode = 1;
279 
280 end:
281 	mlx5_vf_put_core_dev(mvdev->mdev);
282 }
283 
284 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
285 				  u16 *vhca_id)
286 {
287 	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
288 	int out_size;
289 	void *out;
290 	int ret;
291 
292 	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
293 	out = kzalloc(out_size, GFP_KERNEL);
294 	if (!out)
295 		return -ENOMEM;
296 
297 	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
298 	MLX5_SET(query_hca_cap_in, in, other_function, 1);
299 	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
300 	MLX5_SET(query_hca_cap_in, in, op_mod,
301 		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
302 		 HCA_CAP_OPMOD_GET_CUR);
303 
304 	ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
305 	if (ret)
306 		goto err_exec;
307 
308 	*vhca_id = MLX5_GET(query_hca_cap_out, out,
309 			    capability.cmd_hca_cap.vhca_id);
310 
311 err_exec:
312 	kfree(out);
313 	return ret;
314 }
315 
316 static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
317 			struct mlx5_vhca_data_buffer *buf,
318 			struct mlx5_vhca_recv_buf *recv_buf,
319 			u32 *mkey)
320 {
321 	size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
322 				recv_buf->npages;
323 	int err = 0, inlen;
324 	__be64 *mtt;
325 	void *mkc;
326 	u32 *in;
327 
328 	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
329 		sizeof(*mtt) * round_up(npages, 2);
330 
331 	in = kvzalloc(inlen, GFP_KERNEL);
332 	if (!in)
333 		return -ENOMEM;
334 
335 	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
336 		 DIV_ROUND_UP(npages, 2));
337 	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
338 
339 	if (buf) {
340 		struct sg_dma_page_iter dma_iter;
341 
342 		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
343 			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
344 	} else {
345 		int i;
346 
347 		for (i = 0; i < npages; i++)
348 			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
349 	}
350 
351 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
352 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
353 	MLX5_SET(mkc, mkc, lr, 1);
354 	MLX5_SET(mkc, mkc, lw, 1);
355 	MLX5_SET(mkc, mkc, rr, 1);
356 	MLX5_SET(mkc, mkc, rw, 1);
357 	MLX5_SET(mkc, mkc, pd, pdn);
358 	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
359 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
360 	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
361 	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
362 	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
363 	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
364 	kvfree(in);
365 	return err;
366 }
367 
368 static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
369 {
370 	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
371 	struct mlx5_core_dev *mdev = mvdev->mdev;
372 	int ret;
373 
374 	lockdep_assert_held(&mvdev->state_mutex);
375 	if (mvdev->mdev_detach)
376 		return -ENOTCONN;
377 
378 	if (buf->dmaed || !buf->allocated_length)
379 		return -EINVAL;
380 
381 	ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
382 	if (ret)
383 		return ret;
384 
385 	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
386 	if (ret)
387 		goto err;
388 
389 	buf->dmaed = true;
390 
391 	return 0;
392 err:
393 	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
394 	return ret;
395 }
396 
397 void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
398 {
399 	struct mlx5_vf_migration_file *migf = buf->migf;
400 	struct sg_page_iter sg_iter;
401 
402 	lockdep_assert_held(&migf->mvdev->state_mutex);
403 	WARN_ON(migf->mvdev->mdev_detach);
404 
405 	if (buf->dmaed) {
406 		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
407 		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
408 				  buf->dma_dir, 0);
409 	}
410 
411 	/* Undo alloc_pages_bulk_array() */
412 	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
413 		__free_page(sg_page_iter_page(&sg_iter));
414 	sg_free_append_table(&buf->table);
415 	kfree(buf);
416 }
417 
418 static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
419 				      unsigned int npages)
420 {
421 	unsigned int to_alloc = npages;
422 	struct page **page_list;
423 	unsigned long filled;
424 	unsigned int to_fill;
425 	int ret;
426 
427 	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
428 	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
429 	if (!page_list)
430 		return -ENOMEM;
431 
432 	do {
433 		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
434 						page_list);
435 		if (!filled) {
436 			ret = -ENOMEM;
437 			goto err;
438 		}
439 		to_alloc -= filled;
440 		ret = sg_alloc_append_table_from_pages(
441 			&buf->table, page_list, filled, 0,
442 			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
443 			GFP_KERNEL_ACCOUNT);
444 
445 		if (ret)
446 			goto err;
447 		buf->allocated_length += filled * PAGE_SIZE;
448 		/* clean input for another bulk allocation */
449 		memset(page_list, 0, filled * sizeof(*page_list));
450 		to_fill = min_t(unsigned int, to_alloc,
451 				PAGE_SIZE / sizeof(*page_list));
452 	} while (to_alloc > 0);
453 
454 	kvfree(page_list);
455 	return 0;
456 
457 err:
458 	kvfree(page_list);
459 	return ret;
460 }
461 
462 struct mlx5_vhca_data_buffer *
463 mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
464 			 size_t length,
465 			 enum dma_data_direction dma_dir)
466 {
467 	struct mlx5_vhca_data_buffer *buf;
468 	int ret;
469 
470 	buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
471 	if (!buf)
472 		return ERR_PTR(-ENOMEM);
473 
474 	buf->dma_dir = dma_dir;
475 	buf->migf = migf;
476 	if (length) {
477 		ret = mlx5vf_add_migration_pages(buf,
478 				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
479 		if (ret)
480 			goto end;
481 
482 		if (dma_dir != DMA_NONE) {
483 			ret = mlx5vf_dma_data_buffer(buf);
484 			if (ret)
485 				goto end;
486 		}
487 	}
488 
489 	return buf;
490 end:
491 	mlx5vf_free_data_buffer(buf);
492 	return ERR_PTR(ret);
493 }
494 
495 void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
496 {
497 	spin_lock_irq(&buf->migf->list_lock);
498 	buf->stop_copy_chunk_num = 0;
499 	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
500 	spin_unlock_irq(&buf->migf->list_lock);
501 }
502 
503 struct mlx5_vhca_data_buffer *
504 mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
505 		       size_t length, enum dma_data_direction dma_dir)
506 {
507 	struct mlx5_vhca_data_buffer *buf, *temp_buf;
508 	struct list_head free_list;
509 
510 	lockdep_assert_held(&migf->mvdev->state_mutex);
511 	if (migf->mvdev->mdev_detach)
512 		return ERR_PTR(-ENOTCONN);
513 
514 	INIT_LIST_HEAD(&free_list);
515 
516 	spin_lock_irq(&migf->list_lock);
517 	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
518 		if (buf->dma_dir == dma_dir) {
519 			list_del_init(&buf->buf_elm);
520 			if (buf->allocated_length >= length) {
521 				spin_unlock_irq(&migf->list_lock);
522 				goto found;
523 			}
524 			/*
525 			 * Prevent holding redundant buffers. Put in a free
526 			 * list and call at the end not under the spin lock
527 			 * (&migf->list_lock) to mlx5vf_free_data_buffer which
528 			 * might sleep.
529 			 */
530 			list_add(&buf->buf_elm, &free_list);
531 		}
532 	}
533 	spin_unlock_irq(&migf->list_lock);
534 	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
535 
536 found:
537 	while ((temp_buf = list_first_entry_or_null(&free_list,
538 				struct mlx5_vhca_data_buffer, buf_elm))) {
539 		list_del(&temp_buf->buf_elm);
540 		mlx5vf_free_data_buffer(temp_buf);
541 	}
542 
543 	return buf;
544 }
545 
546 static void
547 mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
548 			      struct mlx5vf_async_data *async_data)
549 {
550 	kvfree(async_data->out);
551 	complete(&migf->save_comp);
552 	fput(migf->filp);
553 }
554 
555 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
556 {
557 	struct mlx5vf_async_data *async_data = container_of(_work,
558 		struct mlx5vf_async_data, work);
559 	struct mlx5_vf_migration_file *migf = container_of(async_data,
560 		struct mlx5_vf_migration_file, async_data);
561 
562 	mutex_lock(&migf->lock);
563 	if (async_data->status) {
564 		mlx5vf_put_data_buffer(async_data->buf);
565 		if (async_data->header_buf)
566 			mlx5vf_put_data_buffer(async_data->header_buf);
567 		if (!async_data->stop_copy_chunk &&
568 		    async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
569 			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
570 		else
571 			migf->state = MLX5_MIGF_STATE_ERROR;
572 		wake_up_interruptible(&migf->poll_wait);
573 	}
574 	mutex_unlock(&migf->lock);
575 	mlx5vf_save_callback_complete(migf, async_data);
576 }
577 
578 static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
579 			  size_t image_size, bool initial_pre_copy)
580 {
581 	struct mlx5_vf_migration_file *migf = header_buf->migf;
582 	struct mlx5_vf_migration_header header = {};
583 	unsigned long flags;
584 	struct page *page;
585 	u8 *to_buff;
586 
587 	header.record_size = cpu_to_le64(image_size);
588 	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
589 	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
590 	page = mlx5vf_get_migration_page(header_buf, 0);
591 	if (!page)
592 		return -EINVAL;
593 	to_buff = kmap_local_page(page);
594 	memcpy(to_buff, &header, sizeof(header));
595 	kunmap_local(to_buff);
596 	header_buf->length = sizeof(header);
597 	header_buf->start_pos = header_buf->migf->max_pos;
598 	migf->max_pos += header_buf->length;
599 	spin_lock_irqsave(&migf->list_lock, flags);
600 	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
601 	spin_unlock_irqrestore(&migf->list_lock, flags);
602 	if (initial_pre_copy)
603 		migf->pre_copy_initial_bytes += sizeof(header);
604 	return 0;
605 }
606 
607 static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
608 {
609 	struct mlx5vf_async_data *async_data = container_of(context,
610 			struct mlx5vf_async_data, cb_work);
611 	struct mlx5_vf_migration_file *migf = container_of(async_data,
612 			struct mlx5_vf_migration_file, async_data);
613 
614 	if (!status) {
615 		size_t next_required_umem_size = 0;
616 		bool stop_copy_last_chunk;
617 		size_t image_size;
618 		unsigned long flags;
619 		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
620 				!async_data->stop_copy_chunk;
621 
622 		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
623 				      actual_image_size);
624 		if (async_data->buf->stop_copy_chunk_num)
625 			next_required_umem_size = MLX5_GET(save_vhca_state_out,
626 					async_data->out, next_required_umem_size);
627 		stop_copy_last_chunk = async_data->stop_copy_chunk &&
628 				!next_required_umem_size;
629 		if (async_data->header_buf) {
630 			status = add_buf_header(async_data->header_buf, image_size,
631 						initial_pre_copy);
632 			if (status)
633 				goto err;
634 		}
635 		async_data->buf->length = image_size;
636 		async_data->buf->start_pos = migf->max_pos;
637 		migf->max_pos += async_data->buf->length;
638 		spin_lock_irqsave(&migf->list_lock, flags);
639 		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
640 		if (async_data->buf->stop_copy_chunk_num) {
641 			migf->num_ready_chunks++;
642 			if (next_required_umem_size &&
643 			    migf->num_ready_chunks >= MAX_NUM_CHUNKS) {
644 				/* Delay the next SAVE till one chunk be consumed */
645 				migf->next_required_umem_size = next_required_umem_size;
646 				next_required_umem_size = 0;
647 			}
648 		}
649 		spin_unlock_irqrestore(&migf->list_lock, flags);
650 		if (initial_pre_copy) {
651 			migf->pre_copy_initial_bytes += image_size;
652 			migf->state = MLX5_MIGF_STATE_PRE_COPY;
653 		}
654 		if (stop_copy_last_chunk)
655 			migf->state = MLX5_MIGF_STATE_COMPLETE;
656 		wake_up_interruptible(&migf->poll_wait);
657 		if (next_required_umem_size)
658 			mlx5vf_mig_file_set_save_work(migf,
659 				/* Picking up the next chunk num */
660 				(async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1,
661 				next_required_umem_size);
662 		mlx5vf_save_callback_complete(migf, async_data);
663 		return;
664 	}
665 
666 err:
667 	/* The error flow can't run from an interrupt context */
668 	if (status == -EREMOTEIO) {
669 		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
670 		/* Failed in FW, print cmd out failure details */
671 		mlx5_cmd_out_err(migf->mvdev->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE, 0,
672 				 async_data->out);
673 	}
674 
675 	async_data->status = status;
676 	queue_work(migf->mvdev->cb_wq, &async_data->work);
677 }
678 
679 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
680 			       struct mlx5_vf_migration_file *migf,
681 			       struct mlx5_vhca_data_buffer *buf, bool inc,
682 			       bool track)
683 {
684 	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
685 	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
686 	struct mlx5_vhca_data_buffer *header_buf = NULL;
687 	struct mlx5vf_async_data *async_data;
688 	bool pre_copy_cleanup = false;
689 	int err;
690 
691 	lockdep_assert_held(&mvdev->state_mutex);
692 	if (mvdev->mdev_detach)
693 		return -ENOTCONN;
694 
695 	err = wait_for_completion_interruptible(&migf->save_comp);
696 	if (err)
697 		return err;
698 
699 	if ((migf->state == MLX5_MIGF_STATE_PRE_COPY ||
700 	     migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) && !track && !inc)
701 		pre_copy_cleanup = true;
702 
703 	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
704 		/*
705 		 * In case we had a PRE_COPY error, SAVE is triggered only for
706 		 * the final image, read device full image.
707 		 */
708 		inc = false;
709 
710 	MLX5_SET(save_vhca_state_in, in, opcode,
711 		 MLX5_CMD_OP_SAVE_VHCA_STATE);
712 	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
713 	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
714 	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
715 	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
716 	MLX5_SET(save_vhca_state_in, in, incremental, inc);
717 	MLX5_SET(save_vhca_state_in, in, set_track, track);
718 
719 	async_data = &migf->async_data;
720 	async_data->buf = buf;
721 	async_data->stop_copy_chunk = (!track && !pre_copy_cleanup);
722 	async_data->out = kvzalloc(out_size, GFP_KERNEL);
723 	if (!async_data->out) {
724 		err = -ENOMEM;
725 		goto err_out;
726 	}
727 
728 	if (async_data->stop_copy_chunk) {
729 		u8 header_idx = buf->stop_copy_chunk_num ?
730 			buf->stop_copy_chunk_num - 1 : 0;
731 
732 		header_buf = migf->buf_header[header_idx];
733 		migf->buf_header[header_idx] = NULL;
734 	}
735 
736 	if (!header_buf) {
737 		header_buf = mlx5vf_get_data_buffer(migf,
738 			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
739 		if (IS_ERR(header_buf)) {
740 			err = PTR_ERR(header_buf);
741 			goto err_free;
742 		}
743 	}
744 
745 	if (async_data->stop_copy_chunk)
746 		migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK;
747 
748 	async_data->header_buf = header_buf;
749 	get_file(migf->filp);
750 	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
751 			       async_data->out,
752 			       out_size, mlx5vf_save_callback,
753 			       &async_data->cb_work);
754 	if (err)
755 		goto err_exec;
756 
757 	return 0;
758 
759 err_exec:
760 	if (header_buf)
761 		mlx5vf_put_data_buffer(header_buf);
762 	fput(migf->filp);
763 err_free:
764 	kvfree(async_data->out);
765 err_out:
766 	complete(&migf->save_comp);
767 	return err;
768 }
769 
770 int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
771 			       struct mlx5_vf_migration_file *migf,
772 			       struct mlx5_vhca_data_buffer *buf)
773 {
774 	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
775 	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
776 	int err;
777 
778 	lockdep_assert_held(&mvdev->state_mutex);
779 	if (mvdev->mdev_detach)
780 		return -ENOTCONN;
781 
782 	if (!buf->dmaed) {
783 		err = mlx5vf_dma_data_buffer(buf);
784 		if (err)
785 			return err;
786 	}
787 
788 	MLX5_SET(load_vhca_state_in, in, opcode,
789 		 MLX5_CMD_OP_LOAD_VHCA_STATE);
790 	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
791 	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
792 	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
793 	MLX5_SET(load_vhca_state_in, in, size, buf->length);
794 	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
795 }
796 
797 int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
798 {
799 	int err;
800 
801 	lockdep_assert_held(&migf->mvdev->state_mutex);
802 	if (migf->mvdev->mdev_detach)
803 		return -ENOTCONN;
804 
805 	err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
806 	return err;
807 }
808 
809 void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
810 {
811 	lockdep_assert_held(&migf->mvdev->state_mutex);
812 	if (migf->mvdev->mdev_detach)
813 		return;
814 
815 	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
816 }
817 
818 void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
819 {
820 	struct mlx5_vhca_data_buffer *entry;
821 	int i;
822 
823 	lockdep_assert_held(&migf->mvdev->state_mutex);
824 	WARN_ON(migf->mvdev->mdev_detach);
825 
826 	for (i = 0; i < MAX_NUM_CHUNKS; i++) {
827 		if (migf->buf[i]) {
828 			mlx5vf_free_data_buffer(migf->buf[i]);
829 			migf->buf[i] = NULL;
830 		}
831 
832 		if (migf->buf_header[i]) {
833 			mlx5vf_free_data_buffer(migf->buf_header[i]);
834 			migf->buf_header[i] = NULL;
835 		}
836 	}
837 
838 	list_splice(&migf->avail_list, &migf->buf_list);
839 
840 	while ((entry = list_first_entry_or_null(&migf->buf_list,
841 				struct mlx5_vhca_data_buffer, buf_elm))) {
842 		list_del(&entry->buf_elm);
843 		mlx5vf_free_data_buffer(entry);
844 	}
845 
846 	mlx5vf_cmd_dealloc_pd(migf);
847 }
848 
849 static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
850 				 struct mlx5vf_pci_core_device *mvdev,
851 				 struct rb_root_cached *ranges, u32 nnodes)
852 {
853 	int max_num_range =
854 		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
855 	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
856 	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
857 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
858 	struct interval_tree_node *node = NULL;
859 	u64 total_ranges_len = 0;
860 	u32 num_ranges = nnodes;
861 	u8 log_addr_space_size;
862 	void *range_list_ptr;
863 	void *obj_context;
864 	void *cmd_hdr;
865 	int inlen;
866 	void *in;
867 	int err;
868 	int i;
869 
870 	if (num_ranges > max_num_range) {
871 		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
872 		num_ranges = max_num_range;
873 	}
874 
875 	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
876 				 record_size * num_ranges;
877 	in = kzalloc(inlen, GFP_KERNEL);
878 	if (!in)
879 		return -ENOMEM;
880 
881 	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
882 			       general_obj_in_cmd_hdr);
883 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
884 		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
885 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
886 		 MLX5_OBJ_TYPE_PAGE_TRACK);
887 	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
888 	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
889 	MLX5_SET(page_track, obj_context, track_type, 1);
890 	MLX5_SET(page_track, obj_context, log_page_size,
891 		 ilog2(tracker->host_qp->tracked_page_size));
892 	MLX5_SET(page_track, obj_context, log_msg_size,
893 		 ilog2(tracker->host_qp->max_msg_size));
894 	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
895 	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);
896 
897 	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
898 	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
899 	for (i = 0; i < num_ranges; i++) {
900 		void *addr_range_i_base = range_list_ptr + record_size * i;
901 		unsigned long length = node->last - node->start + 1;
902 
903 		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
904 			   node->start);
905 		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
906 		total_ranges_len += length;
907 		node = interval_tree_iter_next(node, 0, ULONG_MAX);
908 	}
909 
910 	WARN_ON(node);
911 	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
912 	if (log_addr_space_size <
913 	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
914 	    log_addr_space_size >
915 	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
916 		err = -EOPNOTSUPP;
917 		goto out;
918 	}
919 
920 	MLX5_SET(page_track, obj_context, log_addr_space_size,
921 		 log_addr_space_size);
922 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
923 	if (err)
924 		goto out;
925 
926 	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
927 out:
928 	kfree(in);
929 	return err;
930 }
931 
932 static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
933 				      u32 tracker_id)
934 {
935 	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
936 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
937 
938 	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
939 	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
940 	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
941 
942 	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
943 }
944 
945 static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
946 				     u32 tracker_id, unsigned long iova,
947 				     unsigned long length, u32 tracker_state)
948 {
949 	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
950 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
951 	void *obj_context;
952 	void *cmd_hdr;
953 
954 	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
955 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
956 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
957 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);
958 
959 	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
960 	MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
961 	MLX5_SET64(page_track, obj_context, range_start_address, iova);
962 	MLX5_SET64(page_track, obj_context, length, length);
963 	MLX5_SET(page_track, obj_context, state, tracker_state);
964 
965 	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
966 }
967 
968 static int mlx5vf_cmd_query_tracker(struct mlx5_core_dev *mdev,
969 				    struct mlx5_vhca_page_tracker *tracker)
970 {
971 	u32 out[MLX5_ST_SZ_DW(query_page_track_obj_out)] = {};
972 	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
973 	void *obj_context;
974 	void *cmd_hdr;
975 	int err;
976 
977 	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
978 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
979 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
980 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker->id);
981 
982 	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
983 	if (err)
984 		return err;
985 
986 	obj_context = MLX5_ADDR_OF(query_page_track_obj_out, out, obj_context);
987 	tracker->status = MLX5_GET(page_track, obj_context, state);
988 	return 0;
989 }
990 
991 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
992 			     struct mlx5_vhca_cq_buf *buf, int nent,
993 			     int cqe_size)
994 {
995 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
996 	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
997 	u8 log_wq_sz = ilog2(cqe_size);
998 	int err;
999 
1000 	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
1001 				       mdev->priv.numa_node);
1002 	if (err)
1003 		return err;
1004 
1005 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
1006 	buf->cqe_size = cqe_size;
1007 	buf->nent = nent;
1008 	return 0;
1009 }
1010 
1011 static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
1012 {
1013 	struct mlx5_cqe64 *cqe64;
1014 	void *cqe;
1015 	int i;
1016 
1017 	for (i = 0; i < buf->nent; i++) {
1018 		cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
1019 		cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
1020 		cqe64->op_own = MLX5_CQE_INVALID << 4;
1021 	}
1022 }
1023 
1024 static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
1025 			      struct mlx5_vhca_cq *cq)
1026 {
1027 	mlx5_core_destroy_cq(mdev, &cq->mcq);
1028 	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
1029 	mlx5_db_free(mdev, &cq->db);
1030 }
1031 
1032 static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
1033 {
1034 	if (type != MLX5_EVENT_TYPE_CQ_ERROR)
1035 		return;
1036 
1037 	set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
1038 				       tracker.cq.mcq));
1039 }
1040 
1041 static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
1042 				 void *data)
1043 {
1044 	struct mlx5_vhca_page_tracker *tracker =
1045 		mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
1046 	struct mlx5vf_pci_core_device *mvdev = container_of(
1047 		tracker, struct mlx5vf_pci_core_device, tracker);
1048 	struct mlx5_eqe_obj_change *object;
1049 	struct mlx5_eqe *eqe = data;
1050 	u8 event_type = (u8)type;
1051 	u8 queue_type;
1052 	u32 obj_id;
1053 	int qp_num;
1054 
1055 	switch (event_type) {
1056 	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
1057 	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
1058 	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
1059 		queue_type = eqe->data.qp_srq.type;
1060 		if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
1061 			break;
1062 		qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
1063 		if (qp_num != tracker->host_qp->qpn &&
1064 		    qp_num != tracker->fw_qp->qpn)
1065 			break;
1066 		set_tracker_error(mvdev);
1067 		break;
1068 	case MLX5_EVENT_TYPE_OBJECT_CHANGE:
1069 		object = &eqe->data.obj_change;
1070 		obj_id = be32_to_cpu(object->obj_id);
1071 		if (obj_id == tracker->id)
1072 			set_tracker_change_event(mvdev);
1073 		break;
1074 	default:
1075 		break;
1076 	}
1077 
1078 	return NOTIFY_OK;
1079 }
1080 
1081 static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
1082 			       struct mlx5_eqe *eqe)
1083 {
1084 	struct mlx5vf_pci_core_device *mvdev =
1085 		container_of(mcq, struct mlx5vf_pci_core_device,
1086 			     tracker.cq.mcq);
1087 
1088 	complete(&mvdev->tracker_comp);
1089 }
1090 
/*
 * Create and arm the completion queue used by the page tracker.
 *
 * @ncqe is rounded up to a power of two.  A doorbell record and a fragmented
 * CQE buffer are allocated on mdev->priv.numa_node, every CQE is marked
 * invalid, and the CQ is created with mlx5vf_cq_complete()/mlx5vf_cq_event()
 * as its completion and event callbacks.
 *
 * Returns 0 on success.  On failure returns the error of the failing step
 * (-ENOMEM if the command buffer cannot be allocated) after releasing
 * whatever was allocated here.
 */
static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
			    struct mlx5_vhca_page_tracker *tracker,
			    size_t ncqe)
{
	/* Use 128-byte CQEs only when the cache line size is 128 bytes. */
	int cqe_size = cache_line_size() == 128 ? 128 : 64;
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_vhca_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;

	cq = &tracker->cq;
	ncqe = roundup_pow_of_two(ncqe);
	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
	if (err)
		return err;

	cq->ncqe = ncqe;
	/* The first doorbell word is the consumer index, the second arms. */
	cq->mcq.set_ci_db = cq->db.db;
	cq->mcq.arm_db = cq->db.db + 1;
	cq->mcq.cqe_sz = cqe_size;
	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
	if (err)
		goto err_db_free;

	init_cq_frag_buf(&cq->buf);
	/* Command size: fixed part plus one pas[] entry per buffer page. */
	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
		cq->buf.frag_buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_buff;
	}

	/* Pick a completion vector from the CPU we are currently running on. */
	vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
	err = mlx5_comp_eqn_get(mdev, vector, &eqn);
	if (err)
		goto err_vec;

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
	cq->mcq.comp = mlx5vf_cq_complete;
	cq->mcq.event = mlx5vf_cq_event;
	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	if (err)
		goto err_vec;

	/* Request a completion notification for the next CQE. */
	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
		    cq->mcq.cons_index);
	kvfree(in);
	return 0;

	/* Unwind in reverse order of allocation. */
err_vec:
	kvfree(in);
err_buff:
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
err_db_free:
	mlx5_db_free(mdev, &cq->db);
	return err;
}
1160 
/*
 * Allocate and create an RC QP with no send queue for the page tracker.
 *
 * When @max_recv_wr is non-zero the QP gets a receive queue of
 * roundup_pow_of_two(@max_recv_wr) entries whose completions are reported on
 * tracker->cq; when it is zero the QP is created with a zero-length RQ and no
 * work-queue buffer is allocated.
 *
 * Returns the new QP, or an ERR_PTR() on failure with everything allocated
 * here released.
 */
static struct mlx5_vhca_qp *
mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
{
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	struct mlx5_vhca_qp *qp;
	/* Only assigned and only read when max_recv_wr is non-zero. */
	u8 log_rq_stride;
	u8 log_rq_sz;
	void *qpc;
	int inlen;
	void *in;
	int err;

	qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
	if (!qp)
		return ERR_PTR(-ENOMEM);

	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
	if (err)
		goto err_free;

	if (max_recv_wr) {
		qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
		log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
		log_rq_sz = ilog2(qp->rq.wqe_cnt);
		err = mlx5_frag_buf_alloc_node(mdev,
			wq_get_byte_sz(log_rq_sz, log_rq_stride),
			&qp->buf, mdev->priv.numa_node);
		if (err)
			goto err_db_free;
		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
	}

	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
	/* Command size: fixed part plus one pas[] entry per buffer page. */
	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		qp->buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_in;
	}

	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, pd, tracker->pdn);
	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	MLX5_SET(qpc, qpc, no_sq, 1);
	if (max_recv_wr) {
		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
		mlx5_fill_page_frag_array(&qp->buf,
					  (__be64 *)MLX5_ADDR_OF(create_qp_in,
								 in, pas));
	} else {
		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
	}

	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	/* The command buffer is no longer needed whether or not it succeeded. */
	kvfree(in);
	if (err)
		goto err_in;

	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
	return qp;

err_in:
	/* The work-queue buffer exists only for a QP with a receive queue. */
	if (max_recv_wr)
		mlx5_frag_buf_free(mdev, &qp->buf);
err_db_free:
	mlx5_db_free(mdev, &qp->db);
err_free:
	kfree(qp);
	return ERR_PTR(err);
}
1246 
/*
 * Post one receive WQE on @qp.
 *
 * The WQE is a single data segment of qp->max_msg_size bytes at offset
 * qp->recv_buf.next_rq_offset under the receive buffer's mkey.  The producer
 * counter is then advanced and published through the RQ doorbell record.
 */
static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
{
	struct mlx5_wqe_data_seg *data;
	unsigned int ix;

	/* Warn if the ring already holds wqe_cnt outstanding WQEs. */
	WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
	/* Ring slot for this WQE (wqe_cnt is a power of two). */
	ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
	data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
	data->byte_count = cpu_to_be32(qp->max_msg_size);
	data->lkey = cpu_to_be32(qp->recv_buf.mkey);
	data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
	qp->rq.pc++;
	/* Make sure that descriptors are written before doorbell record. */
	dma_wmb();
	/* The doorbell record carries the low 16 bits of the producer counter. */
	*qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
}
1263 
1264 static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
1265 			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
1266 			      bool host_qp)
1267 {
1268 	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
1269 	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
1270 	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
1271 	void *qpc;
1272 	int ret;
1273 
1274 	/* Init */
1275 	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
1276 	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1277 	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
1278 	MLX5_SET(qpc, qpc, rre, 1);
1279 	MLX5_SET(qpc, qpc, rwe, 1);
1280 	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
1281 	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
1282 	ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
1283 	if (ret)
1284 		return ret;
1285 
1286 	if (host_qp) {
1287 		struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1288 		int i;
1289 
1290 		for (i = 0; i < qp->rq.wqe_cnt; i++) {
1291 			mlx5vf_post_recv(qp);
1292 			recv_buf->next_rq_offset += qp->max_msg_size;
1293 		}
1294 	}
1295 
1296 	/* RTR */
1297 	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
1298 	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1299 	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
1300 	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
1301 	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
1302 	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1303 	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
1304 	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
1305 	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
1306 	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1307 	ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
1308 	if (ret || host_qp)
1309 		return ret;
1310 
1311 	/* RTS */
1312 	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
1313 	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1314 	MLX5_SET(qpc, qpc, retry_count, 7);
1315 	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
1316 	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
1317 	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
1318 	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1319 
1320 	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
1321 }
1322 
/*
 * Destroy @qp on the device and free its work-queue buffer, its doorbell
 * record and the mlx5_vhca_qp structure itself.  @qp must not be used after
 * this returns.
 */
static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_qp *qp)
{
	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};

	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
	/* The command result is ignored; host resources are freed regardless. */
	mlx5_cmd_exec_in(mdev, destroy_qp, in);

	mlx5_frag_buf_free(mdev, &qp->buf);
	mlx5_db_free(mdev, &qp->db);
	kfree(qp);
}
1336 
1337 static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
1338 {
1339 	int i;
1340 
1341 	/* Undo alloc_pages_bulk_array() */
1342 	for (i = 0; i < recv_buf->npages; i++)
1343 		__free_page(recv_buf->page_list[i]);
1344 
1345 	kvfree(recv_buf->page_list);
1346 }
1347 
1348 static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
1349 			    unsigned int npages)
1350 {
1351 	unsigned int filled = 0, done = 0;
1352 	int i;
1353 
1354 	recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
1355 				       GFP_KERNEL_ACCOUNT);
1356 	if (!recv_buf->page_list)
1357 		return -ENOMEM;
1358 
1359 	for (;;) {
1360 		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
1361 						npages - done,
1362 						recv_buf->page_list + done);
1363 		if (!filled)
1364 			goto err;
1365 
1366 		done += filled;
1367 		if (done == npages)
1368 			break;
1369 	}
1370 
1371 	recv_buf->npages = npages;
1372 	return 0;
1373 
1374 err:
1375 	for (i = 0; i < npages; i++) {
1376 		if (recv_buf->page_list[i])
1377 			__free_page(recv_buf->page_list[i]);
1378 	}
1379 
1380 	kvfree(recv_buf->page_list);
1381 	return -ENOMEM;
1382 }
1383 
1384 static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
1385 				   struct mlx5_vhca_recv_buf *recv_buf)
1386 {
1387 	int i, j;
1388 
1389 	recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
1390 				       sizeof(*recv_buf->dma_addrs),
1391 				       GFP_KERNEL_ACCOUNT);
1392 	if (!recv_buf->dma_addrs)
1393 		return -ENOMEM;
1394 
1395 	for (i = 0; i < recv_buf->npages; i++) {
1396 		recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
1397 						      recv_buf->page_list[i],
1398 						      0, PAGE_SIZE,
1399 						      DMA_FROM_DEVICE);
1400 		if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
1401 			goto error;
1402 	}
1403 	return 0;
1404 
1405 error:
1406 	for (j = 0; j < i; j++)
1407 		dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
1408 				 PAGE_SIZE, DMA_FROM_DEVICE);
1409 
1410 	kvfree(recv_buf->dma_addrs);
1411 	return -ENOMEM;
1412 }
1413 
1414 static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
1415 				      struct mlx5_vhca_recv_buf *recv_buf)
1416 {
1417 	int i;
1418 
1419 	for (i = 0; i < recv_buf->npages; i++)
1420 		dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
1421 				 PAGE_SIZE, DMA_FROM_DEVICE);
1422 
1423 	kvfree(recv_buf->dma_addrs);
1424 }
1425 
1426 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
1427 					  struct mlx5_vhca_qp *qp)
1428 {
1429 	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1430 
1431 	mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
1432 	unregister_dma_recv_pages(mdev, recv_buf);
1433 	free_recv_pages(&qp->recv_buf);
1434 }
1435 
1436 static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
1437 					  struct mlx5_vhca_qp *qp, u32 pdn,
1438 					  u64 rq_size)
1439 {
1440 	unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
1441 	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1442 	int err;
1443 
1444 	err = alloc_recv_pages(recv_buf, npages);
1445 	if (err < 0)
1446 		return err;
1447 
1448 	err = register_dma_recv_pages(mdev, recv_buf);
1449 	if (err)
1450 		goto end;
1451 
1452 	err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
1453 	if (err)
1454 		goto err_create_mkey;
1455 
1456 	return 0;
1457 
1458 err_create_mkey:
1459 	unregister_dma_recv_pages(mdev, recv_buf);
1460 end:
1461 	free_recv_pages(recv_buf);
1462 	return err;
1463 }
1464 
/*
 * Tear down all page tracker resources if logging is active.
 *
 * Must be called with mvdev->state_mutex held.  Does nothing when
 * log_active is false.  Otherwise the EQ notifier is unregistered, the
 * tracker object is destroyed, then the firmware QP, the host QP's receive
 * resources, the host QP, the CQ, the PD and the UAR page are released, and
 * log_active is cleared.
 */
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_core_dev *mdev = mvdev->mdev;

	lockdep_assert_held(&mvdev->state_mutex);

	if (!mvdev->log_active)
		return;

	/* An active tracker is not expected on a detached mdev. */
	WARN_ON(mvdev->mdev_detach);

	/* Stop event delivery before the objects it refers to go away. */
	mlx5_eq_notifier_unregister(mdev, &tracker->nb);
	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
	mlx5vf_destroy_qp(mdev, tracker->host_qp);
	mlx5vf_destroy_cq(mdev, &tracker->cq);
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
	mlx5_put_uars_page(mdev, tracker->uar);
	mvdev->log_active = false;
}
1488 
1489 int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
1490 {
1491 	struct mlx5vf_pci_core_device *mvdev = container_of(
1492 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1493 
1494 	mutex_lock(&mvdev->state_mutex);
1495 	if (!mvdev->log_active)
1496 		goto end;
1497 
1498 	_mlx5vf_free_page_tracker_resources(mvdev);
1499 	mvdev->log_active = false;
1500 end:
1501 	mlx5vf_state_mutex_unlock(mvdev);
1502 	return 0;
1503 }
1504 
/*
 * Start dirty page tracking for @vdev over the given IOVA @ranges.
 *
 * Under the state mutex this sets up, in order: a UAR page, a PD, the CQ,
 * the host QP with its receive buffer, the firmware QP, the connection
 * between the two QPs, the tracker object and finally the EQ notifier.
 *
 * @page_size is in/out: the requested size is clamped to the device's
 * pg_track_log_min_page_size/pg_track_log_max_page_size capabilities and the
 * size actually used is written back on success.
 *
 * Returns 0 on success, -ENOTCONN if the mdev is detached, -EINVAL if
 * tracking is already active, or the error of the failing setup step, in
 * which case everything set up so far is released.
 */
int mlx5vf_start_page_tracker(struct vfio_device *vdev,
			      struct rb_root_cached *ranges, u32 nnodes,
			      u64 *page_size)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	u8 log_tracked_page = ilog2(*page_size);
	struct mlx5_vhca_qp *host_qp;
	struct mlx5_vhca_qp *fw_qp;
	struct mlx5_core_dev *mdev;
	/* Each receive WQE covers one page of the receive buffer. */
	u32 max_msg_size = PAGE_SIZE;
	/* Total size of the host QP's receive buffer. */
	u64 rq_size = SZ_2M;
	u32 max_recv_wr;
	int err;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	if (mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	mdev = mvdev->mdev;
	/* Start from a clean tracker; state of a previous run is discarded. */
	memset(tracker, 0, sizeof(*tracker));
	tracker->uar = mlx5_get_uars_page(mdev);
	if (IS_ERR(tracker->uar)) {
		err = PTR_ERR(tracker->uar);
		goto end;
	}

	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
	if (err)
		goto err_uar;

	/* One receive WQE (and one CQE) per message-sized chunk of the buffer. */
	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
	if (err)
		goto err_dealloc_pd;

	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
	if (IS_ERR(host_qp)) {
		err = PTR_ERR(host_qp);
		goto err_cq;
	}

	host_qp->max_msg_size = max_msg_size;
	/* Clamp the requested page size to what the device supports. */
	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size);
	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size);
	}

	host_qp->tracked_page_size = (1ULL << log_tracked_page);
	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
					     rq_size);
	if (err)
		goto err_host_qp;

	/* The firmware-side QP has no receive queue. */
	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
	if (IS_ERR(fw_qp)) {
		err = PTR_ERR(fw_qp);
		goto err_recv_resources;
	}

	/* Connect the two QPs to each other. */
	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
	if (err)
		goto err_activate;

	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
	if (err)
		goto err_activate;

	tracker->host_qp = host_qp;
	tracker->fw_qp = fw_qp;
	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
	if (err)
		goto err_activate;

	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
	mlx5_eq_notifier_register(mdev, &tracker->nb);
	/* Report the page size actually in use back to the caller. */
	*page_size = host_qp->tracked_page_size;
	mvdev->log_active = true;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;

	/* Unwind in reverse order of setup. */
err_activate:
	mlx5vf_destroy_qp(mdev, fw_qp);
err_recv_resources:
	mlx5vf_free_qp_recv_resources(mdev, host_qp);
err_host_qp:
	mlx5vf_destroy_qp(mdev, host_qp);
err_cq:
	mlx5vf_destroy_cq(mdev, &tracker->cq);
err_dealloc_pd:
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
err_uar:
	mlx5_put_uars_page(mdev, tracker->uar);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}
1615 
1616 static void
1617 set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
1618 		  struct iova_bitmap *dirty)
1619 {
1620 	u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
1621 	u32 nent = size / entry_size;
1622 	struct page *page;
1623 	u64 addr;
1624 	u64 *buf;
1625 	int i;
1626 
1627 	if (WARN_ON(index >= qp->recv_buf.npages ||
1628 		    (nent > qp->max_msg_size / entry_size)))
1629 		return;
1630 
1631 	page = qp->recv_buf.page_list[index];
1632 	buf = kmap_local_page(page);
1633 	for (i = 0; i < nent; i++) {
1634 		addr = MLX5_GET(page_track_report_entry, buf + i,
1635 				dirty_address_low);
1636 		addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
1637 				      dirty_address_high) << 32;
1638 		iova_bitmap_set(dirty, addr, qp->tracked_page_size);
1639 	}
1640 	kunmap_local(buf);
1641 }
1642 
/*
 * Handle one receive completion on the host QP.
 *
 * Advances the RQ consumer counter, extracts the tracker state from the top
 * four bits of the CQE's immediate data into @tracker_status, feeds any
 * received report data into @dirty, and re-posts the same receive slot.
 */
static void
mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
	      struct iova_bitmap *dirty, int *tracker_status)
{
	u32 size;
	int ix;

	qp->rq.cc++;
	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
	size = be32_to_cpu(cqe->byte_cnt);
	/* Ring slot of the completed WQE (wqe_cnt is a power of two). */
	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);

	/* zero length CQE, no data */
	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
	if (size)
		set_report_output(size, ix, qp, dirty);

	/* Hand the just-consumed chunk of the receive buffer back to the RQ. */
	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
	mlx5vf_post_recv(qp);
}
1663 
/* Return a pointer to entry @n of the CQ buffer. */
static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
{
	return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
}
1668 
1669 static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
1670 {
1671 	void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
1672 	struct mlx5_cqe64 *cqe64;
1673 
1674 	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
1675 
1676 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
1677 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
1678 		return cqe64;
1679 	} else {
1680 		return NULL;
1681 	}
1682 }
1683 
1684 static int
1685 mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
1686 		   struct iova_bitmap *dirty, int *tracker_status)
1687 {
1688 	struct mlx5_cqe64 *cqe;
1689 	u8 opcode;
1690 
1691 	cqe = get_sw_cqe(cq, cq->mcq.cons_index);
1692 	if (!cqe)
1693 		return CQ_EMPTY;
1694 
1695 	++cq->mcq.cons_index;
1696 	/*
1697 	 * Make sure we read CQ entry contents after we've checked the
1698 	 * ownership bit.
1699 	 */
1700 	rmb();
1701 	opcode = get_cqe_opcode(cqe);
1702 	switch (opcode) {
1703 	case MLX5_CQE_RESP_SEND_IMM:
1704 		mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
1705 		return CQ_OK;
1706 	default:
1707 		return CQ_POLL_ERR;
1708 	}
1709 }
1710 
/*
 * Collect the dirty pages of [@iova, @iova + @length) into @dirty.
 *
 * Under the state mutex the tracker is switched to the REPORTING state for
 * the given range, and report completions are consumed from the CQ until the
 * tracker leaves REPORTING or an error is flagged.
 *
 * Returns 0 on success, -EINVAL if tracking is not active, -ENOTCONN if the
 * mdev is detached, -EIO if the tracker is or becomes in error or an
 * unexpected completion is polled, or the error of a failing modify/query
 * command.
 */
int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
				  unsigned long length,
				  struct iova_bitmap *dirty)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_vhca_cq *cq = &tracker->cq;
	struct mlx5_core_dev *mdev;
	int poll_err, err;

	mutex_lock(&mvdev->state_mutex);
	if (!mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	if (tracker->is_err) {
		err = -EIO;
		goto end;
	}

	mdev = mvdev->mdev;
	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
					MLX5_PAGE_TRACK_STATE_REPORTING);
	if (err)
		goto end;

	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
	       !tracker->is_err) {
		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
					      &tracker->status);
		if (poll_err == CQ_EMPTY) {
			/*
			 * Arm the CQ, then poll once more to catch a CQE that
			 * arrived before arming took effect.
			 */
			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
				    cq->mcq.cons_index);
			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
						      dirty, &tracker->status);
			if (poll_err == CQ_EMPTY) {
				/* Still nothing: sleep until tracker_comp is signalled. */
				wait_for_completion(&mvdev->tracker_comp);
				/*
				 * If the wake-up was for an object change,
				 * refresh tracker->status from the device.
				 */
				if (tracker->object_changed) {
					tracker->object_changed = false;
					err = mlx5vf_cmd_query_tracker(mdev, tracker);
					if (err)
						goto end;
				}
				continue;
			}
		}
		if (poll_err == CQ_POLL_ERR) {
			err = -EIO;
			goto end;
		}
		/* Publish the updated consumer index. */
		mlx5_cq_set_ci(&cq->mcq);
	}

	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
		tracker->is_err = true;

	if (tracker->is_err)
		err = -EIO;
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}
1781