xref: /linux/drivers/vfio/pci/mlx5/cmd.c (revision 3536049822060347c8cb5a923186a8d65a8f7a48)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5 
6 #include "cmd.h"
7 
8 enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };
9 
/* Check that the other function @func_id exposes the migratable HCA cap. */
static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
{
	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
	void *cap;
	void *out;
	int err;

	out = kzalloc(query_sz, GFP_KERNEL);
	if (!out)
		return -ENOMEM;

	err = mlx5_vport_get_other_func_cap(mdev, func_id, out,
					    MLX5_CAP_GENERAL_2);
	if (err)
		goto free_out;

	cap = MLX5_ADDR_OF(query_hca_cap_out, out, capability);
	err = MLX5_GET(cmd_hca_cap_2, cap, migratable) ? 0 : -EOPNOTSUPP;

free_out:
	kfree(out);
	return err;
}
32 
33 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
34 				  u16 *vhca_id);
35 static void
36 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
37 
/*
 * Suspend the VHCA associated with @mvdev.
 * @op_mod selects which direction(s) of the device to suspend.
 * Returns 0 on success or a negative errno.
 */
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while the device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel, might end-up with a failure in the save
	 * command once it will try to turn on 'tracking' on a suspended device.
	 */
	if (migf) {
		err = wait_for_completion_interruptible(&migf->save_comp);
		if (err)
			return err;
	}

	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);

	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
	/* Re-arm save_comp so a later SAVE command can proceed */
	if (migf)
		complete(&migf->save_comp);

	return err;
}
71 
/* Resume the VHCA; @op_mod selects which direction(s) to resume. */
int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	u32 cmd_out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
	u32 cmd_in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	MLX5_SET(resume_vhca_in, cmd_in, opcode, MLX5_CMD_OP_RESUME_VHCA);
	MLX5_SET(resume_vhca_in, cmd_in, op_mod, op_mod);
	MLX5_SET(resume_vhca_in, cmd_in, vhca_id, mvdev->vhca_id);

	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, cmd_in, cmd_out);
}
87 
/*
 * Query the size of the VHCA migration state.
 * @state_size: set to the umem size required for the next SAVE.
 * @total_size: if non-NULL, set to the remaining total size in chunk mode,
 *	otherwise mirrors *state_size.
 * @query_flags: MLX5VF_QUERY_* bits (incremental / final / cleanup).
 * Returns 0 on success or a negative errno.
 */
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
					  size_t *state_size, u64 *total_size,
					  u8 query_flags)
{
	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
	bool inc = query_flags & MLX5VF_QUERY_INC;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel, might end-up with a failure in the
	 * incremental query command on un-tracked vhca.
	 */
	if (inc) {
		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
		if (ret)
			return ret;
		/* Upon cleanup, ignore previous pre_copy error state */
		if (mvdev->saving_migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR &&
		    !(query_flags & MLX5VF_QUERY_CLEANUP)) {
			/*
			 * In case we had a PRE_COPY error, only query full
			 * image for final image
			 */
			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
				*state_size = 0;
				complete(&mvdev->saving_migf->save_comp);
				return 0;
			}
			query_flags &= ~MLX5VF_QUERY_INC;
		}
		/* Block incremental query which is state-dependent */
		if (mvdev->saving_migf->state == MLX5_MIGF_STATE_ERROR) {
			complete(&mvdev->saving_migf->save_comp);
			return -ENODEV;
		}
	}

	MLX5_SET(query_vhca_migration_state_in, in, opcode,
		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
	MLX5_SET(query_vhca_migration_state_in, in, incremental,
		 query_flags & MLX5VF_QUERY_INC);
	MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode);

	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
				  out);
	/* Let a pending SAVE command proceed regardless of the query result */
	if (inc)
		complete(&mvdev->saving_migf->save_comp);

	if (ret)
		return ret;

	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
			       required_umem_size);
	if (total_size)
		*total_size = mvdev->chunk_mode ?
			MLX5_GET64(query_vhca_migration_state_out, out,
				   remaining_total_size) : *state_size;

	return 0;
}
157 
/* Flag a tracker object change and wake the tracker if it is waiting */
static void set_tracker_change_event(struct mlx5vf_pci_core_device *mvdev)
{
	mvdev->tracker.object_changed = true;
	complete(&mvdev->tracker_comp);
}
163 
/* Put the page tracker into its error state. */
static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
{
	/* Mark the tracker under an error and wake it up if it's running */
	mvdev->tracker.is_err = true;
	complete(&mvdev->tracker_comp);
}
170 
/*
 * SR-IOV blocking-notifier callback: track whether the VF's parent mlx5
 * core device is usable. On VF disable, migration state is torn down
 * before marking the device detached.
 */
static int mlx5fv_vf_event(struct notifier_block *nb,
			   unsigned long event, void *data)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(nb, struct mlx5vf_pci_core_device, nb);

	switch (event) {
	case MLX5_PF_NOTIFY_ENABLE_VF:
		mutex_lock(&mvdev->state_mutex);
		mvdev->mdev_detach = false;
		mlx5vf_state_mutex_unlock(mvdev);
		break;
	case MLX5_PF_NOTIFY_DISABLE_VF:
		/* Tears down migration flows; called before taking the lock */
		mlx5vf_cmd_close_migratable(mvdev);
		mutex_lock(&mvdev->state_mutex);
		mvdev->mdev_detach = true;
		mlx5vf_state_mutex_unlock(mvdev);
		break;
	default:
		break;
	}

	return 0;
}
195 
/* Stop any active migration flow and free the page tracker resources. */
void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	/* Must be done outside the lock to let it progress */
	set_tracker_error(mvdev);
	mutex_lock(&mvdev->state_mutex);
	mlx5vf_disable_fds(mvdev, NULL);
	_mlx5vf_free_page_tracker_resources(mvdev);
	mlx5vf_state_mutex_unlock(mvdev);
}
208 
/* Undo mlx5vf_cmd_set_migratable(): unregister notifier, destroy workqueue. */
void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
						&mvdev->nb);
	destroy_workqueue(mvdev->cb_wq);
}
218 
/*
 * Probe the VF's migration capabilities and, when all requirements are met,
 * wire up the vfio migration ops (and dirty-logging ops when the device
 * supports advanced virtualization). On any failure the function silently
 * returns with migrate_cap left unset.
 */
void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
			       const struct vfio_migration_ops *mig_ops,
			       const struct vfio_log_ops *log_ops)
{
	struct pci_dev *pdev = mvdev->core_device.pdev;
	int ret;

	if (!pdev->is_virtfn)
		return;

	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
	if (!mvdev->mdev)
		return;

	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
		goto end;

	if (!(MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
	      MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)))
		goto end;

	mvdev->vf_id = pci_iov_vf_id(pdev);
	if (mvdev->vf_id < 0)
		goto end;

	/* function_id of VF N is N + 1 (function 0 is the PF itself) */
	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
	if (ret)
		goto end;

	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
				   &mvdev->vhca_id))
		goto end;

	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
	if (!mvdev->cb_wq)
		goto end;

	mutex_init(&mvdev->state_mutex);
	spin_lock_init(&mvdev->reset_lock);
	mvdev->nb.notifier_call = mlx5fv_vf_event;
	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
						    &mvdev->nb);
	if (ret) {
		destroy_workqueue(mvdev->cb_wq);
		goto end;
	}

	mvdev->migrate_cap = 1;
	mvdev->core_device.vdev.migration_flags =
		VFIO_MIGRATION_STOP_COPY |
		VFIO_MIGRATION_P2P |
		VFIO_MIGRATION_PRE_COPY;

	mvdev->core_device.vdev.mig_ops = mig_ops;
	init_completion(&mvdev->tracker_comp);
	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
		mvdev->core_device.vdev.log_ops = log_ops;

	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
		mvdev->chunk_mode = 1;

end:
	mlx5_vf_put_core_dev(mvdev->mdev);
}
283 
/* Query the vhca_id of the function identified by @function_id into @vhca_id. */
static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id)
{
	int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
	void *out;
	int err;

	out = kzalloc(out_sz, GFP_KERNEL);
	if (!out)
		return -ENOMEM;

	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
	MLX5_SET(query_hca_cap_in, in, other_function, 1);
	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
	MLX5_SET(query_hca_cap_in, in, op_mod,
		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
		 HCA_CAP_OPMOD_GET_CUR);

	err = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
	if (!err)
		*vhca_id = MLX5_GET(query_hca_cap_out, out,
				    capability.cmd_hca_cap.vhca_id);

	kfree(out);
	return err;
}
315 
/*
 * Allocate and partially fill a CREATE_MKEY input box for an MTT mkey
 * covering @npages pages under PD @pdn. The MTT translation entries
 * (klm_pas_mtt) are filled in later by register_dma_pages().
 * Returns the kvzalloc'ed input buffer or NULL on allocation failure.
 */
static u32 *alloc_mkey_in(u32 npages, u32 pdn)
{
	int inlen;
	void *mkc;
	u32 *in;

	/* MTT entries are packed two per octword, hence round_up(npages, 2) */
	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
		sizeof(__be64) * round_up(npages, 2);

	in = kvzalloc(inlen, GFP_KERNEL_ACCOUNT);
	if (!in)
		return NULL;

	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
		 DIV_ROUND_UP(npages, 2));

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);

	return in;
}
347 
/* Create the mkey from the previously filled @mkey_in; id returned in @mkey. */
static int create_mkey(struct mlx5_core_dev *mdev, u32 npages, u32 *mkey_in,
		       u32 *mkey)
{
	int pas_size = sizeof(__be64) * round_up(npages, 2);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + pas_size;

	return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen);
}
356 
/*
 * Undo register_dma_pages() for the first @npages entries: either destroy
 * the contiguous IOVA mapping, or unmap the individually mapped pages whose
 * DMA addresses were recorded in the mkey's MTT array.
 */
static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
				 u32 *mkey_in, struct dma_iova_state *state,
				 enum dma_data_direction dir)
{
	dma_addr_t addr;
	__be64 *mtt;
	int i;

	if (dma_use_iova(state)) {
		dma_iova_destroy(mdev->device, state, npages * PAGE_SIZE, dir,
				 0);
	} else {
		mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in,
					     klm_pas_mtt);
		/* Unmap in reverse order of mapping */
		for (i = npages - 1; i >= 0; i--) {
			addr = be64_to_cpu(mtt[i]);
			dma_unmap_page(mdev->device, addr, PAGE_SIZE, dir);
		}
	}
}
377 
/*
 * DMA-map @npages pages and record their DMA addresses in the mkey's MTT
 * array. Prefers a single contiguous IOVA range; falls back to per-page
 * dma_map_page(). On error, pages mapped so far are unwound.
 * Returns 0 on success or a negative errno.
 */
static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
			      struct page **page_list, u32 *mkey_in,
			      struct dma_iova_state *state,
			      enum dma_data_direction dir)
{
	dma_addr_t addr;
	size_t mapped = 0;
	__be64 *mtt;
	int i, err;

	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);

	if (dma_iova_try_alloc(mdev->device, state, 0, npages * PAGE_SIZE)) {
		addr = state->addr;
		for (i = 0; i < npages; i++) {
			err = dma_iova_link(mdev->device, state,
					    page_to_phys(page_list[i]), mapped,
					    PAGE_SIZE, dir, 0);
			if (err)
				goto error;
			*mtt++ = cpu_to_be64(addr);
			addr += PAGE_SIZE;
			mapped += PAGE_SIZE;
		}
		err = dma_iova_sync(mdev->device, state, 0, mapped);
		if (err)
			goto error;
	} else {
		for (i = 0; i < npages; i++) {
			addr = dma_map_page(mdev->device, page_list[i], 0,
					    PAGE_SIZE, dir);
			err = dma_mapping_error(mdev->device, addr);
			if (err)
				goto error;
			*mtt++ = cpu_to_be64(addr);
		}
	}
	return 0;

error:
	/* Exactly i pages were mapped successfully before the failure */
	unregister_dma_pages(mdev, i, mkey_in, state, dir);
	return err;
}
421 
/*
 * DMA-map @buf's pages and create the mkey that SAVE/LOAD firmware commands
 * use to access them. Returns 0 or a negative errno.
 */
static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
	struct mlx5_core_dev *mdev = mvdev->mdev;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/* Reject a double mapping or an empty buffer */
	if (buf->mkey_in || !buf->npages)
		return -EINVAL;

	buf->mkey_in = alloc_mkey_in(buf->npages, buf->migf->pdn);
	if (!buf->mkey_in)
		return -ENOMEM;

	ret = register_dma_pages(mdev, buf->npages, buf->page_list,
				 buf->mkey_in, &buf->state, buf->dma_dir);
	if (ret)
		goto err_register_dma;

	ret = create_mkey(mdev, buf->npages, buf->mkey_in, &buf->mkey);
	if (ret)
		goto err_create_mkey;

	return 0;

err_create_mkey:
	unregister_dma_pages(mdev, buf->npages, buf->mkey_in, &buf->state,
			     buf->dma_dir);
err_register_dma:
	kvfree(buf->mkey_in);
	buf->mkey_in = NULL;
	return ret;
}
458 
/* Release pages obtained via alloc_pages_bulk() plus the list array itself. */
static void free_page_list(u32 npages, struct page **page_list)
{
	int idx;

	/* Undo alloc_pages_bulk() */
	for (idx = npages - 1; idx >= 0; idx--)
		__free_page(page_list[idx]);

	kvfree(page_list);
}
469 
/* Destroy @buf: its mkey and DMA mappings (if mapped), pages, and the struct. */
void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
	struct mlx5_core_dev *mdev = mvdev->mdev;

	lockdep_assert_held(&mvdev->state_mutex);
	WARN_ON(mvdev->mdev_detach);

	if (buf->mkey_in) {
		mlx5_core_destroy_mkey(mdev, buf->mkey);
		unregister_dma_pages(mdev, buf->npages, buf->mkey_in,
				     &buf->state, buf->dma_dir);
		kvfree(buf->mkey_in);
	}

	free_page_list(buf->npages, buf->page_list);
	kfree(buf);
}
488 
mlx5vf_add_pages(struct page *** page_list,unsigned int npages)489 static int mlx5vf_add_pages(struct page ***page_list, unsigned int npages)
490 {
491 	unsigned int filled, done = 0;
492 	int i;
493 
494 	*page_list =
495 		kvcalloc(npages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
496 	if (!*page_list)
497 		return -ENOMEM;
498 
499 	for (;;) {
500 		filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, npages - done,
501 					  *page_list + done);
502 		if (!filled)
503 			goto err;
504 
505 		done += filled;
506 		if (done == npages)
507 			break;
508 	}
509 
510 	return 0;
511 
512 err:
513 	for (i = 0; i < done; i++)
514 		__free_page(*page_list[i]);
515 
516 	kvfree(*page_list);
517 	*page_list = NULL;
518 	return -ENOMEM;
519 }
520 
/*
 * Allocate a data buffer of @npages pages for @migf and, unless @dma_dir is
 * DMA_NONE, DMA-map it and create its mkey.
 * Returns the buffer or an ERR_PTR on failure.
 */
struct mlx5_vhca_data_buffer *
mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
			 enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
	if (!buf)
		return ERR_PTR(-ENOMEM);

	buf->dma_dir = dma_dir;
	buf->migf = migf;
	if (npages) {
		ret = mlx5vf_add_pages(&buf->page_list, npages);
		if (ret)
			goto end;

		buf->npages = npages;

		if (dma_dir != DMA_NONE) {
			ret = mlx5vf_dma_data_buffer(buf);
			if (ret)
				goto end;
		}
	}

	return buf;
end:
	mlx5vf_free_data_buffer(buf);
	return ERR_PTR(ret);
}
553 
/* Return @buf to its migration file's reuse (avail) list. */
void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	spin_lock_irq(&buf->migf->list_lock);
	buf->stop_copy_chunk_num = 0;
	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
	spin_unlock_irq(&buf->migf->list_lock);
}
561 
/*
 * Get a data buffer with at least @npages pages and direction @dma_dir,
 * reusing one from the avail list when possible, otherwise allocating a
 * fresh one. Undersized same-direction buffers found while scanning are
 * freed. Returns the buffer or an ERR_PTR.
 */
struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
		       enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf, *temp_buf;
	struct list_head free_list;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return ERR_PTR(-ENOTCONN);

	INIT_LIST_HEAD(&free_list);

	spin_lock_irq(&migf->list_lock);
	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
		if (buf->dma_dir == dma_dir) {
			list_del_init(&buf->buf_elm);
			if (buf->npages >= npages) {
				spin_unlock_irq(&migf->list_lock);
				goto found;
			}
			/*
			 * Prevent holding redundant buffers. Put in a free
			 * list and call at the end not under the spin lock
			 * (&migf->list_lock) to mlx5vf_free_data_buffer which
			 * might sleep.
			 */
			list_add(&buf->buf_elm, &free_list);
		}
	}
	spin_unlock_irq(&migf->list_lock);
	buf = mlx5vf_alloc_data_buffer(migf, npages, dma_dir);

found:
	while ((temp_buf = list_first_entry_or_null(&free_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&temp_buf->buf_elm);
		mlx5vf_free_data_buffer(temp_buf);
	}

	return buf;
}
604 
/*
 * Common completion for a SAVE command: free the command out box, re-arm
 * save_comp, and drop the file reference taken before the async exec.
 */
static void
mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
			      struct mlx5vf_async_data *async_data)
{
	kvfree(async_data->out);
	complete(&migf->save_comp);
	fput(migf->filp);
}
613 
/*
 * Workqueue handler run when an async SAVE command failed; executed in
 * process context since the error path may sleep.
 */
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
	struct mlx5vf_async_data *async_data = container_of(_work,
		struct mlx5vf_async_data, work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
		struct mlx5_vf_migration_file, async_data);

	mutex_lock(&migf->lock);
	if (async_data->status) {
		mlx5vf_put_data_buffer(async_data->buf);
		if (async_data->header_buf)
			mlx5vf_put_data_buffer(async_data->header_buf);
		/* A bad-resource-state failure during PRE_COPY is recoverable */
		if (!async_data->stop_copy_chunk &&
		    async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
		else
			migf->state = MLX5_MIGF_STATE_ERROR;
		wake_up_interruptible(&migf->poll_wait);
	}
	mutex_unlock(&migf->lock);
	mlx5vf_save_callback_complete(migf, async_data);
}
636 
/*
 * Write a migration stream header describing an image of @image_size bytes
 * into @header_buf and queue it on the migf buffer list for userspace reads.
 * Returns 0, or -EINVAL if the buffer has no first page.
 */
static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
			  size_t image_size, bool initial_pre_copy)
{
	struct mlx5_vf_migration_file *migf = header_buf->migf;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;

	header.record_size = cpu_to_le64(image_size);
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	kunmap_local(to_buff);
	header_buf->length = sizeof(header);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	if (initial_pre_copy)
		migf->pre_copy_initial_bytes += sizeof(header);
	return 0;
}
665 
/*
 * Async completion of a SAVE_VHCA_STATE command. On success, publish the
 * header and data buffers on the migf buffer list, update migf state, and
 * kick the SAVE of the next chunk if one is required. On failure, defer the
 * (possibly sleeping) error handling to the cleanup work item.
 */
static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5vf_async_data *async_data = container_of(context,
			struct mlx5vf_async_data, cb_work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
			struct mlx5_vf_migration_file, async_data);

	if (!status) {
		size_t next_required_umem_size = 0;
		bool stop_copy_last_chunk;
		size_t image_size;
		unsigned long flags;
		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
				!async_data->stop_copy_chunk;

		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
				      actual_image_size);
		if (async_data->buf->stop_copy_chunk_num)
			next_required_umem_size = MLX5_GET(save_vhca_state_out,
					async_data->out, next_required_umem_size);
		stop_copy_last_chunk = async_data->stop_copy_chunk &&
				!next_required_umem_size;
		if (async_data->header_buf) {
			status = add_buf_header(async_data->header_buf, image_size,
						initial_pre_copy);
			if (status)
				goto err;
		}
		async_data->buf->length = image_size;
		async_data->buf->start_pos = migf->max_pos;
		migf->max_pos += async_data->buf->length;
		spin_lock_irqsave(&migf->list_lock, flags);
		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
		if (async_data->buf->stop_copy_chunk_num) {
			migf->num_ready_chunks++;
			if (next_required_umem_size &&
			    migf->num_ready_chunks >= MAX_NUM_CHUNKS) {
				/* Delay the next SAVE till one chunk be consumed */
				migf->next_required_umem_size = next_required_umem_size;
				next_required_umem_size = 0;
			}
		}
		spin_unlock_irqrestore(&migf->list_lock, flags);
		if (initial_pre_copy) {
			migf->pre_copy_initial_bytes += image_size;
			migf->state = MLX5_MIGF_STATE_PRE_COPY;
		}
		if (stop_copy_last_chunk)
			migf->state = MLX5_MIGF_STATE_COMPLETE;
		wake_up_interruptible(&migf->poll_wait);
		if (next_required_umem_size)
			mlx5vf_mig_file_set_save_work(migf,
				/* Picking up the next chunk num */
				(async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1,
				next_required_umem_size);
		mlx5vf_save_callback_complete(migf, async_data);
		return;
	}

err:
	/* The error flow can't run from an interrupt context */
	if (status == -EREMOTEIO) {
		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
		/* Failed in FW, print cmd out failure details */
		mlx5_cmd_out_err(migf->mvdev->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE, 0,
				 async_data->out);
	}

	async_data->status = status;
	queue_work(migf->mvdev->cb_wq, &async_data->work);
}
737 
/*
 * Issue an async SAVE_VHCA_STATE command writing device state into @buf.
 * @inc: request an incremental (dirty) image.
 * @track: keep dirty tracking enabled (PRE_COPY flow).
 * Consumes save_comp (re-armed by the completion path) and takes a file
 * reference for the async callback. Returns 0 once queued, else -errno.
 */
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf, bool inc,
			       bool track)
{
	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5vf_async_data *async_data;
	bool pre_copy_cleanup = false;
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/* Serialize against a previous in-flight SAVE command */
	err = wait_for_completion_interruptible(&migf->save_comp);
	if (err)
		return err;

	if ((migf->state == MLX5_MIGF_STATE_PRE_COPY ||
	     migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) && !track && !inc)
		pre_copy_cleanup = true;

	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
		/*
		 * In case we had a PRE_COPY error, SAVE is triggered only for
		 * the final image, read device full image.
		 */
		inc = false;

	MLX5_SET(save_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_SAVE_VHCA_STATE);
	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(save_vhca_state_in, in, size, buf->npages * PAGE_SIZE);
	MLX5_SET(save_vhca_state_in, in, incremental, inc);
	MLX5_SET(save_vhca_state_in, in, set_track, track);

	async_data = &migf->async_data;
	async_data->buf = buf;
	async_data->stop_copy_chunk = (!track && !pre_copy_cleanup);
	async_data->out = kvzalloc(out_size, GFP_KERNEL);
	if (!async_data->out) {
		err = -ENOMEM;
		goto err_out;
	}

	/* Reuse the pre-allocated header buffer for stop-copy chunks */
	if (async_data->stop_copy_chunk) {
		u8 header_idx = buf->stop_copy_chunk_num ?
			buf->stop_copy_chunk_num - 1 : 0;

		header_buf = migf->buf_header[header_idx];
		migf->buf_header[header_idx] = NULL;
	}

	if (!header_buf) {
		header_buf = mlx5vf_get_data_buffer(
			migf,
			DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
				     PAGE_SIZE),
			DMA_NONE);
		if (IS_ERR(header_buf)) {
			err = PTR_ERR(header_buf);
			goto err_free;
		}
	}

	if (async_data->stop_copy_chunk)
		migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK;

	async_data->header_buf = header_buf;
	/* Reference dropped in mlx5vf_save_callback_complete() */
	get_file(migf->filp);
	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
			       async_data->out,
			       out_size, mlx5vf_save_callback,
			       &async_data->cb_work);
	if (err)
		goto err_exec;

	return 0;

err_exec:
	if (header_buf)
		mlx5vf_put_data_buffer(header_buf);
	fput(migf->filp);
err_free:
	kvfree(async_data->out);
err_out:
	complete(&migf->save_comp);
	return err;
}
831 
/*
 * Load the device state from @buf (DMA-mapping it first if needed) via a
 * synchronous LOAD_VHCA_STATE command. Returns 0 or a negative errno.
 */
int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf)
{
	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	if (!buf->mkey_in) {
		err = mlx5vf_dma_data_buffer(buf);
		if (err)
			return err;
	}

	MLX5_SET(load_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_LOAD_VHCA_STATE);
	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(load_vhca_state_in, in, size, buf->length);
	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
}
858 
mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file * migf)859 int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
860 {
861 	int err;
862 
863 	lockdep_assert_held(&migf->mvdev->state_mutex);
864 	if (migf->mvdev->mdev_detach)
865 		return -ENOTCONN;
866 
867 	err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
868 	return err;
869 }
870 
/* Release the PD allocated by mlx5vf_cmd_alloc_pd(); no-op once detached. */
void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
{
	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return;

	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
}
879 
/* Free all buffers held by @migf (chunk, header, queued, avail) and its PD. */
void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
{
	struct mlx5_vhca_data_buffer *entry;
	int i;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	for (i = 0; i < MAX_NUM_CHUNKS; i++) {
		if (migf->buf[i]) {
			mlx5vf_free_data_buffer(migf->buf[i]);
			migf->buf[i] = NULL;
		}

		if (migf->buf_header[i]) {
			mlx5vf_free_data_buffer(migf->buf_header[i]);
			migf->buf_header[i] = NULL;
		}
	}

	/* Drain the reuse list into the main list, then free everything */
	list_splice(&migf->avail_list, &migf->buf_list);

	while ((entry = list_first_entry_or_null(&migf->buf_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&entry->buf_elm);
		mlx5vf_free_data_buffer(entry);
	}

	mlx5vf_cmd_dealloc_pd(migf);
}
910 
/*
 * Create the firmware page-track object for dirty tracking over @ranges
 * (@nnodes interval-tree nodes), combining ranges when they exceed the
 * device maximum. Stores the new object id in the tracker.
 * Returns 0 or a negative errno.
 */
static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
				 struct mlx5vf_pci_core_device *mvdev,
				 struct rb_root_cached *ranges, u32 nnodes)
{
	int max_num_range =
		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	struct interval_tree_node *node = NULL;
	u64 total_ranges_len = 0;
	u32 num_ranges = nnodes;
	u8 log_addr_space_size;
	void *range_list_ptr;
	void *obj_context;
	void *cmd_hdr;
	int inlen;
	void *in;
	int err;
	int i;

	if (num_ranges > max_num_range) {
		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
		num_ranges = max_num_range;
	}

	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
				 record_size * num_ranges;
	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
			       general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
	MLX5_SET(page_track, obj_context, track_type, 1);
	MLX5_SET(page_track, obj_context, log_page_size,
		 ilog2(tracker->host_qp->tracked_page_size));
	MLX5_SET(page_track, obj_context, log_msg_size,
		 ilog2(tracker->host_qp->max_msg_size));
	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);

	/* Serialize the (possibly combined) ranges into the command's array */
	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	for (i = 0; i < num_ranges; i++) {
		void *addr_range_i_base = range_list_ptr + record_size * i;
		unsigned long length = node->last - node->start + 1;

		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
			   node->start);
		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
		total_ranges_len += length;
		node = interval_tree_iter_next(node, 0, ULONG_MAX);
	}

	WARN_ON(node);
	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
	if (log_addr_space_size <
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
	    log_addr_space_size >
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
		err = -EOPNOTSUPP;
		goto out;
	}

	MLX5_SET(page_track, obj_context, log_addr_space_size,
		 log_addr_space_size);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	if (err)
		goto out;

	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
out:
	kfree(in);
	return err;
}
993 
/* Destroy the firmware page-tracker object identified by @tracker_id. */
static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
				      u32 tracker_id)
{
	u32 hdr_out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	u32 hdr_in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};

	MLX5_SET(general_obj_in_cmd_hdr, hdr_in, obj_id, tracker_id);
	MLX5_SET(general_obj_in_cmd_hdr, hdr_in, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, hdr_in, opcode,
		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);

	return mlx5_cmd_exec(mdev, hdr_in, sizeof(hdr_in), hdr_out,
			     sizeof(hdr_out));
}
1006 
/*
 * Move the page-tracker object to @tracker_state over the window
 * [@iova, @iova + @length). The modify_field_select value 0x3 selects
 * the range start/length fields for update.
 */
static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
				     u32 tracker_id, unsigned long iova,
				     unsigned long length, u32 tracker_state)
{
	u32 cmd_out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	u32 cmd_in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
	void *ctx;
	void *hdr;

	hdr = MLX5_ADDR_OF(modify_page_track_obj_in, cmd_in,
			   general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode,
		 MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_id, tracker_id);

	ctx = MLX5_ADDR_OF(modify_page_track_obj_in, cmd_in, obj_context);
	MLX5_SET64(page_track, ctx, modify_field_select, 0x3);
	MLX5_SET64(page_track, ctx, range_start_address, iova);
	MLX5_SET64(page_track, ctx, length, length);
	MLX5_SET(page_track, ctx, state, tracker_state);

	return mlx5_cmd_exec(mdev, cmd_in, sizeof(cmd_in), cmd_out,
			     sizeof(cmd_out));
}
1029 
mlx5vf_cmd_query_tracker(struct mlx5_core_dev * mdev,struct mlx5_vhca_page_tracker * tracker)1030 static int mlx5vf_cmd_query_tracker(struct mlx5_core_dev *mdev,
1031 				    struct mlx5_vhca_page_tracker *tracker)
1032 {
1033 	u32 out[MLX5_ST_SZ_DW(query_page_track_obj_out)] = {};
1034 	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
1035 	void *obj_context;
1036 	void *cmd_hdr;
1037 	int err;
1038 
1039 	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
1040 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1041 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
1042 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker->id);
1043 
1044 	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
1045 	if (err)
1046 		return err;
1047 
1048 	obj_context = MLX5_ADDR_OF(query_page_track_obj_out, out, obj_context);
1049 	tracker->status = MLX5_GET(page_track, obj_context, state);
1050 	return 0;
1051 }
1052 
alloc_cq_frag_buf(struct mlx5_core_dev * mdev,struct mlx5_vhca_cq_buf * buf,int nent,int cqe_size)1053 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
1054 			     struct mlx5_vhca_cq_buf *buf, int nent,
1055 			     int cqe_size)
1056 {
1057 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
1058 	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
1059 	u8 log_wq_sz = ilog2(cqe_size);
1060 	int err;
1061 
1062 	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
1063 				       mdev->priv.numa_node);
1064 	if (err)
1065 		return err;
1066 
1067 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
1068 	buf->cqe_size = cqe_size;
1069 	buf->nent = nent;
1070 	return 0;
1071 }
1072 
init_cq_frag_buf(struct mlx5_vhca_cq_buf * buf)1073 static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
1074 {
1075 	struct mlx5_cqe64 *cqe64;
1076 	void *cqe;
1077 	int i;
1078 
1079 	for (i = 0; i < buf->nent; i++) {
1080 		cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
1081 		cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
1082 		cqe64->op_own = MLX5_CQE_INVALID << 4;
1083 	}
1084 }
1085 
/*
 * Destroy the tracker CQ: the firmware CQ object is destroyed first,
 * then the buffer and doorbell that back it are freed.
 */
static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_cq *cq)
{
	mlx5_core_destroy_cq(mdev, &cq->mcq);
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
	mlx5_db_free(mdev, &cq->db);
}
1093 
mlx5vf_cq_event(struct mlx5_core_cq * mcq,enum mlx5_event type)1094 static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
1095 {
1096 	if (type != MLX5_EVENT_TYPE_CQ_ERROR)
1097 		return;
1098 
1099 	set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
1100 				       tracker.cq.mcq));
1101 }
1102 
/*
 * EQ notifier for async events affecting the page tracker. A WQ error
 * event on either tracker QP moves the tracker to error state; an
 * object-change event on the tracker object flags that its state needs
 * to be re-queried.
 */
static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
				 void *data)
{
	struct mlx5_vhca_page_tracker *tracker =
		mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
	struct mlx5vf_pci_core_device *mvdev = container_of(
		tracker, struct mlx5vf_pci_core_device, tracker);
	struct mlx5_eqe_obj_change *object;
	struct mlx5_eqe *eqe = data;
	u8 event_type = (u8)type;
	u8 queue_type;
	u32 obj_id;
	int qp_num;

	switch (event_type) {
	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
		queue_type = eqe->data.qp_srq.type;
		if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
			break;
		/* The low 24 bits of qp_srq_n hold the QP number. */
		qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
		/* Only errors on the tracker's own QPs are of interest. */
		if (qp_num != tracker->host_qp->qpn &&
		    qp_num != tracker->fw_qp->qpn)
			break;
		set_tracker_error(mvdev);
		break;
	case MLX5_EVENT_TYPE_OBJECT_CHANGE:
		object = &eqe->data.obj_change;
		obj_id = be32_to_cpu(object->obj_id);
		if (obj_id == tracker->id)
			set_tracker_change_event(mvdev);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}
1142 
mlx5vf_cq_complete(struct mlx5_core_cq * mcq,struct mlx5_eqe * eqe)1143 static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
1144 			       struct mlx5_eqe *eqe)
1145 {
1146 	struct mlx5vf_pci_core_device *mvdev =
1147 		container_of(mcq, struct mlx5vf_pci_core_device,
1148 			     tracker.cq.mcq);
1149 
1150 	complete(&mvdev->tracker_comp);
1151 }
1152 
/*
 * Create the completion queue used to receive dirty-page report
 * completions from the host QP. @ncqe is rounded up to a power of two.
 */
static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
			    struct mlx5_vhca_page_tracker *tracker,
			    size_t ncqe)
{
	/* Match the CQE size to the cache line to avoid split lines. */
	int cqe_size = cache_line_size() == 128 ? 128 : 64;
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_vhca_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;

	cq = &tracker->cq;
	ncqe = roundup_pow_of_two(ncqe);
	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
	if (err)
		return err;

	cq->ncqe = ncqe;
	cq->mcq.set_ci_db = cq->db.db;
	cq->mcq.arm_db = cq->db.db + 1;
	cq->mcq.cqe_sz = cqe_size;
	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
	if (err)
		goto err_db_free;

	init_cq_frag_buf(&cq->buf);
	/* Command size: base structure plus one PAS entry per buffer page. */
	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
		cq->buf.frag_buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_buff;
	}

	vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
	err = mlx5_comp_eqn_get(mdev, vector, &eqn);
	if (err)
		goto err_vec;

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
	cq->mcq.comp = mlx5vf_cq_complete;
	cq->mcq.event = mlx5vf_cq_event;
	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	if (err)
		goto err_vec;

	/* Arm the CQ so the first completion raises an event. */
	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
		    cq->mcq.cons_index);
	kvfree(in);
	return 0;

err_vec:
	kvfree(in);
err_buff:
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
err_db_free:
	mlx5_db_free(mdev, &cq->db);
	return err;
}
1222 
/*
 * Create an RC QP for the page tracker. When @max_recv_wr is non-zero a
 * receive queue of that many WQEs is allocated (host-side QP); with
 * zero, a zero-length RQ is requested (firmware-side QP). Returns the
 * new QP or an ERR_PTR.
 */
static struct mlx5_vhca_qp *
mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
{
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	struct mlx5_vhca_qp *qp;
	u8 log_rq_stride;
	u8 log_rq_sz;
	void *qpc;
	int inlen;
	void *in;
	int err;

	qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
	if (!qp)
		return ERR_PTR(-ENOMEM);

	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
	if (err)
		goto err_free;

	if (max_recv_wr) {
		/* Allocate the RQ backing buffer only for the host QP. */
		qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
		log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
		log_rq_sz = ilog2(qp->rq.wqe_cnt);
		err = mlx5_frag_buf_alloc_node(mdev,
			wq_get_byte_sz(log_rq_sz, log_rq_stride),
			&qp->buf, mdev->priv.numa_node);
		if (err)
			goto err_db_free;
		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
	}

	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
	/* qp->buf.npages is 0 when no RQ buffer was allocated above. */
	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		qp->buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_in;
	}

	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, pd, tracker->pdn);
	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	/* Neither QP uses a send queue. */
	MLX5_SET(qpc, qpc, no_sq, 1);
	if (max_recv_wr) {
		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
		mlx5_fill_page_frag_array(&qp->buf,
					  (__be64 *)MLX5_ADDR_OF(create_qp_in,
								 in, pas));
	} else {
		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
	}

	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	kvfree(in);
	if (err)
		goto err_in;

	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
	return qp;

err_in:
	/* The RQ buffer exists only when a receive queue was requested. */
	if (max_recv_wr)
		mlx5_frag_buf_free(mdev, &qp->buf);
err_db_free:
	mlx5_db_free(mdev, &qp->db);
err_free:
	kfree(qp);
	return ERR_PTR(err);
}
1308 
/*
 * Post a single receive WQE pointing at the next chunk of the receive
 * buffer and update the doorbell record. The producer counter may never
 * run more than wqe_cnt ahead of the consumer counter.
 */
static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
{
	struct mlx5_wqe_data_seg *data;
	unsigned int ix;

	WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
	/* wqe_cnt is a power of two, so masking wraps the producer index. */
	ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
	data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
	data->byte_count = cpu_to_be32(qp->max_msg_size);
	data->lkey = cpu_to_be32(qp->recv_buf.mkey);
	data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
	qp->rq.pc++;
	/* Make sure that descriptors are written before doorbell record. */
	dma_wmb();
	*qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
}
1325 
/*
 * Drive a tracker QP through the RST->INIT->RTR(->RTS) transitions,
 * connecting it to @remote_qpn. For the host QP (@host_qp == true) all
 * receive WQEs are posted after INIT and the sequence stops at RTR; the
 * firmware QP is additionally moved to RTS.
 */
static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
			      bool host_qp)
{
	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
	void *qpc;
	int ret;

	/* Init */
	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
	MLX5_SET(qpc, qpc, rre, 1);
	MLX5_SET(qpc, qpc, rwe, 1);
	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
	ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
	if (ret)
		return ret;

	if (host_qp) {
		struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
		int i;

		/* Fill the whole RQ, advancing into the receive buffer. */
		for (i = 0; i < qp->rq.wqe_cnt; i++) {
			mlx5vf_post_recv(qp);
			recv_buf->next_rq_offset += qp->max_msg_size;
		}
	}

	/* RTR */
	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
	/* fl=1: force loopback, both QPs live on the same device. */
	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
	ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
	/* The host (receive-only) QP stays in RTR. */
	if (ret || host_qp)
		return ret;

	/* RTS */
	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
	MLX5_SET(qpc, qpc, retry_count, 7);
	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);

	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
}
1384 
mlx5vf_destroy_qp(struct mlx5_core_dev * mdev,struct mlx5_vhca_qp * qp)1385 static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
1386 			      struct mlx5_vhca_qp *qp)
1387 {
1388 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
1389 
1390 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
1391 	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
1392 	mlx5_cmd_exec_in(mdev, destroy_qp, in);
1393 
1394 	mlx5_frag_buf_free(mdev, &qp->buf);
1395 	mlx5_db_free(mdev, &qp->db);
1396 	kfree(qp);
1397 }
1398 
/*
 * Release the host QP receive buffer: the mkey is destroyed before the
 * DMA mappings it covers are unregistered, then the mkey input and the
 * backing page list are freed.
 */
static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
					  struct mlx5_vhca_qp *qp)
{
	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;

	mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
	unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in,
			     &recv_buf->state, DMA_FROM_DEVICE);
	kvfree(recv_buf->mkey_in);
	free_page_list(recv_buf->npages, recv_buf->page_list);
}
1410 
/*
 * Allocate and DMA-map the receive buffer for the host QP: a page list
 * large enough for @rq_size bytes, registered for device access and
 * covered by a newly created mkey under @pdn. On failure every partial
 * allocation is rolled back.
 */
static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
					  struct mlx5_vhca_qp *qp, u32 pdn,
					  u64 rq_size)
{
	unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
	int err;

	err = mlx5vf_add_pages(&recv_buf->page_list, npages);
	if (err)
		return err;

	recv_buf->npages = npages;

	recv_buf->mkey_in = alloc_mkey_in(npages, pdn);
	if (!recv_buf->mkey_in) {
		err = -ENOMEM;
		goto end;
	}

	err = register_dma_pages(mdev, npages, recv_buf->page_list,
				 recv_buf->mkey_in, &recv_buf->state,
				 DMA_FROM_DEVICE);
	if (err)
		goto err_register_dma;

	err = create_mkey(mdev, npages, recv_buf->mkey_in, &recv_buf->mkey);
	if (err)
		goto err_create_mkey;

	return 0;

err_create_mkey:
	unregister_dma_pages(mdev, npages, recv_buf->mkey_in, &recv_buf->state,
			     DMA_FROM_DEVICE);
err_register_dma:
	kvfree(recv_buf->mkey_in);
	/* Clear the pointer so a later free path cannot double-free it. */
	recv_buf->mkey_in = NULL;
end:
	free_page_list(npages, recv_buf->page_list);
	return err;
}
1453 
/*
 * Free all page-tracker resources in reverse order of creation:
 * notifier, tracker object, firmware QP, host QP receive buffer, host
 * QP, CQ, PD and UAR. Caller must hold state_mutex; a no-op when
 * tracking is not active.
 */
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_core_dev *mdev = mvdev->mdev;

	lockdep_assert_held(&mvdev->state_mutex);

	if (!mvdev->log_active)
		return;

	/* Resources can only be released while the mdev is attached. */
	WARN_ON(mvdev->mdev_detach);

	mlx5_eq_notifier_unregister(mdev, &tracker->nb);
	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
	mlx5vf_destroy_qp(mdev, tracker->host_qp);
	mlx5vf_destroy_cq(mdev, &tracker->cq);
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
	mlx5_put_uars_page(mdev, tracker->uar);
	mvdev->log_active = false;
}
1477 
mlx5vf_stop_page_tracker(struct vfio_device * vdev)1478 int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
1479 {
1480 	struct mlx5vf_pci_core_device *mvdev = container_of(
1481 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1482 
1483 	mutex_lock(&mvdev->state_mutex);
1484 	if (!mvdev->log_active)
1485 		goto end;
1486 
1487 	_mlx5vf_free_page_tracker_resources(mvdev);
1488 	mvdev->log_active = false;
1489 end:
1490 	mlx5vf_state_mutex_unlock(mvdev);
1491 	return 0;
1492 }
1493 
/*
 * VFIO log-start callback: set up all resources for hardware dirty-page
 * tracking over @ranges (UAR, PD, CQ, host/firmware QPs, receive buffer
 * and the firmware tracker object). On success *page_size is updated to
 * the page size the device will actually track at.
 */
int mlx5vf_start_page_tracker(struct vfio_device *vdev,
			      struct rb_root_cached *ranges, u32 nnodes,
			      u64 *page_size)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	u8 log_tracked_page = ilog2(*page_size);
	struct mlx5_vhca_qp *host_qp;
	struct mlx5_vhca_qp *fw_qp;
	struct mlx5_core_dev *mdev;
	u32 log_max_msg_size;
	u32 max_msg_size;
	u64 rq_size = SZ_2M;
	u32 max_recv_wr;
	int err;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	/* Only a single active tracking session is supported. */
	if (mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	mdev = mvdev->mdev;
	log_max_msg_size = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_msg_size);
	max_msg_size = (1ULL << log_max_msg_size);
	/* The RQ must hold at least 4 WQEs/messages for successful QP creation */
	if (rq_size < 4 * max_msg_size)
		rq_size = 4 * max_msg_size;

	memset(tracker, 0, sizeof(*tracker));
	tracker->uar = mlx5_get_uars_page(mdev);
	if (IS_ERR(tracker->uar)) {
		err = PTR_ERR(tracker->uar);
		goto end;
	}

	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
	if (err)
		goto err_uar;

	/* One receive WQE per maximum-sized report message. */
	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
	if (err)
		goto err_dealloc_pd;

	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
	if (IS_ERR(host_qp)) {
		err = PTR_ERR(host_qp);
		goto err_cq;
	}

	host_qp->max_msg_size = max_msg_size;
	/* Clamp the requested page size to the device-supported range. */
	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size);
	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size);
	}

	host_qp->tracked_page_size = (1ULL << log_tracked_page);
	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
					     rq_size);
	if (err)
		goto err_host_qp;

	/* Firmware-side QP needs no receive queue. */
	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
	if (IS_ERR(fw_qp)) {
		err = PTR_ERR(fw_qp);
		goto err_recv_resources;
	}

	/* Connect the two QPs to each other (loopback pair). */
	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
	if (err)
		goto err_activate;

	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
	if (err)
		goto err_activate;

	tracker->host_qp = host_qp;
	tracker->fw_qp = fw_qp;
	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
	if (err)
		goto err_activate;

	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
	mlx5_eq_notifier_register(mdev, &tracker->nb);
	/* Report the (possibly clamped) page size back to the caller. */
	*page_size = host_qp->tracked_page_size;
	mvdev->log_active = true;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;

err_activate:
	mlx5vf_destroy_qp(mdev, fw_qp);
err_recv_resources:
	mlx5vf_free_qp_recv_resources(mdev, host_qp);
err_host_qp:
	mlx5vf_destroy_qp(mdev, host_qp);
err_cq:
	mlx5vf_destroy_cq(mdev, &tracker->cq);
err_dealloc_pd:
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
err_uar:
	mlx5_put_uars_page(mdev, tracker->uar);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}
1611 
/*
 * Parse one dirty-page report message of @size bytes received into slot
 * @index of the QP receive buffer and mark every reported address in
 * the IOVA bitmap. The backing pages are kmapped one at a time, so a
 * message spanning page boundaries is consumed in per-page chunks.
 */
static void
set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
		  struct iova_bitmap *dirty)
{
	u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
	u32 nent = size / entry_size;
	u32 nent_in_page;
	u32 nent_to_set;
	struct page *page;
	u32 page_offset;
	u32 page_index;
	u32 buf_offset;
	void *kaddr;
	u64 addr;
	u64 *buf;
	int i;

	buf_offset = index * qp->max_msg_size;
	/* Sanity-check the firmware-reported size against the buffer. */
	if (WARN_ON(buf_offset + size >= qp->recv_buf.npages * PAGE_SIZE ||
		    (nent > qp->max_msg_size / entry_size)))
		return;

	do {
		page_index = buf_offset / PAGE_SIZE;
		page_offset = buf_offset % PAGE_SIZE;
		nent_in_page = (PAGE_SIZE - page_offset) / entry_size;
		page = qp->recv_buf.page_list[page_index];
		kaddr = kmap_local_page(page);
		buf = kaddr + page_offset;
		nent_to_set = min(nent, nent_in_page);
		for (i = 0; i < nent_to_set; i++) {
			/* Each entry carries a 64-bit dirty IOVA in two halves. */
			addr = MLX5_GET(page_track_report_entry, buf + i,
					dirty_address_low);
			addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
					      dirty_address_high) << 32;
			iova_bitmap_set(dirty, addr, qp->tracked_page_size);
		}
		kunmap_local(kaddr);
		buf_offset += (nent_to_set * entry_size);
		nent -= nent_to_set;
	} while (nent);
}
1654 
/*
 * Handle a receive completion: extract the tracker state from the CQE
 * immediate data, feed the reported dirty addresses into @dirty, and
 * repost the receive WQE for the consumed buffer slot.
 */
static void
mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
	      struct iova_bitmap *dirty, int *tracker_status)
{
	u32 size;
	int ix;

	qp->rq.cc++;
	/* The top 4 bits of the immediate carry the tracker state. */
	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
	size = be32_to_cpu(cqe->byte_cnt);
	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);

	/* zero length CQE, no data */
	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
	if (size)
		set_report_output(size, ix, qp, dirty);

	/* Repost the WQE pointing at the slot that was just consumed. */
	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
	mlx5vf_post_recv(qp);
}
1675 
/* Return the CQE at index @n within the CQ's fragmented buffer. */
static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
{
	return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
}
/*
 * Return the CQE at consumer index @n if it is valid and owned by
 * software, otherwise NULL. Ownership alternates on each CQ wrap, which
 * the op_own owner bit is compared against.
 */
static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
{
	void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
	struct mlx5_cqe64 *cqe64;

	/* With 128B CQEs the 64B CQE sits in the second half. */
	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;

	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
		return cqe64;
	} else {
		return NULL;
	}
}
1695 
1696 static int
mlx5vf_cq_poll_one(struct mlx5_vhca_cq * cq,struct mlx5_vhca_qp * qp,struct iova_bitmap * dirty,int * tracker_status)1697 mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
1698 		   struct iova_bitmap *dirty, int *tracker_status)
1699 {
1700 	struct mlx5_cqe64 *cqe;
1701 	u8 opcode;
1702 
1703 	cqe = get_sw_cqe(cq, cq->mcq.cons_index);
1704 	if (!cqe)
1705 		return CQ_EMPTY;
1706 
1707 	++cq->mcq.cons_index;
1708 	/*
1709 	 * Make sure we read CQ entry contents after we've checked the
1710 	 * ownership bit.
1711 	 */
1712 	rmb();
1713 	opcode = get_cqe_opcode(cqe);
1714 	switch (opcode) {
1715 	case MLX5_CQE_RESP_SEND_IMM:
1716 		mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
1717 		return CQ_OK;
1718 	default:
1719 		return CQ_POLL_ERR;
1720 	}
1721 }
1722 
/*
 * VFIO dirty-tracking report callback: ask firmware to report dirty
 * pages in [@iova, @iova + @length) and drain the resulting CQEs into
 * @dirty until the tracker leaves the REPORTING state. Sleeps on
 * tracker_comp between CQ arm cycles while the CQ is empty.
 */
int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
				  unsigned long length,
				  struct iova_bitmap *dirty)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_vhca_cq *cq = &tracker->cq;
	struct mlx5_core_dev *mdev;
	int poll_err, err;

	mutex_lock(&mvdev->state_mutex);
	if (!mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	if (tracker->is_err) {
		err = -EIO;
		goto end;
	}

	mdev = mvdev->mdev;
	/* Kick firmware into reporting mode for the requested window. */
	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
					MLX5_PAGE_TRACK_STATE_REPORTING);
	if (err)
		goto end;

	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
	       !tracker->is_err) {
		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
					      &tracker->status);
		if (poll_err == CQ_EMPTY) {
			/* Arm the CQ, then re-poll to close the race with a
			 * completion that landed just before arming.
			 */
			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
				    cq->mcq.cons_index);
			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
						      dirty, &tracker->status);
			if (poll_err == CQ_EMPTY) {
				/* Sleep until a completion or object-change
				 * event wakes us.
				 */
				wait_for_completion(&mvdev->tracker_comp);
				if (tracker->object_changed) {
					tracker->object_changed = false;
					err = mlx5vf_cmd_query_tracker(mdev, tracker);
					if (err)
						goto end;
				}
				continue;
			}
		}
		if (poll_err == CQ_POLL_ERR) {
			err = -EIO;
			goto end;
		}
		mlx5_cq_set_ci(&cq->mcq);
	}

	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
		tracker->is_err = true;

	if (tracker->is_err)
		err = -EIO;
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}
1793