xref: /linux/drivers/vfio/pci/mlx5/main.c (revision eb01fe7abbe2d0b38824d2a93fdb4cc3eaf2ccc1)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5 
6 #include <linux/device.h>
7 #include <linux/eventfd.h>
8 #include <linux/file.h>
9 #include <linux/interrupt.h>
10 #include <linux/iommu.h>
11 #include <linux/module.h>
12 #include <linux/mutex.h>
13 #include <linux/notifier.h>
14 #include <linux/pci.h>
15 #include <linux/pm_runtime.h>
16 #include <linux/types.h>
17 #include <linux/uaccess.h>
18 #include <linux/vfio.h>
19 #include <linux/sched/mm.h>
20 #include <linux/anon_inodes.h>
21 
22 #include "cmd.h"
23 
/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

/* Upper bound applied to the per-chunk buffer size when chunk mode is used */
#define MAX_CHUNK_SIZE SZ_8M
28 
29 static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
30 {
31 	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
32 
33 	return container_of(core_device, struct mlx5vf_pci_core_device,
34 			    core_device);
35 }
36 
37 struct page *
38 mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
39 			  unsigned long offset)
40 {
41 	unsigned long cur_offset = 0;
42 	struct scatterlist *sg;
43 	unsigned int i;
44 
45 	/* All accesses are sequential */
46 	if (offset < buf->last_offset || !buf->last_offset_sg) {
47 		buf->last_offset = 0;
48 		buf->last_offset_sg = buf->table.sgt.sgl;
49 		buf->sg_last_entry = 0;
50 	}
51 
52 	cur_offset = buf->last_offset;
53 
54 	for_each_sg(buf->last_offset_sg, sg,
55 			buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
56 		if (offset < sg->length + cur_offset) {
57 			buf->last_offset_sg = sg;
58 			buf->sg_last_entry += i;
59 			buf->last_offset = cur_offset;
60 			return nth_page(sg_page(sg),
61 					(offset - cur_offset) / PAGE_SIZE);
62 		}
63 		cur_offset += sg->length;
64 	}
65 	return NULL;
66 }
67 
68 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
69 {
70 	mutex_lock(&migf->lock);
71 	migf->state = MLX5_MIGF_STATE_ERROR;
72 	migf->filp->f_pos = 0;
73 	mutex_unlock(&migf->lock);
74 }
75 
76 static int mlx5vf_release_file(struct inode *inode, struct file *filp)
77 {
78 	struct mlx5_vf_migration_file *migf = filp->private_data;
79 
80 	mlx5vf_disable_fd(migf);
81 	mutex_destroy(&migf->lock);
82 	kfree(migf);
83 	return 0;
84 }
85 
86 static struct mlx5_vhca_data_buffer *
87 mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
88 			      bool *end_of_data)
89 {
90 	struct mlx5_vhca_data_buffer *buf;
91 	bool found = false;
92 
93 	*end_of_data = false;
94 	spin_lock_irq(&migf->list_lock);
95 	if (list_empty(&migf->buf_list)) {
96 		*end_of_data = true;
97 		goto end;
98 	}
99 
100 	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
101 			       buf_elm);
102 	if (pos >= buf->start_pos &&
103 	    pos < buf->start_pos + buf->length) {
104 		found = true;
105 		goto end;
106 	}
107 
108 	/*
109 	 * As we use a stream based FD we may expect having the data always
110 	 * on first chunk
111 	 */
112 	migf->state = MLX5_MIGF_STATE_ERROR;
113 
114 end:
115 	spin_unlock_irq(&migf->list_lock);
116 	return found ? buf : NULL;
117 }
118 
119 static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
120 {
121 	struct mlx5_vf_migration_file *migf = vhca_buf->migf;
122 
123 	if (vhca_buf->stop_copy_chunk_num) {
124 		bool is_header = vhca_buf->dma_dir == DMA_NONE;
125 		u8 chunk_num = vhca_buf->stop_copy_chunk_num;
126 		size_t next_required_umem_size = 0;
127 
128 		if (is_header)
129 			migf->buf_header[chunk_num - 1] = vhca_buf;
130 		else
131 			migf->buf[chunk_num - 1] = vhca_buf;
132 
133 		spin_lock_irq(&migf->list_lock);
134 		list_del_init(&vhca_buf->buf_elm);
135 		if (!is_header) {
136 			next_required_umem_size =
137 				migf->next_required_umem_size;
138 			migf->next_required_umem_size = 0;
139 			migf->num_ready_chunks--;
140 		}
141 		spin_unlock_irq(&migf->list_lock);
142 		if (next_required_umem_size)
143 			mlx5vf_mig_file_set_save_work(migf, chunk_num,
144 						      next_required_umem_size);
145 		return;
146 	}
147 
148 	spin_lock_irq(&migf->list_lock);
149 	list_del_init(&vhca_buf->buf_elm);
150 	list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
151 	spin_unlock_irq(&migf->list_lock);
152 }
153 
154 static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
155 			       char __user **buf, size_t *len, loff_t *pos)
156 {
157 	unsigned long offset;
158 	ssize_t done = 0;
159 	size_t copy_len;
160 
161 	copy_len = min_t(size_t,
162 			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
163 	while (copy_len) {
164 		size_t page_offset;
165 		struct page *page;
166 		size_t page_len;
167 		u8 *from_buff;
168 		int ret;
169 
170 		offset = *pos - vhca_buf->start_pos;
171 		page_offset = offset % PAGE_SIZE;
172 		offset -= page_offset;
173 		page = mlx5vf_get_migration_page(vhca_buf, offset);
174 		if (!page)
175 			return -EINVAL;
176 		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
177 		from_buff = kmap_local_page(page);
178 		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
179 		kunmap_local(from_buff);
180 		if (ret)
181 			return -EFAULT;
182 		*pos += page_len;
183 		*len -= page_len;
184 		*buf += page_len;
185 		done += page_len;
186 		copy_len -= page_len;
187 	}
188 
189 	if (*pos >= vhca_buf->start_pos + vhca_buf->length)
190 		mlx5vf_buf_read_done(vhca_buf);
191 
192 	return done;
193 }
194 
/*
 * read() handler of the save migration file.
 *
 * Only stream reads are supported: a non-NULL @pos is rejected with -ESPIPE
 * and the file's own f_pos is used instead. Unless O_NONBLOCK is set, the
 * caller first sleeps until a buffer is queued on migf->buf_list or the file
 * is in one of the ERROR, PRE_COPY_ERROR, PRE_COPY or COMPLETE states.
 *
 * Returns the number of bytes copied to @buf, or a negative errno:
 *  -ERESTARTSYS  the wait was interrupted;
 *  -ENODEV       the file is in the ERROR state;
 *  -ENOMSG       no data is queued on the first iteration while the file is
 *                in PRE_COPY or PRE_COPY_ERROR;
 *  -EAGAIN       non-blocking read, no data on the first iteration and the
 *                file is not COMPLETE;
 *  -EINVAL       the first queued buffer does not cover *pos;
 *  or the error returned by mlx5vf_buf_read().
 *
 * Note that an error hit after some bytes were already copied replaces the
 * byte count in the return value.
 */
static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
			       loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
				!list_empty(&migf->buf_list) ||
				migf->state == MLX5_MIGF_STATE_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							 &end_of_data);
		/* The "no data" errors are only reported before anything was read */
		if (first_loop_call) {
			first_loop_call = false;
			/* Temporary end of file as part of PRE_COPY */
			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
				done = -ENOMSG;
				goto out_unlock;
			}

			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
				if (filp->f_flags & O_NONBLOCK) {
					done = -EAGAIN;
					goto out_unlock;
				}
			}
		}

		/* Nothing (more) queued: return what was copied so far */
		if (end_of_data)
			goto out_unlock;

		/* Data is queued but the first buffer does not cover *pos */
		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}
266 
267 static __poll_t mlx5vf_save_poll(struct file *filp,
268 				 struct poll_table_struct *wait)
269 {
270 	struct mlx5_vf_migration_file *migf = filp->private_data;
271 	__poll_t pollflags = 0;
272 
273 	poll_wait(filp, &migf->poll_wait, wait);
274 
275 	mutex_lock(&migf->lock);
276 	if (migf->state == MLX5_MIGF_STATE_ERROR)
277 		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
278 	else if (!list_empty(&migf->buf_list) ||
279 		 migf->state == MLX5_MIGF_STATE_COMPLETE)
280 		pollflags = EPOLLIN | EPOLLRDNORM;
281 	mutex_unlock(&migf->lock);
282 
283 	return pollflags;
284 }
285 
286 /*
287  * FD is exposed and user can use it after receiving an error.
288  * Mark migf in error, and wake the user.
289  */
290 static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
291 {
292 	migf->state = MLX5_MIGF_STATE_ERROR;
293 	wake_up_interruptible(&migf->poll_wait);
294 }
295 
296 void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
297 				   u8 chunk_num, size_t next_required_umem_size)
298 {
299 	migf->save_data[chunk_num - 1].next_required_umem_size =
300 			next_required_umem_size;
301 	migf->save_data[chunk_num - 1].migf = migf;
302 	get_file(migf->filp);
303 	queue_work(migf->mvdev->cb_wq,
304 		   &migf->save_data[chunk_num - 1].work);
305 }
306 
307 static struct mlx5_vhca_data_buffer *
308 mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
309 				  u8 index, size_t required_length)
310 {
311 	struct mlx5_vhca_data_buffer *buf = migf->buf[index];
312 	u8 chunk_num;
313 
314 	WARN_ON(!buf);
315 	chunk_num = buf->stop_copy_chunk_num;
316 	buf->migf->buf[index] = NULL;
317 	/* Checking whether the pre-allocated buffer can fit */
318 	if (buf->allocated_length >= required_length)
319 		return buf;
320 
321 	mlx5vf_put_data_buffer(buf);
322 	buf = mlx5vf_get_data_buffer(buf->migf, required_length,
323 				     DMA_FROM_DEVICE);
324 	if (IS_ERR(buf))
325 		return buf;
326 
327 	buf->stop_copy_chunk_num = chunk_num;
328 	return buf;
329 }
330 
331 static void mlx5vf_mig_file_save_work(struct work_struct *_work)
332 {
333 	struct mlx5vf_save_work_data *save_data = container_of(_work,
334 		struct mlx5vf_save_work_data, work);
335 	struct mlx5_vf_migration_file *migf = save_data->migf;
336 	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
337 	struct mlx5_vhca_data_buffer *buf;
338 
339 	mutex_lock(&mvdev->state_mutex);
340 	if (migf->state == MLX5_MIGF_STATE_ERROR)
341 		goto end;
342 
343 	buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
344 				save_data->chunk_num - 1,
345 				save_data->next_required_umem_size);
346 	if (IS_ERR(buf))
347 		goto err;
348 
349 	if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
350 		goto err_save;
351 
352 	goto end;
353 
354 err_save:
355 	mlx5vf_put_data_buffer(buf);
356 err:
357 	mlx5vf_mark_err(migf);
358 end:
359 	mlx5vf_state_mutex_unlock(mvdev);
360 	fput(migf->filp);
361 }
362 
363 static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
364 				       bool track)
365 {
366 	size_t size = sizeof(struct mlx5_vf_migration_header) +
367 		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
368 	struct mlx5_vf_migration_tag_stop_copy_data data = {};
369 	struct mlx5_vhca_data_buffer *header_buf = NULL;
370 	struct mlx5_vf_migration_header header = {};
371 	unsigned long flags;
372 	struct page *page;
373 	u8 *to_buff;
374 	int ret;
375 
376 	header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
377 	if (IS_ERR(header_buf))
378 		return PTR_ERR(header_buf);
379 
380 	header.record_size = cpu_to_le64(sizeof(data));
381 	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
382 	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
383 	page = mlx5vf_get_migration_page(header_buf, 0);
384 	if (!page) {
385 		ret = -EINVAL;
386 		goto err;
387 	}
388 	to_buff = kmap_local_page(page);
389 	memcpy(to_buff, &header, sizeof(header));
390 	header_buf->length = sizeof(header);
391 	data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
392 	memcpy(to_buff + sizeof(header), &data, sizeof(data));
393 	header_buf->length += sizeof(data);
394 	kunmap_local(to_buff);
395 	header_buf->start_pos = header_buf->migf->max_pos;
396 	migf->max_pos += header_buf->length;
397 	spin_lock_irqsave(&migf->list_lock, flags);
398 	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
399 	spin_unlock_irqrestore(&migf->list_lock, flags);
400 	if (track)
401 		migf->pre_copy_initial_bytes = size;
402 	return 0;
403 err:
404 	mlx5vf_put_data_buffer(header_buf);
405 	return ret;
406 }
407 
408 static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
409 				 struct mlx5_vf_migration_file *migf,
410 				 size_t state_size, u64 full_size,
411 				 bool track)
412 {
413 	struct mlx5_vhca_data_buffer *buf;
414 	size_t inc_state_size;
415 	int num_chunks;
416 	int ret;
417 	int i;
418 
419 	if (mvdev->chunk_mode) {
420 		size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);
421 
422 		/* from firmware perspective at least 'state_size' buffer should be set */
423 		inc_state_size = max(state_size, chunk_size);
424 	} else {
425 		if (track) {
426 			/* let's be ready for stop_copy size that might grow by 10 percents */
427 			if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
428 				inc_state_size = state_size;
429 		} else {
430 			inc_state_size = state_size;
431 		}
432 	}
433 
434 	/* let's not overflow the device specification max SAVE size */
435 	inc_state_size = min_t(size_t, inc_state_size,
436 		(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
437 
438 	num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
439 	for (i = 0; i < num_chunks; i++) {
440 		buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
441 		if (IS_ERR(buf)) {
442 			ret = PTR_ERR(buf);
443 			goto err;
444 		}
445 
446 		migf->buf[i] = buf;
447 		buf = mlx5vf_get_data_buffer(migf,
448 				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
449 		if (IS_ERR(buf)) {
450 			ret = PTR_ERR(buf);
451 			goto err;
452 		}
453 		migf->buf_header[i] = buf;
454 		if (mvdev->chunk_mode) {
455 			migf->buf[i]->stop_copy_chunk_num = i + 1;
456 			migf->buf_header[i]->stop_copy_chunk_num = i + 1;
457 			INIT_WORK(&migf->save_data[i].work,
458 				  mlx5vf_mig_file_save_work);
459 			migf->save_data[i].chunk_num = i + 1;
460 		}
461 	}
462 
463 	ret = mlx5vf_add_stop_copy_header(migf, track);
464 	if (ret)
465 		goto err;
466 	return 0;
467 
468 err:
469 	for (i = 0; i < num_chunks; i++) {
470 		if (migf->buf[i]) {
471 			mlx5vf_put_data_buffer(migf->buf[i]);
472 			migf->buf[i] = NULL;
473 		}
474 		if (migf->buf_header[i]) {
475 			mlx5vf_put_data_buffer(migf->buf_header[i]);
476 			migf->buf_header[i] = NULL;
477 		}
478 	}
479 
480 	return ret;
481 }
482 
/*
 * ioctl() handler of the save migration file; only
 * VFIO_MIG_GET_PRECOPY_INFO is supported (-ENOTTY otherwise).
 *
 * Fills a struct vfio_precopy_info for user space:
 *  - while *pos is still below migf->pre_copy_initial_bytes, initial_bytes is
 *    what is left of them;
 *  - otherwise dirty_bytes is the data still queued on the file
 *    (migf->max_pos - *pos) plus the incremental size reported by the device.
 *
 * When everything queued has been read and the device reports a non-zero
 * incremental size, a new SAVE command is issued so that the data becomes
 * available to read.
 *
 * Returns 0 on success, -EFAULT on a user copy failure, -EINVAL when argsz is
 * too small or the device is not in PRE_COPY / PRE_COPY_P2P, -ENODEV when the
 * file is in error, or the error of the query / buffer allocation / SAVE
 * command (the last two also mark the file in error).
 */
static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data = false;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	/* Only the fields up to and including dirty_bytes are exchanged */
	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command when the device is suspended, so as
	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
	 * bytes that can't be read.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns it's guaranteed that there is no
		 * active SAVE command.
		 * As so, the other code below is safe with the proper locks.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    NULL, MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	if (migf->pre_copy_initial_bytes > *pos) {
		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
	} else {
		info.dirty_bytes = migf->max_pos - *pos;
		/* Everything queued so far has been read */
		if (!info.dirty_bytes)
			end_of_data = true;
		info.dirty_bytes += inc_length;
	}

	/* No new SAVE unless the file is drained and the device has new data */
	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has a
	 * dirty state, save a new state to be ready for.
	 */
	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	mlx5vf_state_mutex_unlock(mvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}
582 
583 static const struct file_operations mlx5vf_save_fops = {
584 	.owner = THIS_MODULE,
585 	.read = mlx5vf_save_read,
586 	.poll = mlx5vf_save_poll,
587 	.unlocked_ioctl = mlx5vf_precopy_ioctl,
588 	.compat_ioctl = compat_ptr_ioctl,
589 	.release = mlx5vf_release_file,
590 	.llseek = no_llseek,
591 };
592 
593 static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
594 {
595 	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
596 	struct mlx5_vhca_data_buffer *buf;
597 	size_t length;
598 	int ret;
599 
600 	if (migf->state == MLX5_MIGF_STATE_ERROR)
601 		return -ENODEV;
602 
603 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
604 				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
605 	if (ret)
606 		goto err;
607 
608 	buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
609 	if (IS_ERR(buf)) {
610 		ret = PTR_ERR(buf);
611 		goto err;
612 	}
613 
614 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
615 	if (ret)
616 		goto err_save;
617 
618 	return 0;
619 
620 err_save:
621 	mlx5vf_put_data_buffer(buf);
622 err:
623 	mlx5vf_mark_err(migf);
624 	return ret;
625 }
626 
627 static struct mlx5_vf_migration_file *
628 mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
629 {
630 	struct mlx5_vf_migration_file *migf;
631 	struct mlx5_vhca_data_buffer *buf;
632 	size_t length;
633 	u64 full_size;
634 	int ret;
635 
636 	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
637 	if (!migf)
638 		return ERR_PTR(-ENOMEM);
639 
640 	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
641 					O_RDONLY);
642 	if (IS_ERR(migf->filp)) {
643 		ret = PTR_ERR(migf->filp);
644 		goto end;
645 	}
646 
647 	migf->mvdev = mvdev;
648 	ret = mlx5vf_cmd_alloc_pd(migf);
649 	if (ret)
650 		goto out_free;
651 
652 	stream_open(migf->filp->f_inode, migf->filp);
653 	mutex_init(&migf->lock);
654 	init_waitqueue_head(&migf->poll_wait);
655 	init_completion(&migf->save_comp);
656 	/*
657 	 * save_comp is being used as a binary semaphore built from
658 	 * a completion. A normal mutex cannot be used because the lock is
659 	 * passed between kernel threads and lockdep can't model this.
660 	 */
661 	complete(&migf->save_comp);
662 	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
663 	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
664 	INIT_LIST_HEAD(&migf->buf_list);
665 	INIT_LIST_HEAD(&migf->avail_list);
666 	spin_lock_init(&migf->list_lock);
667 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
668 	if (ret)
669 		goto out_pd;
670 
671 	ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
672 	if (ret)
673 		goto out_pd;
674 
675 	if (track) {
676 		/* leave the allocated buffer ready for the stop-copy phase */
677 		buf = mlx5vf_alloc_data_buffer(migf,
678 			migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
679 		if (IS_ERR(buf)) {
680 			ret = PTR_ERR(buf);
681 			goto out_pd;
682 		}
683 	} else {
684 		buf = migf->buf[0];
685 		migf->buf[0] = NULL;
686 	}
687 
688 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
689 	if (ret)
690 		goto out_save;
691 	return migf;
692 out_save:
693 	mlx5vf_free_data_buffer(buf);
694 out_pd:
695 	mlx5fv_cmd_clean_migf_resources(migf);
696 out_free:
697 	fput(migf->filp);
698 end:
699 	kfree(migf);
700 	return ERR_PTR(ret);
701 }
702 
703 static int
704 mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
705 			      const char __user **buf, size_t *len,
706 			      loff_t *pos, ssize_t *done)
707 {
708 	unsigned long offset;
709 	size_t page_offset;
710 	struct page *page;
711 	size_t page_len;
712 	u8 *to_buff;
713 	int ret;
714 
715 	offset = *pos - vhca_buf->start_pos;
716 	page_offset = offset % PAGE_SIZE;
717 
718 	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
719 	if (!page)
720 		return -EINVAL;
721 	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
722 	to_buff = kmap_local_page(page);
723 	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
724 	kunmap_local(to_buff);
725 	if (ret)
726 		return -EFAULT;
727 
728 	*pos += page_len;
729 	*done += page_len;
730 	*buf += page_len;
731 	*len -= page_len;
732 	vhca_buf->length += page_len;
733 	return 0;
734 }
735 
736 static ssize_t
737 mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
738 			 struct mlx5_vhca_data_buffer *vhca_buf,
739 			 size_t image_size, const char __user **buf,
740 			 size_t *len, loff_t *pos, ssize_t *done,
741 			 bool *has_work)
742 {
743 	size_t copy_len, to_copy;
744 	int ret;
745 
746 	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
747 	copy_len = to_copy;
748 	while (to_copy) {
749 		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
750 						    done);
751 		if (ret)
752 			return ret;
753 	}
754 
755 	*len -= copy_len;
756 	if (vhca_buf->length == image_size) {
757 		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
758 		migf->max_pos += image_size;
759 		*has_work = true;
760 	}
761 
762 	return 0;
763 }
764 
765 static int
766 mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
767 			       struct mlx5_vhca_data_buffer *vhca_buf,
768 			       const char __user **buf, size_t *len,
769 			       loff_t *pos, ssize_t *done)
770 {
771 	size_t copy_len, to_copy;
772 	size_t required_data;
773 	u8 *to_buff;
774 	int ret;
775 
776 	required_data = migf->record_size - vhca_buf->length;
777 	to_copy = min_t(size_t, *len, required_data);
778 	copy_len = to_copy;
779 	while (to_copy) {
780 		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
781 						    done);
782 		if (ret)
783 			return ret;
784 	}
785 
786 	*len -= copy_len;
787 	if (vhca_buf->length == migf->record_size) {
788 		switch (migf->record_tag) {
789 		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
790 		{
791 			struct page *page;
792 
793 			page = mlx5vf_get_migration_page(vhca_buf, 0);
794 			if (!page)
795 				return -EINVAL;
796 			to_buff = kmap_local_page(page);
797 			migf->stop_copy_prep_size = min_t(u64,
798 				le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
799 			kunmap_local(to_buff);
800 			break;
801 		}
802 		default:
803 			/* Optional tag */
804 			break;
805 		}
806 
807 		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
808 		migf->max_pos += migf->record_size;
809 		vhca_buf->length = 0;
810 	}
811 
812 	return 0;
813 }
814 
815 static int
816 mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
817 			  struct mlx5_vhca_data_buffer *vhca_buf,
818 			  const char __user **buf,
819 			  size_t *len, loff_t *pos,
820 			  ssize_t *done, bool *has_work)
821 {
822 	struct page *page;
823 	size_t copy_len;
824 	u8 *to_buff;
825 	int ret;
826 
827 	copy_len = min_t(size_t, *len,
828 		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
829 	page = mlx5vf_get_migration_page(vhca_buf, 0);
830 	if (!page)
831 		return -EINVAL;
832 	to_buff = kmap_local_page(page);
833 	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
834 	if (ret) {
835 		ret = -EFAULT;
836 		goto end;
837 	}
838 
839 	*buf += copy_len;
840 	*pos += copy_len;
841 	*done += copy_len;
842 	*len -= copy_len;
843 	vhca_buf->length += copy_len;
844 	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
845 		u64 record_size;
846 		u32 flags;
847 
848 		record_size = le64_to_cpup((__le64 *)to_buff);
849 		if (record_size > MAX_LOAD_SIZE) {
850 			ret = -ENOMEM;
851 			goto end;
852 		}
853 
854 		migf->record_size = record_size;
855 		flags = le32_to_cpup((__le32 *)(to_buff +
856 			    offsetof(struct mlx5_vf_migration_header, flags)));
857 		migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
858 			    offsetof(struct mlx5_vf_migration_header, tag)));
859 		switch (migf->record_tag) {
860 		case MLX5_MIGF_HEADER_TAG_FW_DATA:
861 			migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
862 			break;
863 		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
864 			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
865 			break;
866 		default:
867 			if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
868 				ret = -EOPNOTSUPP;
869 				goto end;
870 			}
871 			/* We may read and skip this optional record data */
872 			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
873 		}
874 
875 		migf->max_pos += vhca_buf->length;
876 		vhca_buf->length = 0;
877 		*has_work = true;
878 	}
879 end:
880 	kunmap_local(to_buff);
881 	return ret;
882 }
883 
/*
 * write() handler of the resume migration file.
 *
 * Only stream writes are supported: a non-NULL @pos is rejected with -ESPIPE
 * and the file's own f_pos is used instead. The incoming stream is parsed by
 * a state machine driven by migf->load_state, which loops while there is
 * input left or a step asked to go on (has_work):
 *
 *  READ_HEADER       accumulate a record header in the header buffer
 *  PREP_HEADER_DATA  make sure the header buffer can hold the record payload
 *  READ_HEADER_DATA  accumulate the record payload
 *  PREP_IMAGE        make sure the data buffer can hold the image
 *  READ_IMAGE        accumulate the device image
 *  LOAD_IMAGE        issue the LOAD command and get ready for the next header
 *
 * Runs under the device state mutex and migf->lock. Any failure moves the
 * file to MLX5_MIGF_STATE_ERROR.
 *
 * Returns the number of bytes consumed, or a negative errno: -EINVAL for a
 * negative position or a length that overflows it, -ENODEV if the file is
 * already in error, or the error of the failing step.
 */
static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	/* requested_length is only used to detect the overflow */
	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
							&buf, &len, pos,
							&done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
			/* Replace the header buffer if the payload does not fit */
			if (vhca_buf_header->allocated_length < migf->record_size) {
				mlx5vf_free_data_buffer(vhca_buf_header);

				migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
						migf->record_size, DMA_NONE);
				if (IS_ERR(migf->buf_header[0])) {
					ret = PTR_ERR(migf->buf_header[0]);
					migf->buf_header[0] = NULL;
					goto out_unlock;
				}

				vhca_buf_header = migf->buf_header[0];
			}

			vhca_buf_header->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
			break;
		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
			ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
							&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			/*
			 * Size the buffer for the larger of this image and the
			 * stop-copy size announced by the stream, if any.
			 */
			u64 size = max(migf->record_size,
				       migf->stop_copy_prep_size);

			if (vhca_buf->allocated_length < size) {
				mlx5vf_free_data_buffer(vhca_buf);

				migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
							size, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf[0])) {
					ret = PTR_ERR(migf->buf[0]);
					migf->buf[0] = NULL;
					goto out_unlock;
				}

				vhca_buf = migf->buf[0];
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, vhca_buf,
						migf->record_size,
						&buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			vhca_buf_header->length = 0;
			/* prep data buf for next image */
			vhca_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}
998 
999 static const struct file_operations mlx5vf_resume_fops = {
1000 	.owner = THIS_MODULE,
1001 	.write = mlx5vf_resume_write,
1002 	.release = mlx5vf_release_file,
1003 	.llseek = no_llseek,
1004 };
1005 
1006 static struct mlx5_vf_migration_file *
1007 mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
1008 {
1009 	struct mlx5_vf_migration_file *migf;
1010 	struct mlx5_vhca_data_buffer *buf;
1011 	int ret;
1012 
1013 	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
1014 	if (!migf)
1015 		return ERR_PTR(-ENOMEM);
1016 
1017 	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
1018 					O_WRONLY);
1019 	if (IS_ERR(migf->filp)) {
1020 		ret = PTR_ERR(migf->filp);
1021 		goto end;
1022 	}
1023 
1024 	migf->mvdev = mvdev;
1025 	ret = mlx5vf_cmd_alloc_pd(migf);
1026 	if (ret)
1027 		goto out_free;
1028 
1029 	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
1030 	if (IS_ERR(buf)) {
1031 		ret = PTR_ERR(buf);
1032 		goto out_pd;
1033 	}
1034 
1035 	migf->buf[0] = buf;
1036 	buf = mlx5vf_alloc_data_buffer(migf,
1037 		sizeof(struct mlx5_vf_migration_header), DMA_NONE);
1038 	if (IS_ERR(buf)) {
1039 		ret = PTR_ERR(buf);
1040 		goto out_buf;
1041 	}
1042 
1043 	migf->buf_header[0] = buf;
1044 	migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
1045 
1046 	stream_open(migf->filp->f_inode, migf->filp);
1047 	mutex_init(&migf->lock);
1048 	INIT_LIST_HEAD(&migf->buf_list);
1049 	INIT_LIST_HEAD(&migf->avail_list);
1050 	spin_lock_init(&migf->list_lock);
1051 	return migf;
1052 out_buf:
1053 	mlx5vf_free_data_buffer(migf->buf[0]);
1054 out_pd:
1055 	mlx5vf_cmd_dealloc_pd(migf);
1056 out_free:
1057 	fput(migf->filp);
1058 end:
1059 	kfree(migf);
1060 	return ERR_PTR(ret);
1061 }
1062 
1063 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
1064 			enum mlx5_vf_migf_state *last_save_state)
1065 {
1066 	if (mvdev->resuming_migf) {
1067 		mlx5vf_disable_fd(mvdev->resuming_migf);
1068 		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
1069 		fput(mvdev->resuming_migf->filp);
1070 		mvdev->resuming_migf = NULL;
1071 	}
1072 	if (mvdev->saving_migf) {
1073 		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
1074 		cancel_work_sync(&mvdev->saving_migf->async_data.work);
1075 		if (last_save_state)
1076 			*last_save_state = mvdev->saving_migf->state;
1077 		mlx5vf_disable_fd(mvdev->saving_migf);
1078 		wake_up_interruptible(&mvdev->saving_migf->poll_wait);
1079 		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
1080 		fput(mvdev->saving_migf->filp);
1081 		mvdev->saving_migf = NULL;
1082 	}
1083 }
1084 
1085 static struct file *
1086 mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
1087 				    u32 new)
1088 {
1089 	u32 cur = mvdev->mig_state;
1090 	int ret;
1091 
1092 	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
1093 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
1094 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1095 		if (ret)
1096 			return ERR_PTR(ret);
1097 		return NULL;
1098 	}
1099 
1100 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
1101 		ret = mlx5vf_cmd_resume_vhca(mvdev,
1102 			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
1103 		if (ret)
1104 			return ERR_PTR(ret);
1105 		return NULL;
1106 	}
1107 
1108 	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
1109 	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1110 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
1111 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
1112 		if (ret)
1113 			return ERR_PTR(ret);
1114 		return NULL;
1115 	}
1116 
1117 	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
1118 	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
1119 		ret = mlx5vf_cmd_resume_vhca(mvdev,
1120 			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
1121 		if (ret)
1122 			return ERR_PTR(ret);
1123 		return NULL;
1124 	}
1125 
1126 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
1127 		struct mlx5_vf_migration_file *migf;
1128 
1129 		migf = mlx5vf_pci_save_device_data(mvdev, false);
1130 		if (IS_ERR(migf))
1131 			return ERR_CAST(migf);
1132 		get_file(migf->filp);
1133 		mvdev->saving_migf = migf;
1134 		return migf->filp;
1135 	}
1136 
1137 	if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
1138 		mlx5vf_disable_fds(mvdev, NULL);
1139 		return NULL;
1140 	}
1141 
1142 	if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
1143 	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
1144 	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
1145 		struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
1146 		struct mlx5_vhca_data_buffer *buf;
1147 		enum mlx5_vf_migf_state state;
1148 		size_t size;
1149 
1150 		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL,
1151 					MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
1152 		if (ret)
1153 			return ERR_PTR(ret);
1154 		buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE);
1155 		if (IS_ERR(buf))
1156 			return ERR_CAST(buf);
1157 		/* pre_copy cleanup */
1158 		ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false);
1159 		if (ret) {
1160 			mlx5vf_put_data_buffer(buf);
1161 			return ERR_PTR(ret);
1162 		}
1163 		mlx5vf_disable_fds(mvdev, &state);
1164 		return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO);
1165 	}
1166 
1167 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
1168 		struct mlx5_vf_migration_file *migf;
1169 
1170 		migf = mlx5vf_pci_resume_device_data(mvdev);
1171 		if (IS_ERR(migf))
1172 			return ERR_CAST(migf);
1173 		get_file(migf->filp);
1174 		mvdev->resuming_migf = migf;
1175 		return migf->filp;
1176 	}
1177 
1178 	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
1179 		mlx5vf_disable_fds(mvdev, NULL);
1180 		return NULL;
1181 	}
1182 
1183 	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
1184 	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
1185 	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1186 		struct mlx5_vf_migration_file *migf;
1187 
1188 		migf = mlx5vf_pci_save_device_data(mvdev, true);
1189 		if (IS_ERR(migf))
1190 			return ERR_CAST(migf);
1191 		get_file(migf->filp);
1192 		mvdev->saving_migf = migf;
1193 		return migf->filp;
1194 	}
1195 
1196 	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
1197 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
1198 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1199 		if (ret)
1200 			return ERR_PTR(ret);
1201 		ret = mlx5vf_pci_save_device_inc_data(mvdev);
1202 		return ret ? ERR_PTR(ret) : NULL;
1203 	}
1204 
1205 	/*
1206 	 * vfio_mig_get_next_state() does not use arcs other than the above
1207 	 */
1208 	WARN_ON(true);
1209 	return ERR_PTR(-EINVAL);
1210 }
1211 
1212 /*
1213  * This function is called in all state_mutex unlock cases to
1214  * handle a 'deferred_reset' if exists.
1215  */
1216 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
1217 {
1218 again:
1219 	spin_lock(&mvdev->reset_lock);
1220 	if (mvdev->deferred_reset) {
1221 		mvdev->deferred_reset = false;
1222 		spin_unlock(&mvdev->reset_lock);
1223 		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1224 		mlx5vf_disable_fds(mvdev, NULL);
1225 		goto again;
1226 	}
1227 	mutex_unlock(&mvdev->state_mutex);
1228 	spin_unlock(&mvdev->reset_lock);
1229 }
1230 
1231 static struct file *
1232 mlx5vf_pci_set_device_state(struct vfio_device *vdev,
1233 			    enum vfio_device_mig_state new_state)
1234 {
1235 	struct mlx5vf_pci_core_device *mvdev = container_of(
1236 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1237 	enum vfio_device_mig_state next_state;
1238 	struct file *res = NULL;
1239 	int ret;
1240 
1241 	mutex_lock(&mvdev->state_mutex);
1242 	while (new_state != mvdev->mig_state) {
1243 		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
1244 					      new_state, &next_state);
1245 		if (ret) {
1246 			res = ERR_PTR(ret);
1247 			break;
1248 		}
1249 		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
1250 		if (IS_ERR(res))
1251 			break;
1252 		mvdev->mig_state = next_state;
1253 		if (WARN_ON(res && new_state != mvdev->mig_state)) {
1254 			fput(res);
1255 			res = ERR_PTR(-EINVAL);
1256 			break;
1257 		}
1258 	}
1259 	mlx5vf_state_mutex_unlock(mvdev);
1260 	return res;
1261 }
1262 
1263 static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
1264 				    unsigned long *stop_copy_length)
1265 {
1266 	struct mlx5vf_pci_core_device *mvdev = container_of(
1267 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1268 	size_t state_size;
1269 	u64 total_size;
1270 	int ret;
1271 
1272 	mutex_lock(&mvdev->state_mutex);
1273 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
1274 						    &total_size, 0);
1275 	if (!ret)
1276 		*stop_copy_length = total_size;
1277 	mlx5vf_state_mutex_unlock(mvdev);
1278 	return ret;
1279 }
1280 
1281 static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
1282 				       enum vfio_device_mig_state *curr_state)
1283 {
1284 	struct mlx5vf_pci_core_device *mvdev = container_of(
1285 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1286 
1287 	mutex_lock(&mvdev->state_mutex);
1288 	*curr_state = mvdev->mig_state;
1289 	mlx5vf_state_mutex_unlock(mvdev);
1290 	return 0;
1291 }
1292 
1293 static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
1294 {
1295 	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1296 
1297 	if (!mvdev->migrate_cap)
1298 		return;
1299 
1300 	/*
1301 	 * As the higher VFIO layers are holding locks across reset and using
1302 	 * those same locks with the mm_lock we need to prevent ABBA deadlock
1303 	 * with the state_mutex and mm_lock.
1304 	 * In case the state_mutex was taken already we defer the cleanup work
1305 	 * to the unlock flow of the other running context.
1306 	 */
1307 	spin_lock(&mvdev->reset_lock);
1308 	mvdev->deferred_reset = true;
1309 	if (!mutex_trylock(&mvdev->state_mutex)) {
1310 		spin_unlock(&mvdev->reset_lock);
1311 		return;
1312 	}
1313 	spin_unlock(&mvdev->reset_lock);
1314 	mlx5vf_state_mutex_unlock(mvdev);
1315 }
1316 
1317 static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
1318 {
1319 	struct mlx5vf_pci_core_device *mvdev = container_of(
1320 		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1321 	struct vfio_pci_core_device *vdev = &mvdev->core_device;
1322 	int ret;
1323 
1324 	ret = vfio_pci_core_enable(vdev);
1325 	if (ret)
1326 		return ret;
1327 
1328 	if (mvdev->migrate_cap)
1329 		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1330 	vfio_pci_core_finish_enable(vdev);
1331 	return 0;
1332 }
1333 
1334 static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
1335 {
1336 	struct mlx5vf_pci_core_device *mvdev = container_of(
1337 		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1338 
1339 	mlx5vf_cmd_close_migratable(mvdev);
1340 	vfio_pci_core_close_device(core_vdev);
1341 }
1342 
1343 static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
1344 	.migration_set_state = mlx5vf_pci_set_device_state,
1345 	.migration_get_state = mlx5vf_pci_get_device_state,
1346 	.migration_get_data_size = mlx5vf_pci_get_data_size,
1347 };
1348 
1349 static const struct vfio_log_ops mlx5vf_pci_log_ops = {
1350 	.log_start = mlx5vf_start_page_tracker,
1351 	.log_stop = mlx5vf_stop_page_tracker,
1352 	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
1353 };
1354 
1355 static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
1356 {
1357 	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1358 			struct mlx5vf_pci_core_device, core_device.vdev);
1359 	int ret;
1360 
1361 	ret = vfio_pci_core_init_dev(core_vdev);
1362 	if (ret)
1363 		return ret;
1364 
1365 	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
1366 				  &mlx5vf_pci_log_ops);
1367 
1368 	return 0;
1369 }
1370 
1371 static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
1372 {
1373 	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1374 			struct mlx5vf_pci_core_device, core_device.vdev);
1375 
1376 	mlx5vf_cmd_remove_migratable(mvdev);
1377 	vfio_pci_core_release_dev(core_vdev);
1378 }
1379 
1380 static const struct vfio_device_ops mlx5vf_pci_ops = {
1381 	.name = "mlx5-vfio-pci",
1382 	.init = mlx5vf_pci_init_dev,
1383 	.release = mlx5vf_pci_release_dev,
1384 	.open_device = mlx5vf_pci_open_device,
1385 	.close_device = mlx5vf_pci_close_device,
1386 	.ioctl = vfio_pci_core_ioctl,
1387 	.device_feature = vfio_pci_core_ioctl_feature,
1388 	.read = vfio_pci_core_read,
1389 	.write = vfio_pci_core_write,
1390 	.mmap = vfio_pci_core_mmap,
1391 	.request = vfio_pci_core_request,
1392 	.match = vfio_pci_core_match,
1393 	.bind_iommufd = vfio_iommufd_physical_bind,
1394 	.unbind_iommufd = vfio_iommufd_physical_unbind,
1395 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
1396 	.detach_ioas = vfio_iommufd_physical_detach_ioas,
1397 };
1398 
1399 static int mlx5vf_pci_probe(struct pci_dev *pdev,
1400 			    const struct pci_device_id *id)
1401 {
1402 	struct mlx5vf_pci_core_device *mvdev;
1403 	int ret;
1404 
1405 	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
1406 				  &pdev->dev, &mlx5vf_pci_ops);
1407 	if (IS_ERR(mvdev))
1408 		return PTR_ERR(mvdev);
1409 
1410 	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
1411 	ret = vfio_pci_core_register_device(&mvdev->core_device);
1412 	if (ret)
1413 		goto out_put_vdev;
1414 	return 0;
1415 
1416 out_put_vdev:
1417 	vfio_put_device(&mvdev->core_device.vdev);
1418 	return ret;
1419 }
1420 
1421 static void mlx5vf_pci_remove(struct pci_dev *pdev)
1422 {
1423 	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1424 
1425 	vfio_pci_core_unregister_device(&mvdev->core_device);
1426 	vfio_put_device(&mvdev->core_device.vdev);
1427 }
1428 
1429 static const struct pci_device_id mlx5vf_pci_table[] = {
1430 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
1431 	{}
1432 };
1433 
1434 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
1435 
1436 static const struct pci_error_handlers mlx5vf_err_handlers = {
1437 	.reset_done = mlx5vf_pci_aer_reset_done,
1438 	.error_detected = vfio_pci_core_aer_err_detected,
1439 };
1440 
1441 static struct pci_driver mlx5vf_pci_driver = {
1442 	.name = KBUILD_MODNAME,
1443 	.id_table = mlx5vf_pci_table,
1444 	.probe = mlx5vf_pci_probe,
1445 	.remove = mlx5vf_pci_remove,
1446 	.err_handler = &mlx5vf_err_handlers,
1447 	.driver_managed_dma = true,
1448 };
1449 
1450 module_pci_driver(mlx5vf_pci_driver);
1451 
1452 MODULE_IMPORT_NS(IOMMUFD);
1453 MODULE_LICENSE("GPL");
1454 MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
1455 MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
1456 MODULE_DESCRIPTION(
1457 	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
1458