// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

#define MAX_CHUNK_SIZE SZ_8M

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct mlx5vf_pci_core_device,
			    core_device);
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	mutex_lock(&migf->lock);
	migf->state = MLX5_MIGF_STATE_ERROR;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;

	mlx5vf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}

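/*
 * Return the buffer at the head of the produced-data list when it covers
 * @pos, or NULL otherwise. Since the save FD is a byte stream, a position
 * that is not covered by the first buffer indicates a protocol violation
 * and moves the migration file into the error state.
 */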
static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
			      bool *end_of_data)
{
	struct mlx5_vhca_data_buffer *buf;
	bool found = false;

	*end_of_data = false;
	spin_lock_irq(&migf->list_lock);
	if (list_empty(&migf->buf_list)) {
		*end_of_data = true;
		goto end;
	}

	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
			       buf_elm);
	if (pos >= buf->start_pos &&
	    pos < buf->start_pos + buf->length) {
		found = true;
		goto end;
	}

	/*
	 * As this is a stream based FD, the data is always expected to be
	 * found in the first buffer on the list.
	 */
	migf->state = MLX5_MIGF_STATE_ERROR;

end:
	spin_unlock_irq(&migf->list_lock);
	return found ? buf : NULL;
}

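/*
 * Called once userspace has consumed a whole buffer. In chunk mode the
 * buffer is parked for reuse in its chunk slot and, if the device already
 * reported the size of the next chunk, a save work is queued to fetch it.
 * Otherwise the buffer is simply moved to the available list.
 */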
static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
{
	struct mlx5_vf_migration_file *migf = vhca_buf->migf;

	if (vhca_buf->stop_copy_chunk_num) {
		bool is_header = vhca_buf->dma_dir == DMA_NONE;
		u8 chunk_num = vhca_buf->stop_copy_chunk_num;
		size_t next_required_umem_size = 0;

		if (is_header)
			migf->buf_header[chunk_num - 1] = vhca_buf;
		else
			migf->buf[chunk_num - 1] = vhca_buf;

		spin_lock_irq(&migf->list_lock);
		list_del_init(&vhca_buf->buf_elm);
		if (!is_header) {
			next_required_umem_size =
				migf->next_required_umem_size;
			migf->next_required_umem_size = 0;
			migf->num_ready_chunks--;
		}
		spin_unlock_irq(&migf->list_lock);
		if (next_required_umem_size)
			mlx5vf_mig_file_set_save_work(migf, chunk_num,
						      next_required_umem_size);
		return;
	}

	spin_lock_irq(&migf->list_lock);
	list_del_init(&vhca_buf->buf_elm);
	list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
	spin_unlock_irq(&migf->list_lock);
}

static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
			       char __user **buf, size_t *len, loff_t *pos)
{
	unsigned long offset;
	ssize_t done = 0;
	size_t copy_len;

	copy_len = min_t(size_t,
			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
	while (copy_len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		offset = *pos - vhca_buf->start_pos;
		page_offset = offset % PAGE_SIZE;
		offset -= page_offset;
		page = mlx5vf_get_migration_page(vhca_buf, offset);
		if (!page)
			return -EINVAL;
		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret)
			return -EFAULT;
		*pos += page_len;
		*len -= page_len;
		*buf += page_len;
		done += page_len;
		copy_len -= page_len;
	}

	if (*pos >= vhca_buf->start_pos + vhca_buf->length)
		mlx5vf_buf_read_done(vhca_buf);

	return done;
}

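/*
 * read() handler of the save FD. Data is exposed as a byte stream: reads
 * drain the head of the produced-buffer list page by page. A blocking read
 * waits until data is available or the migration reaches a terminal state;
 * -ENOMSG signals the temporary end of stream in the PRE_COPY phase.
 */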
static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
			       loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
				!list_empty(&migf->buf_list) ||
				migf->state == MLX5_MIGF_STATE_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							 &end_of_data);
		if (first_loop_call) {
			first_loop_call = false;
			/* Temporary end of file as part of PRE_COPY */
			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
				done = -ENOMSG;
				goto out_unlock;
			}

			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
				if (filp->f_flags & O_NONBLOCK) {
					done = -EAGAIN;
					goto out_unlock;
				}
			}
		}

		if (end_of_data)
			goto out_unlock;

		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static __poll_t mlx5vf_save_poll(struct file *filp,
				 struct poll_table_struct *wait)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	__poll_t pollflags = 0;

	poll_wait(filp, &migf->poll_wait, wait);

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
	else if (!list_empty(&migf->buf_list) ||
		 migf->state == MLX5_MIGF_STATE_COMPLETE)
		pollflags = EPOLLIN | EPOLLRDNORM;
	mutex_unlock(&migf->lock);

	return pollflags;
}

/*
 * The FD stays exposed and the user can keep using it after an error has
 * been hit. Mark the migration file as being in error and wake up any
 * waiting reader.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
	migf->state = MLX5_MIGF_STATE_ERROR;
	wake_up_interruptible(&migf->poll_wait);
}

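/*
 * Queue an asynchronous save of chunk @chunk_num, sized according to the
 * device-reported @next_required_umem_size. A reference on the save FD is
 * taken here and released by the work once it completes.
 */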
void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
				   u8 chunk_num, size_t next_required_umem_size)
{
	migf->save_data[chunk_num - 1].next_required_umem_size =
			next_required_umem_size;
	migf->save_data[chunk_num - 1].migf = migf;
	get_file(migf->filp);
	queue_work(migf->mvdev->cb_wq,
		   &migf->save_data[chunk_num - 1].work);
}

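/*
 * Take the pre-allocated stop-copy buffer for slot @index. If it is too
 * small for @required_length, it is returned to the pool and a larger
 * buffer is obtained instead, inheriting the chunk number.
 */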
static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
				  u8 index, size_t required_length)
{
	u32 npages = DIV_ROUND_UP(required_length, PAGE_SIZE);
	struct mlx5_vhca_data_buffer *buf = migf->buf[index];
	u8 chunk_num;

	WARN_ON(!buf);
	chunk_num = buf->stop_copy_chunk_num;
	buf->migf->buf[index] = NULL;
	/* Checking whether the pre-allocated buffer can fit */
	if (buf->npages >= npages)
		return buf;

	mlx5vf_put_data_buffer(buf);
	buf = mlx5vf_get_data_buffer(buf->migf, npages, DMA_FROM_DEVICE);
	if (IS_ERR(buf))
		return buf;

	buf->stop_copy_chunk_num = chunk_num;
	return buf;
}

static void mlx5vf_mig_file_save_work(struct work_struct *_work)
{
	struct mlx5vf_save_work_data *save_data = container_of(_work,
		struct mlx5vf_save_work_data, work);
	struct mlx5_vf_migration_file *migf = save_data->migf;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;

	mutex_lock(&mvdev->state_mutex);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		goto end;

	buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
				save_data->chunk_num - 1,
				save_data->next_required_umem_size);
	if (IS_ERR(buf))
		goto err;

	if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
		goto err_save;

	goto end;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	fput(migf->filp);
}

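/*
 * Emit a header record (MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE) that advertises
 * the expected stop-copy image size to the receiving side. The record is
 * flagged as optional, so a receiver that does not understand it can skip
 * it. When pre-copy tracking is enabled, the record size is accounted as
 * pre_copy_initial_bytes.
 */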
static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
				       bool track)
{
	size_t size = sizeof(struct mlx5_vf_migration_header) +
		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
	struct mlx5_vf_migration_tag_stop_copy_data data = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;
	int ret;

	header_buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(size, PAGE_SIZE),
					    DMA_NONE);
	if (IS_ERR(header_buf))
		return PTR_ERR(header_buf);

	header.record_size = cpu_to_le64(sizeof(data));
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page) {
		ret = -EINVAL;
		goto err;
	}
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	header_buf->length = sizeof(header);
	data.stop_copy_size = cpu_to_le64(migf->buf[0]->npages * PAGE_SIZE);
	memcpy(to_buff + sizeof(header), &data, sizeof(data));
	header_buf->length += sizeof(data);
	kunmap_local(to_buff);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	if (track)
		migf->pre_copy_initial_bytes = size;
	return 0;
err:
	mlx5vf_put_data_buffer(header_buf);
	return ret;
}

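/*
 * Pre-allocate the data and header buffers used during stop-copy. In chunk
 * mode one pair is prepared per chunk and the per-chunk save work is
 * initialized; otherwise a single pair, sized for the full state (with some
 * headroom when pre-copy tracking is enabled), is prepared.
 */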
static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
				 struct mlx5_vf_migration_file *migf,
				 size_t state_size, u64 full_size,
				 bool track)
{
	struct mlx5_vhca_data_buffer *buf;
	size_t inc_state_size;
	int num_chunks;
	int ret;
	int i;

	if (mvdev->chunk_mode) {
		size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);

		/* from the firmware's perspective, at least a 'state_size' buffer should be set */
		inc_state_size = max(state_size, chunk_size);
	} else {
		if (track) {
			/* let's be ready for a stop_copy size that might grow by 10 percent */
			if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
				inc_state_size = state_size;
		} else {
			inc_state_size = state_size;
		}
	}

	/* let's not overflow the device specification max SAVE size */
	inc_state_size = min_t(size_t, inc_state_size,
		(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));

	num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
	for (i = 0; i < num_chunks; i++) {
		buf = mlx5vf_get_data_buffer(
			migf, DIV_ROUND_UP(inc_state_size, PAGE_SIZE),
			DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}

		migf->buf[i] = buf;
		buf = mlx5vf_get_data_buffer(
			migf,
			DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
				     PAGE_SIZE),
			DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}
		migf->buf_header[i] = buf;
		if (mvdev->chunk_mode) {
			migf->buf[i]->stop_copy_chunk_num = i + 1;
			migf->buf_header[i]->stop_copy_chunk_num = i + 1;
			INIT_WORK(&migf->save_data[i].work,
				  mlx5vf_mig_file_save_work);
			migf->save_data[i].chunk_num = i + 1;
		}
	}

	ret = mlx5vf_add_stop_copy_header(migf, track);
	if (ret)
		goto err;
	return 0;

err:
	for (i = 0; i < num_chunks; i++) {
		if (migf->buf[i]) {
			mlx5vf_put_data_buffer(migf->buf[i]);
			migf->buf[i] = NULL;
		}
		if (migf->buf_header[i]) {
			mlx5vf_put_data_buffer(migf->buf_header[i]);
			migf->buf_header[i] = NULL;
		}
	}

	return ret;
}

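/*
 * VFIO_MIG_GET_PRECOPY_INFO handler of the save FD. Reports how many bytes
 * of the initial pre-copy data are still pending and how many dirty bytes
 * the device currently has. When the already produced data has been fully
 * read and more dirty state exists, a new incremental SAVE is kicked off so
 * that the next read has data to return.
 */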
static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data = false;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command when the device is suspended, so in
	 * VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query for
	 * extra bytes that can't be read anyway.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns it's guaranteed that there is no
		 * active SAVE command.
		 * As such, the code below is safe with the proper locks held.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    NULL, MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	if (migf->pre_copy_initial_bytes > *pos) {
		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
	} else {
		info.dirty_bytes = migf->max_pos - *pos;
		if (!info.dirty_bytes)
			end_of_data = true;
		info.dirty_bytes += inc_length;
	}

	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has
	 * additional dirty state; save a new state so it is ready to be read.
	 */
	buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE),
				     DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	mlx5vf_state_mutex_unlock(mvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static const struct file_operations mlx5vf_save_fops = {
	.owner = THIS_MODULE,
	.read = mlx5vf_save_read,
	.poll = mlx5vf_save_poll,
	.unlocked_ioctl = mlx5vf_precopy_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.release = mlx5vf_release_file,
};

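/*
 * Trigger the final (stop-copy) incremental save: query the remaining state
 * size, grab a suitably sized stop-copy buffer and issue an asynchronous
 * SAVE command. Any failure marks the migration file in error.
 */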
static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	if (migf->state == MLX5_MIGF_STATE_ERROR)
		return -ENODEV;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
	if (ret)
		goto err;

	buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto err;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
	if (ret)
		goto err_save;

	return 0;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
	return ret;
}

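/*
 * Create the save migration file: allocate the anonymous read-only FD, set
 * up the async command context and buffer lists, pre-allocate the stop-copy
 * buffers and issue the initial SAVE. With @track set, the save starts in
 * pre-copy mode and further incremental data can follow on the same stream.
 */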
static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	u64 full_size;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		kfree(migf);
		return ERR_PTR(ret);
	}

	migf->mvdev = mvdev;
	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	init_waitqueue_head(&migf->poll_wait);
	init_completion(&migf->save_comp);
	/*
	 * save_comp is being used as a binary semaphore built from
	 * a completion. A normal mutex cannot be used because the lock is
	 * passed between kernel threads and lockdep can't model this.
	 */
	complete(&migf->save_comp);
	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);

	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
	if (ret)
		goto out_pd;

	ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
	if (ret)
		goto out_pd;

	if (track) {
		/* leave the allocated buffer ready for the stop-copy phase */
		buf = mlx5vf_alloc_data_buffer(migf, migf->buf[0]->npages,
					       DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_pd;
		}
	} else {
		buf = migf->buf[0];
		migf->buf[0] = NULL;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
	if (ret)
		goto out_save;
	return migf;
out_save:
	mlx5vf_free_data_buffer(buf);
out_pd:
	mlx5fv_cmd_clean_migf_resources(migf);
out:
	fput(migf->filp);
	return ERR_PTR(ret);
}

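/*
 * Copy up to one page worth of user data at *pos into the migration buffer,
 * advancing the user pointer, the stream position and the buffer length
 * accordingly. Callers loop on this until the requested span is consumed.
 */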
static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
			      const char __user **buf, size_t *len,
			      loff_t *pos, ssize_t *done)
{
	unsigned long offset;
	size_t page_offset;
	struct page *page;
	size_t page_len;
	u8 *to_buff;
	int ret;

	offset = *pos - vhca_buf->start_pos;
	page_offset = offset % PAGE_SIZE;

	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
	if (!page)
		return -EINVAL;
	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
	kunmap_local(to_buff);
	if (ret)
		return -EFAULT;

	*pos += page_len;
	*done += page_len;
	*buf += page_len;
	*len -= page_len;
	vhca_buf->length += page_len;
	return 0;
}

static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
			 struct mlx5_vhca_data_buffer *vhca_buf,
			 size_t image_size, const char __user **buf,
			 size_t *len, loff_t *pos, ssize_t *done,
			 bool *has_work)
{
	size_t copy_len, to_copy;
	int ret;

	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == image_size) {
		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
		migf->max_pos += image_size;
		*has_work = true;
	}

	return 0;
}

static int
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *vhca_buf,
			       const char __user **buf, size_t *len,
			       loff_t *pos, ssize_t *done)
{
	size_t copy_len, to_copy;
	size_t required_data;
	u8 *to_buff;
	int ret;

	required_data = migf->record_size - vhca_buf->length;
	to_copy = min_t(size_t, *len, required_data);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == migf->record_size) {
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
		{
			struct page *page;

			page = mlx5vf_get_migration_page(vhca_buf, 0);
			if (!page)
				return -EINVAL;
			to_buff = kmap_local_page(page);
			migf->stop_copy_prep_size = min_t(u64,
				le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
			kunmap_local(to_buff);
			break;
		}
		default:
			/* Optional tag */
			break;
		}

		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
		migf->max_pos += migf->record_size;
		vhca_buf->length = 0;
	}

	return 0;
}

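/*
 * Accumulate a full migration record header from the user stream, then
 * decode its size, flags and tag to pick the next load state. Unknown tags
 * are only tolerated when marked optional, in which case their data is read
 * and discarded.
 */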
static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
			  struct mlx5_vhca_data_buffer *vhca_buf,
			  const char __user **buf,
			  size_t *len, loff_t *pos,
			  ssize_t *done, bool *has_work)
{
	struct page *page;
	size_t copy_len;
	u8 *to_buff;
	int ret;

	copy_len = min_t(size_t, *len,
		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
	page = mlx5vf_get_migration_page(vhca_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
	if (ret) {
		ret = -EFAULT;
		goto end;
	}

	*buf += copy_len;
	*pos += copy_len;
	*done += copy_len;
	*len -= copy_len;
	vhca_buf->length += copy_len;
	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
		u64 record_size;
		u32 flags;

		record_size = le64_to_cpup((__le64 *)to_buff);
		if (record_size > MAX_LOAD_SIZE) {
			ret = -ENOMEM;
			goto end;
		}

		migf->record_size = record_size;
		flags = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, flags)));
		migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, tag)));
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_FW_DATA:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
			break;
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
			break;
		default:
			if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
				ret = -EOPNOTSUPP;
				goto end;
			}
			/* We may read and skip this optional record data */
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
		}

		migf->max_pos += vhca_buf->length;
		vhca_buf->length = 0;
		*has_work = true;
	}
end:
	kunmap_local(to_buff);
	return ret;
}

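/*
 * write() handler of the resume FD. Incoming data is parsed by a small
 * state machine: read a record header, optionally grow the staging buffers
 * to the advertised size, accumulate the record or image payload and, once
 * an image is complete, load it into the device.
 */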
static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
							&buf, &len, pos,
							&done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
		{
			u32 npages = DIV_ROUND_UP(migf->record_size, PAGE_SIZE);

			if (vhca_buf_header->npages < npages) {
				mlx5vf_free_data_buffer(vhca_buf_header);

				migf->buf_header[0] = mlx5vf_alloc_data_buffer(
					migf, npages, DMA_NONE);
				if (IS_ERR(migf->buf_header[0])) {
					ret = PTR_ERR(migf->buf_header[0]);
					migf->buf_header[0] = NULL;
					goto out_unlock;
				}

				vhca_buf_header = migf->buf_header[0];
			}

			vhca_buf_header->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
			ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
							&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			u64 size = max(migf->record_size,
				       migf->stop_copy_prep_size);
			u32 npages = DIV_ROUND_UP(size, PAGE_SIZE);

			if (vhca_buf->npages < npages) {
				mlx5vf_free_data_buffer(vhca_buf);

				migf->buf[0] = mlx5vf_alloc_data_buffer(
					migf, npages, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf[0])) {
					ret = PTR_ERR(migf->buf[0]);
					migf->buf[0] = NULL;
					goto out_unlock;
				}

				vhca_buf = migf->buf[0];
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, vhca_buf,
						migf->record_size,
						&buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			vhca_buf_header->length = 0;
			/* prep data buf for next image */
			vhca_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}

static const struct file_operations mlx5vf_resume_fops = {
	.owner = THIS_MODULE,
	.write = mlx5vf_resume_write,
	.release = mlx5vf_release_file,
};

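/*
 * Create the resume migration file: allocate the write-only anonymous FD,
 * the protection domain and the initial header/data staging buffers, and
 * start the load state machine at READ_HEADER.
 */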
static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		kfree(migf);
		return ERR_PTR(ret);
	}

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out;

	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	migf->buf[0] = buf;
	buf = mlx5vf_alloc_data_buffer(
		migf,
		DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
			     PAGE_SIZE),
		DMA_NONE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_buf;
	}

	migf->buf_header[0] = buf;
	migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

	return migf;
out_buf:
	mlx5vf_free_data_buffer(migf->buf[0]);
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out:
	fput(migf->filp);
	return ERR_PTR(ret);
}

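/*
 * Tear down any active migration files. For the saving side the async
 * command context is cleaned up first and the last observed file state can
 * be reported back through @last_save_state.
 */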
void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
			enum mlx5_vf_migf_state *last_save_state)
{
	if (mvdev->resuming_migf) {
		mlx5vf_disable_fd(mvdev->resuming_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
		fput(mvdev->resuming_migf->filp);
		mvdev->resuming_migf = NULL;
	}
	if (mvdev->saving_migf) {
		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
		cancel_work_sync(&mvdev->saving_migf->async_data.work);
		if (last_save_state)
			*last_save_state = mvdev->saving_migf->state;
		mlx5vf_disable_fd(mvdev->saving_migf);
		wake_up_interruptible(&mvdev->saving_migf->poll_wait);
		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
		fput(mvdev->saving_migf->filp);
		mvdev->saving_migf = NULL;
	}
}

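/*
 * Execute a single arc of the VFIO device migration FSM. Arcs either issue
 * suspend/resume VHCA commands, create a save or resume FD (returned to the
 * caller), or tear the migration files down.
 */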
static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, false);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
		mlx5vf_disable_fds(mvdev, NULL);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
		struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
		struct mlx5_vhca_data_buffer *buf;
		enum mlx5_vf_migf_state state;
		size_t size;

		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL,
					MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
		if (ret)
			return ERR_PTR(ret);
		buf = mlx5vf_get_data_buffer(migf,
				DIV_ROUND_UP(size, PAGE_SIZE), DMA_FROM_DEVICE);
		if (IS_ERR(buf))
			return ERR_CAST(buf);
		/* pre_copy cleanup */
		ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false);
		if (ret) {
			mlx5vf_put_data_buffer(buf);
			return ERR_PTR(ret);
		}
		mlx5vf_disable_fds(mvdev, &state);
		return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO);
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		mlx5vf_disable_fds(mvdev, NULL);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, true);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		ret = mlx5vf_pci_save_device_inc_data(mvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}

/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset', if one exists.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev, NULL);
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}

static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
			    enum vfio_device_mig_state new_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	while (new_state != mvdev->mig_state) {
		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
					      new_state, &next_state);
		if (ret) {
			res = ERR_PTR(ret);
			break;
		}
		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
		if (IS_ERR(res))
			break;
		mvdev->mig_state = next_state;
		if (WARN_ON(res && new_state != mvdev->mig_state)) {
			fput(res);
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	mlx5vf_state_mutex_unlock(mvdev);
	return res;
}

static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
				    unsigned long *stop_copy_length)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	size_t state_size;
	u64 total_size;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
						    &total_size, 0);
	if (!ret)
		*stop_copy_length = total_size;
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	if (!mvdev->migrate_cap)
		return;

	/*
	 * As the higher VFIO layers are holding locks across reset and using
	 * those same locks with the mm_lock, we need to prevent an ABBA
	 * deadlock with the state_mutex and mm_lock.
	 * In case the state_mutex was already taken, we defer the cleanup
	 * work to the unlock flow of the other running context.
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
	}
	spin_unlock(&mvdev->reset_lock);
	mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct vfio_pci_core_device *vdev = &mvdev->core_device;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (mvdev->migrate_cap)
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_close_migratable(mvdev);
	vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
	.migration_set_state = mlx5vf_pci_set_device_state,
	.migration_get_state = mlx5vf_pci_get_device_state,
	.migration_get_data_size = mlx5vf_pci_get_data_size,
};

static const struct vfio_log_ops mlx5vf_pci_log_ops = {
	.log_start = mlx5vf_start_page_tracker,
	.log_stop = mlx5vf_stop_page_tracker,
	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
};

static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);
	int ret;

	ret = vfio_pci_core_init_dev(core_vdev);
	if (ret)
		return ret;

	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
				  &mlx5vf_pci_log_ops);

	return 0;
}

static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_remove_migratable(mvdev);
	vfio_pci_core_release_dev(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name = "mlx5-vfio-pci",
	.init = mlx5vf_pci_init_dev,
	.release = mlx5vf_pci_release_dev,
	.open_device = mlx5vf_pci_open_device,
	.close_device = mlx5vf_pci_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};

static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
{
	struct mlx5vf_pci_core_device *mvdev;
	int ret;

	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
				  &pdev->dev, &mlx5vf_pci_ops);
	if (IS_ERR(mvdev))
		return PTR_ERR(mvdev);

	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
	ret = vfio_pci_core_register_device(&mvdev->core_device);
	if (ret)
		goto out_put_vdev;
	return 0;

out_put_vdev:
	vfio_put_device(&mvdev->core_device.vdev);
	return ret;
}

static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	vfio_put_device(&mvdev->core_device.vdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(mlx5vf_pci_driver);

MODULE_IMPORT_NS("IOMMUFD");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");