1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4 */
5
6 #include <linux/device.h>
7 #include <linux/eventfd.h>
8 #include <linux/file.h>
9 #include <linux/interrupt.h>
10 #include <linux/iommu.h>
11 #include <linux/module.h>
12 #include <linux/mutex.h>
13 #include <linux/notifier.h>
14 #include <linux/pci.h>
15 #include <linux/pm_runtime.h>
16 #include <linux/types.h>
17 #include <linux/uaccess.h>
18 #include <linux/vfio.h>
19 #include <linux/sched/mm.h>
20 #include <linux/anon_inodes.h>
21
22 #include "cmd.h"
23
24 /* Device specification max LOAD size */
25 #define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
26
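/*
 * Upper bound used when pre-sizing the per-chunk SAVE buffers in chunk mode
 * (see mlx5vf_prep_stop_copy()); presumably chosen to bound per-chunk memory
 * consumption.
 */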
27 #define MAX_CHUNK_SIZE SZ_8M
28
static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
30 {
31 struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
32
33 return container_of(core_device, struct mlx5vf_pci_core_device,
34 core_device);
35 }
36
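/*
 * Put the migration file into the error state and rewind its position so
 * that subsequent read()/write() callers bail out cleanly.
 */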
static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
38 {
39 mutex_lock(&migf->lock);
40 migf->state = MLX5_MIGF_STATE_ERROR;
41 migf->filp->f_pos = 0;
42 mutex_unlock(&migf->lock);
43 }
44
static int mlx5vf_release_file(struct inode *inode, struct file *filp)
46 {
47 struct mlx5_vf_migration_file *migf = filp->private_data;
48
49 mlx5vf_disable_fd(migf);
50 mutex_destroy(&migf->lock);
51 kfree(migf);
52 return 0;
53 }
54
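/*
 * Return the data buffer that covers @pos, or NULL with *end_of_data set
 * when the list is empty. As the save FD is a byte stream, @pos is expected
 * to land inside the first buffer on the list.
 */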
55 static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
			      bool *end_of_data)
58 {
59 struct mlx5_vhca_data_buffer *buf;
60 bool found = false;
61
62 *end_of_data = false;
63 spin_lock_irq(&migf->list_lock);
64 if (list_empty(&migf->buf_list)) {
65 *end_of_data = true;
66 goto end;
67 }
68
69 buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
70 buf_elm);
71 if (pos >= buf->start_pos &&
72 pos < buf->start_pos + buf->length) {
73 found = true;
74 goto end;
75 }
76
	/*
	 * Since this is a stream-based FD, the data is expected to always be
	 * in the first chunk; anything else indicates an error.
	 */
81 migf->state = MLX5_MIGF_STATE_ERROR;
82
83 end:
84 spin_unlock_irq(&migf->list_lock);
85 return found ? buf : NULL;
86 }
87
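/*
 * Called once userspace has fully consumed a buffer. Stop-copy chunk buffers
 * are parked back into migf->buf[]/migf->buf_header[] for reuse, and another
 * SAVE is scheduled if a further required umem size was reported; regular
 * buffers are simply recycled onto the avail_list.
 */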
static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
89 {
90 struct mlx5_vf_migration_file *migf = vhca_buf->migf;
91
92 if (vhca_buf->stop_copy_chunk_num) {
93 bool is_header = vhca_buf->dma_dir == DMA_NONE;
94 u8 chunk_num = vhca_buf->stop_copy_chunk_num;
95 size_t next_required_umem_size = 0;
96
97 if (is_header)
98 migf->buf_header[chunk_num - 1] = vhca_buf;
99 else
100 migf->buf[chunk_num - 1] = vhca_buf;
101
102 spin_lock_irq(&migf->list_lock);
103 list_del_init(&vhca_buf->buf_elm);
104 if (!is_header) {
105 next_required_umem_size =
106 migf->next_required_umem_size;
107 migf->next_required_umem_size = 0;
108 migf->num_ready_chunks--;
109 }
110 spin_unlock_irq(&migf->list_lock);
111 if (next_required_umem_size)
112 mlx5vf_mig_file_set_save_work(migf, chunk_num,
113 next_required_umem_size);
114 return;
115 }
116
117 spin_lock_irq(&migf->list_lock);
118 list_del_init(&vhca_buf->buf_elm);
119 list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
120 spin_unlock_irq(&migf->list_lock);
121 }
122
static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
			       char __user **buf, size_t *len, loff_t *pos)
125 {
126 unsigned long offset;
127 ssize_t done = 0;
128 size_t copy_len;
129
130 copy_len = min_t(size_t,
131 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
132 while (copy_len) {
133 size_t page_offset;
134 struct page *page;
135 size_t page_len;
136 u8 *from_buff;
137 int ret;
138
139 offset = *pos - vhca_buf->start_pos;
140 page_offset = offset % PAGE_SIZE;
141 offset -= page_offset;
142 page = mlx5vf_get_migration_page(vhca_buf, offset);
143 if (!page)
144 return -EINVAL;
145 page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
146 from_buff = kmap_local_page(page);
147 ret = copy_to_user(*buf, from_buff + page_offset, page_len);
148 kunmap_local(from_buff);
149 if (ret)
150 return -EFAULT;
151 *pos += page_len;
152 *len -= page_len;
153 *buf += page_len;
154 done += page_len;
155 copy_len -= page_len;
156 }
157
158 if (*pos >= vhca_buf->start_pos + vhca_buf->length)
159 mlx5vf_buf_read_done(vhca_buf);
160
161 return done;
162 }
163
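/*
 * read() handler of the save FD. Unless O_NONBLOCK is set, wait until data is
 * queued or the file reaches a state in which no further waiting makes sense
 * (PRE_COPY, COMPLETE or an error), then copy out whatever is currently
 * available from the queued buffers.
 */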
static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
				loff_t *pos)
166 {
167 struct mlx5_vf_migration_file *migf = filp->private_data;
168 struct mlx5_vhca_data_buffer *vhca_buf;
169 bool first_loop_call = true;
170 bool end_of_data;
171 ssize_t done = 0;
172
173 if (pos)
174 return -ESPIPE;
175 pos = &filp->f_pos;
176
177 if (!(filp->f_flags & O_NONBLOCK)) {
178 if (wait_event_interruptible(migf->poll_wait,
179 !list_empty(&migf->buf_list) ||
180 migf->state == MLX5_MIGF_STATE_ERROR ||
181 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
182 migf->state == MLX5_MIGF_STATE_PRE_COPY ||
183 migf->state == MLX5_MIGF_STATE_COMPLETE))
184 return -ERESTARTSYS;
185 }
186
187 mutex_lock(&migf->lock);
188 if (migf->state == MLX5_MIGF_STATE_ERROR) {
189 done = -ENODEV;
190 goto out_unlock;
191 }
192
193 while (len) {
194 ssize_t count;
195
196 vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
197 &end_of_data);
198 if (first_loop_call) {
199 first_loop_call = false;
200 /* Temporary end of file as part of PRE_COPY */
201 if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
202 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
203 done = -ENOMSG;
204 goto out_unlock;
205 }
206
207 if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
208 if (filp->f_flags & O_NONBLOCK) {
209 done = -EAGAIN;
210 goto out_unlock;
211 }
212 }
213 }
214
215 if (end_of_data)
216 goto out_unlock;
217
218 if (!vhca_buf) {
219 done = -EINVAL;
220 goto out_unlock;
221 }
222
223 count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
224 if (count < 0) {
225 done = count;
226 goto out_unlock;
227 }
228 done += count;
229 }
230
231 out_unlock:
232 mutex_unlock(&migf->lock);
233 return done;
234 }
235
static __poll_t mlx5vf_save_poll(struct file *filp,
				 struct poll_table_struct *wait)
238 {
239 struct mlx5_vf_migration_file *migf = filp->private_data;
240 __poll_t pollflags = 0;
241
242 poll_wait(filp, &migf->poll_wait, wait);
243
244 mutex_lock(&migf->lock);
245 if (migf->state == MLX5_MIGF_STATE_ERROR)
246 pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
247 else if (!list_empty(&migf->buf_list) ||
248 migf->state == MLX5_MIGF_STATE_COMPLETE)
249 pollflags = EPOLLIN | EPOLLRDNORM;
250 mutex_unlock(&migf->lock);
251
252 return pollflags;
253 }
254
/*
 * The FD is exposed to userspace and may still be used after an error has
 * occurred. Mark the migration file as being in error and wake up any waiter.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
260 {
261 migf->state = MLX5_MIGF_STATE_ERROR;
262 wake_up_interruptible(&migf->poll_wait);
263 }
264
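/*
 * Queue a deferred SAVE for the given chunk. The reference taken here on the
 * migration file is dropped by mlx5vf_mig_file_save_work() once the work has
 * run.
 */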
void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
				   u8 chunk_num, size_t next_required_umem_size)
267 {
268 migf->save_data[chunk_num - 1].next_required_umem_size =
269 next_required_umem_size;
270 migf->save_data[chunk_num - 1].migf = migf;
271 get_file(migf->filp);
272 queue_work(migf->mvdev->cb_wq,
273 &migf->save_data[chunk_num - 1].work);
274 }
275
276 static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
				  u8 index, size_t required_length)
279 {
280 u32 npages = DIV_ROUND_UP(required_length, PAGE_SIZE);
281 struct mlx5_vhca_data_buffer *buf = migf->buf[index];
282 u8 chunk_num;
283
284 WARN_ON(!buf);
285 chunk_num = buf->stop_copy_chunk_num;
286 buf->migf->buf[index] = NULL;
287 /* Checking whether the pre-allocated buffer can fit */
288 if (buf->npages >= npages)
289 return buf;
290
291 mlx5vf_put_data_buffer(buf);
292 buf = mlx5vf_get_data_buffer(buf->migf, npages, DMA_FROM_DEVICE);
293 if (IS_ERR(buf))
294 return buf;
295
296 buf->stop_copy_chunk_num = chunk_num;
297 return buf;
298 }
299
static void mlx5vf_mig_file_save_work(struct work_struct *_work)
301 {
302 struct mlx5vf_save_work_data *save_data = container_of(_work,
303 struct mlx5vf_save_work_data, work);
304 struct mlx5_vf_migration_file *migf = save_data->migf;
305 struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
306 struct mlx5_vhca_data_buffer *buf;
307
308 mutex_lock(&mvdev->state_mutex);
309 if (migf->state == MLX5_MIGF_STATE_ERROR)
310 goto end;
311
312 buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
313 save_data->chunk_num - 1,
314 save_data->next_required_umem_size);
315 if (IS_ERR(buf))
316 goto err;
317
318 if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
319 goto err_save;
320
321 goto end;
322
323 err_save:
324 mlx5vf_put_data_buffer(buf);
325 err:
326 mlx5vf_mark_err(migf);
327 end:
328 mlx5vf_state_mutex_unlock(mvdev);
329 fput(migf->filp);
330 }
331
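/*
 * Emit an optional tagged record at the start of the stream advertising the
 * expected stop_copy image size, allowing the destination to pre-size its
 * buffers (consumed via MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE on the resume
 * side).
 */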
static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
				       bool track)
334 {
335 size_t size = sizeof(struct mlx5_vf_migration_header) +
336 sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
337 struct mlx5_vf_migration_tag_stop_copy_data data = {};
338 struct mlx5_vhca_data_buffer *header_buf = NULL;
339 struct mlx5_vf_migration_header header = {};
340 unsigned long flags;
341 struct page *page;
342 u8 *to_buff;
343 int ret;
344
345 header_buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(size, PAGE_SIZE),
346 DMA_NONE);
347 if (IS_ERR(header_buf))
348 return PTR_ERR(header_buf);
349
350 header.record_size = cpu_to_le64(sizeof(data));
351 header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
352 header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
353 page = mlx5vf_get_migration_page(header_buf, 0);
354 if (!page) {
355 ret = -EINVAL;
356 goto err;
357 }
358 to_buff = kmap_local_page(page);
359 memcpy(to_buff, &header, sizeof(header));
360 header_buf->length = sizeof(header);
361 data.stop_copy_size = cpu_to_le64(migf->buf[0]->npages * PAGE_SIZE);
362 memcpy(to_buff + sizeof(header), &data, sizeof(data));
363 header_buf->length += sizeof(data);
364 kunmap_local(to_buff);
365 header_buf->start_pos = header_buf->migf->max_pos;
366 migf->max_pos += header_buf->length;
367 spin_lock_irqsave(&migf->list_lock, flags);
368 list_add_tail(&header_buf->buf_elm, &migf->buf_list);
369 spin_unlock_irqrestore(&migf->list_lock, flags);
370 if (track)
371 migf->pre_copy_initial_bytes = size;
372 return 0;
373 err:
374 mlx5vf_put_data_buffer(header_buf);
375 return ret;
376 }
377
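/*
 * Pre-allocate the data and header buffers used by the stop_copy SAVE flow
 * (one pair per chunk in chunk mode), likely to keep allocations out of the
 * downtime-sensitive stop_copy phase, and emit the stop_copy size record.
 */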
static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
				 struct mlx5_vf_migration_file *migf,
				 size_t state_size, u64 full_size,
				 bool track)
382 {
383 struct mlx5_vhca_data_buffer *buf;
384 size_t inc_state_size;
385 int num_chunks;
386 int ret;
387 int i;
388
389 if (mvdev->chunk_mode) {
390 size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);
391
		/* From the firmware's perspective, a buffer of at least 'state_size' must be provided */
393 inc_state_size = max(state_size, chunk_size);
394 } else {
395 if (track) {
			/* Be ready for a stop_copy size that might grow by up to 10 percent */
397 if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
398 inc_state_size = state_size;
399 } else {
400 inc_state_size = state_size;
401 }
402 }
403
	/* Don't exceed the device specification's maximum SAVE size */
405 inc_state_size = min_t(size_t, inc_state_size,
406 (BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
407
408 num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
409 for (i = 0; i < num_chunks; i++) {
410 buf = mlx5vf_get_data_buffer(
411 migf, DIV_ROUND_UP(inc_state_size, PAGE_SIZE),
412 DMA_FROM_DEVICE);
413 if (IS_ERR(buf)) {
414 ret = PTR_ERR(buf);
415 goto err;
416 }
417
418 migf->buf[i] = buf;
419 buf = mlx5vf_get_data_buffer(
420 migf,
421 DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
422 PAGE_SIZE),
423 DMA_NONE);
424 if (IS_ERR(buf)) {
425 ret = PTR_ERR(buf);
426 goto err;
427 }
428 migf->buf_header[i] = buf;
429 if (mvdev->chunk_mode) {
430 migf->buf[i]->stop_copy_chunk_num = i + 1;
431 migf->buf_header[i]->stop_copy_chunk_num = i + 1;
432 INIT_WORK(&migf->save_data[i].work,
433 mlx5vf_mig_file_save_work);
434 migf->save_data[i].chunk_num = i + 1;
435 }
436 }
437
438 ret = mlx5vf_add_stop_copy_header(migf, track);
439 if (ret)
440 goto err;
441 return 0;
442
443 err:
444 for (i = 0; i < num_chunks; i++) {
445 if (migf->buf[i]) {
446 mlx5vf_put_data_buffer(migf->buf[i]);
447 migf->buf[i] = NULL;
448 }
449 if (migf->buf_header[i]) {
450 mlx5vf_put_data_buffer(migf->buf_header[i]);
451 migf->buf_header[i] = NULL;
452 }
453 }
454
455 return ret;
456 }
457
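/*
 * VFIO_MIG_GET_PRECOPY_INFO handler: report the initial/dirty bytes still
 * pending on the save FD, and once the currently queued data has been fully
 * read, kick off an incremental SAVE for the newly reported dirty state.
 */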
static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
460 {
461 struct mlx5_vf_migration_file *migf = filp->private_data;
462 struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
463 struct mlx5_vhca_data_buffer *buf;
464 struct vfio_precopy_info info = {};
465 loff_t *pos = &filp->f_pos;
466 unsigned long minsz;
467 size_t inc_length = 0;
468 bool end_of_data = false;
469 int ret;
470
471 if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
472 return -ENOTTY;
473
474 minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
475
476 if (copy_from_user(&info, (void __user *)arg, minsz))
477 return -EFAULT;
478
479 if (info.argsz < minsz)
480 return -EINVAL;
481
482 mutex_lock(&mvdev->state_mutex);
483 if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
484 mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
485 ret = -EINVAL;
486 goto err_state_unlock;
487 }
488
	/*
	 * We can't issue a SAVE command while the device is suspended, so in
	 * VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query for extra
	 * bytes that couldn't be read anyway.
	 */
494 if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns, it is guaranteed that there is no
		 * active SAVE command; as such, the code below is safe given
		 * the proper locks.
		 */
500 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
501 NULL, MLX5VF_QUERY_INC);
502 if (ret)
503 goto err_state_unlock;
504 }
505
506 mutex_lock(&migf->lock);
507 if (migf->state == MLX5_MIGF_STATE_ERROR) {
508 ret = -ENODEV;
509 goto err_migf_unlock;
510 }
511
512 if (migf->pre_copy_initial_bytes > *pos) {
513 info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
514 } else {
515 info.dirty_bytes = migf->max_pos - *pos;
516 if (!info.dirty_bytes)
517 end_of_data = true;
518 info.dirty_bytes += inc_length;
519 }
520
521 if (!end_of_data || !inc_length) {
522 mutex_unlock(&migf->lock);
523 goto done;
524 }
525
526 mutex_unlock(&migf->lock);
	/*
	 * We have finished transferring the current state, and the device
	 * still has dirty state; trigger a new SAVE so that the data is ready
	 * to be read next.
	 */
531 buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE),
532 DMA_FROM_DEVICE);
533 if (IS_ERR(buf)) {
534 ret = PTR_ERR(buf);
535 mlx5vf_mark_err(migf);
536 goto err_state_unlock;
537 }
538
539 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
540 if (ret) {
541 mlx5vf_mark_err(migf);
542 mlx5vf_put_data_buffer(buf);
543 goto err_state_unlock;
544 }
545
546 done:
547 mlx5vf_state_mutex_unlock(mvdev);
548 if (copy_to_user((void __user *)arg, &info, minsz))
549 return -EFAULT;
550 return 0;
551
552 err_migf_unlock:
553 mutex_unlock(&migf->lock);
554 err_state_unlock:
555 mlx5vf_state_mutex_unlock(mvdev);
556 return ret;
557 }
558
559 static const struct file_operations mlx5vf_save_fops = {
560 .owner = THIS_MODULE,
561 .read = mlx5vf_save_read,
562 .poll = mlx5vf_save_poll,
563 .unlocked_ioctl = mlx5vf_precopy_ioctl,
564 .compat_ioctl = compat_ptr_ioctl,
565 .release = mlx5vf_release_file,
566 };
567
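/*
 * Final step of the PRE_COPY_P2P -> STOP_COPY arc: query the remaining state
 * size and issue the closing SAVE into the first pre-allocated stop_copy
 * buffer, resizing it if needed.
 */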
static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
569 {
570 struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
571 struct mlx5_vhca_data_buffer *buf;
572 size_t length;
573 int ret;
574
575 if (migf->state == MLX5_MIGF_STATE_ERROR)
576 return -ENODEV;
577
578 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
579 MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
580 if (ret)
581 goto err;
582
583 buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
584 if (IS_ERR(buf)) {
585 ret = PTR_ERR(buf);
586 goto err;
587 }
588
589 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
590 if (ret)
591 goto err_save;
592
593 return 0;
594
595 err_save:
596 mlx5vf_put_data_buffer(buf);
597 err:
598 mlx5vf_mark_err(migf);
599 return ret;
600 }
601
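/*
 * Create the save migration file and kick off the first asynchronous SAVE.
 * With @track set this starts the PRE_COPY flow, otherwise a plain STOP_COPY.
 */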
602 static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
604 {
605 struct mlx5_vf_migration_file *migf;
606 struct mlx5_vhca_data_buffer *buf;
607 size_t length;
608 u64 full_size;
609 int ret;
610
611 migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
612 if (!migf)
613 return ERR_PTR(-ENOMEM);
614
615 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
616 O_RDONLY);
617 if (IS_ERR(migf->filp)) {
618 ret = PTR_ERR(migf->filp);
619 kfree(migf);
620 return ERR_PTR(ret);
621 }
622
623 migf->mvdev = mvdev;
624 stream_open(migf->filp->f_inode, migf->filp);
625 mutex_init(&migf->lock);
626 init_waitqueue_head(&migf->poll_wait);
627 init_completion(&migf->save_comp);
628 /*
629 * save_comp is being used as a binary semaphore built from
630 * a completion. A normal mutex cannot be used because the lock is
631 * passed between kernel threads and lockdep can't model this.
632 */
633 complete(&migf->save_comp);
634 mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
635 INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
636 INIT_LIST_HEAD(&migf->buf_list);
637 INIT_LIST_HEAD(&migf->avail_list);
638 spin_lock_init(&migf->list_lock);
639
640 ret = mlx5vf_cmd_alloc_pd(migf);
641 if (ret)
642 goto out;
643
644 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
645 if (ret)
646 goto out_pd;
647
648 ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
649 if (ret)
650 goto out_pd;
651
652 if (track) {
653 /* leave the allocated buffer ready for the stop-copy phase */
654 buf = mlx5vf_alloc_data_buffer(migf, migf->buf[0]->npages,
655 DMA_FROM_DEVICE);
656 if (IS_ERR(buf)) {
657 ret = PTR_ERR(buf);
658 goto out_pd;
659 }
660 } else {
661 buf = migf->buf[0];
662 migf->buf[0] = NULL;
663 }
664
665 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
666 if (ret)
667 goto out_save;
668 return migf;
669 out_save:
670 mlx5vf_free_data_buffer(buf);
671 out_pd:
672 mlx5fv_cmd_clean_migf_resources(migf);
673 out:
674 fput(migf->filp);
675 return ERR_PTR(ret);
676 }
677
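/*
 * Copy up to one page of user data into the migration buffer at *pos and
 * advance the caller's cursors accordingly.
 */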
678 static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
			      const char __user **buf, size_t *len,
			      loff_t *pos, ssize_t *done)
682 {
683 unsigned long offset;
684 size_t page_offset;
685 struct page *page;
686 size_t page_len;
687 u8 *to_buff;
688 int ret;
689
690 offset = *pos - vhca_buf->start_pos;
691 page_offset = offset % PAGE_SIZE;
692
693 page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
694 if (!page)
695 return -EINVAL;
696 page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
697 to_buff = kmap_local_page(page);
698 ret = copy_from_user(to_buff + page_offset, *buf, page_len);
699 kunmap_local(to_buff);
700 if (ret)
701 return -EFAULT;
702
703 *pos += page_len;
704 *done += page_len;
705 *buf += page_len;
706 *len -= page_len;
707 vhca_buf->length += page_len;
708 return 0;
709 }
710
711 static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
			 struct mlx5_vhca_data_buffer *vhca_buf,
			 size_t image_size, const char __user **buf,
			 size_t *len, loff_t *pos, ssize_t *done,
			 bool *has_work)
717 {
718 size_t copy_len, to_copy;
719 int ret;
720
721 to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
722 copy_len = to_copy;
723 while (to_copy) {
724 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
725 done);
726 if (ret)
727 return ret;
728 }
729
730 *len -= copy_len;
731 if (vhca_buf->length == image_size) {
732 migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
733 migf->max_pos += image_size;
734 *has_work = true;
735 }
736
737 return 0;
738 }
739
740 static int
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *vhca_buf,
			       const char __user **buf, size_t *len,
			       loff_t *pos, ssize_t *done)
745 {
746 size_t copy_len, to_copy;
747 size_t required_data;
748 u8 *to_buff;
749 int ret;
750
751 required_data = migf->record_size - vhca_buf->length;
752 to_copy = min_t(size_t, *len, required_data);
753 copy_len = to_copy;
754 while (to_copy) {
755 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
756 done);
757 if (ret)
758 return ret;
759 }
760
761 *len -= copy_len;
762 if (vhca_buf->length == migf->record_size) {
763 switch (migf->record_tag) {
764 case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
765 {
766 struct page *page;
767
768 page = mlx5vf_get_migration_page(vhca_buf, 0);
769 if (!page)
770 return -EINVAL;
771 to_buff = kmap_local_page(page);
772 migf->stop_copy_prep_size = min_t(u64,
773 le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
774 kunmap_local(to_buff);
775 break;
776 }
777 default:
778 /* Optional tag */
779 break;
780 }
781
782 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
783 migf->max_pos += migf->record_size;
784 vhca_buf->length = 0;
785 }
786
787 return 0;
788 }
789
790 static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
			  struct mlx5_vhca_data_buffer *vhca_buf,
			  const char __user **buf,
			  size_t *len, loff_t *pos,
			  ssize_t *done, bool *has_work)
796 {
797 struct page *page;
798 size_t copy_len;
799 u8 *to_buff;
800 int ret;
801
802 copy_len = min_t(size_t, *len,
803 sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
804 page = mlx5vf_get_migration_page(vhca_buf, 0);
805 if (!page)
806 return -EINVAL;
807 to_buff = kmap_local_page(page);
808 ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
809 if (ret) {
810 ret = -EFAULT;
811 goto end;
812 }
813
814 *buf += copy_len;
815 *pos += copy_len;
816 *done += copy_len;
817 *len -= copy_len;
818 vhca_buf->length += copy_len;
819 if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
820 u64 record_size;
821 u32 flags;
822
823 record_size = le64_to_cpup((__le64 *)to_buff);
824 if (record_size > MAX_LOAD_SIZE) {
825 ret = -ENOMEM;
826 goto end;
827 }
828
829 migf->record_size = record_size;
830 flags = le32_to_cpup((__le32 *)(to_buff +
831 offsetof(struct mlx5_vf_migration_header, flags)));
832 migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
833 offsetof(struct mlx5_vf_migration_header, tag)));
834 switch (migf->record_tag) {
835 case MLX5_MIGF_HEADER_TAG_FW_DATA:
836 migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
837 break;
838 case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
839 migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
840 break;
841 default:
842 if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
843 ret = -EOPNOTSUPP;
844 goto end;
845 }
846 /* We may read and skip this optional record data */
847 migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
848 }
849
850 migf->max_pos += vhca_buf->length;
851 vhca_buf->length = 0;
852 *has_work = true;
853 }
854 end:
855 kunmap_local(to_buff);
856 return ret;
857 }
858
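/*
 * write() handler of the resume FD. Feed the user-provided stream through
 * the load state machine: record header -> optional header data -> image
 * data -> LOAD command, repeating until the input is exhausted.
 */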
static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
861 {
862 struct mlx5_vf_migration_file *migf = filp->private_data;
863 struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
864 struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
865 loff_t requested_length;
866 bool has_work = false;
867 ssize_t done = 0;
868 int ret = 0;
869
870 if (pos)
871 return -ESPIPE;
872 pos = &filp->f_pos;
873
874 if (*pos < 0 ||
875 check_add_overflow((loff_t)len, *pos, &requested_length))
876 return -EINVAL;
877
878 mutex_lock(&migf->mvdev->state_mutex);
879 mutex_lock(&migf->lock);
880 if (migf->state == MLX5_MIGF_STATE_ERROR) {
881 ret = -ENODEV;
882 goto out_unlock;
883 }
884
885 while (len || has_work) {
886 has_work = false;
887 switch (migf->load_state) {
888 case MLX5_VF_LOAD_STATE_READ_HEADER:
889 ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
890 &buf, &len, pos,
891 &done, &has_work);
892 if (ret)
893 goto out_unlock;
894 break;
895 case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
896 {
897 u32 npages = DIV_ROUND_UP(migf->record_size, PAGE_SIZE);
898
899 if (vhca_buf_header->npages < npages) {
900 mlx5vf_free_data_buffer(vhca_buf_header);
901
902 migf->buf_header[0] = mlx5vf_alloc_data_buffer(
903 migf, npages, DMA_NONE);
904 if (IS_ERR(migf->buf_header[0])) {
905 ret = PTR_ERR(migf->buf_header[0]);
906 migf->buf_header[0] = NULL;
907 goto out_unlock;
908 }
909
910 vhca_buf_header = migf->buf_header[0];
911 }
912
913 vhca_buf_header->start_pos = migf->max_pos;
914 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
915 break;
916 }
917 case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
918 ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
919 &buf, &len, pos, &done);
920 if (ret)
921 goto out_unlock;
922 break;
923 case MLX5_VF_LOAD_STATE_PREP_IMAGE:
924 {
925 u64 size = max(migf->record_size,
926 migf->stop_copy_prep_size);
927 u32 npages = DIV_ROUND_UP(size, PAGE_SIZE);
928
929 if (vhca_buf->npages < npages) {
930 mlx5vf_free_data_buffer(vhca_buf);
931
932 migf->buf[0] = mlx5vf_alloc_data_buffer(
933 migf, npages, DMA_TO_DEVICE);
934 if (IS_ERR(migf->buf[0])) {
935 ret = PTR_ERR(migf->buf[0]);
936 migf->buf[0] = NULL;
937 goto out_unlock;
938 }
939
940 vhca_buf = migf->buf[0];
941 }
942
943 vhca_buf->start_pos = migf->max_pos;
944 migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
945 break;
946 }
947 case MLX5_VF_LOAD_STATE_READ_IMAGE:
948 ret = mlx5vf_resume_read_image(migf, vhca_buf,
949 migf->record_size,
950 &buf, &len, pos, &done, &has_work);
951 if (ret)
952 goto out_unlock;
953 break;
954 case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
955 ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
956 if (ret)
957 goto out_unlock;
958 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
959
960 /* prep header buf for next image */
961 vhca_buf_header->length = 0;
962 /* prep data buf for next image */
963 vhca_buf->length = 0;
964
965 break;
966 default:
967 break;
968 }
969 }
970
971 out_unlock:
972 if (ret)
973 migf->state = MLX5_MIGF_STATE_ERROR;
974 mutex_unlock(&migf->lock);
975 mlx5vf_state_mutex_unlock(migf->mvdev);
976 return ret ? ret : done;
977 }
978
979 static const struct file_operations mlx5vf_resume_fops = {
980 .owner = THIS_MODULE,
981 .write = mlx5vf_resume_write,
982 .release = mlx5vf_release_file,
983 };
984
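/*
 * Create the resume migration file along with an initial (empty) data buffer
 * and a one-page header buffer; loading starts by reading a record header.
 */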
985 static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
987 {
988 struct mlx5_vf_migration_file *migf;
989 struct mlx5_vhca_data_buffer *buf;
990 int ret;
991
992 migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
993 if (!migf)
994 return ERR_PTR(-ENOMEM);
995
996 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
997 O_WRONLY);
998 if (IS_ERR(migf->filp)) {
999 ret = PTR_ERR(migf->filp);
1000 kfree(migf);
1001 return ERR_PTR(ret);
1002 }
1003
1004 stream_open(migf->filp->f_inode, migf->filp);
1005 mutex_init(&migf->lock);
1006 INIT_LIST_HEAD(&migf->buf_list);
1007 INIT_LIST_HEAD(&migf->avail_list);
1008 spin_lock_init(&migf->list_lock);
1009 migf->mvdev = mvdev;
1010 ret = mlx5vf_cmd_alloc_pd(migf);
1011 if (ret)
1012 goto out;
1013
1014 buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
1015 if (IS_ERR(buf)) {
1016 ret = PTR_ERR(buf);
1017 goto out_pd;
1018 }
1019
1020 migf->buf[0] = buf;
1021 buf = mlx5vf_alloc_data_buffer(
1022 migf,
1023 DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
1024 PAGE_SIZE),
1025 DMA_NONE);
1026 if (IS_ERR(buf)) {
1027 ret = PTR_ERR(buf);
1028 goto out_buf;
1029 }
1030
1031 migf->buf_header[0] = buf;
1032 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
1033
1034 return migf;
1035 out_buf:
1036 mlx5vf_free_data_buffer(migf->buf[0]);
1037 out_pd:
1038 mlx5vf_cmd_dealloc_pd(migf);
1039 out:
1040 fput(migf->filp);
1041 return ERR_PTR(ret);
1042 }
1043
void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
			enum mlx5_vf_migf_state *last_save_state)
1046 {
1047 if (mvdev->resuming_migf) {
1048 mlx5vf_disable_fd(mvdev->resuming_migf);
1049 mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
1050 fput(mvdev->resuming_migf->filp);
1051 mvdev->resuming_migf = NULL;
1052 }
1053 if (mvdev->saving_migf) {
1054 mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
1055 cancel_work_sync(&mvdev->saving_migf->async_data.work);
1056 if (last_save_state)
1057 *last_save_state = mvdev->saving_migf->state;
1058 mlx5vf_disable_fd(mvdev->saving_migf);
1059 wake_up_interruptible(&mvdev->saving_migf->poll_wait);
1060 mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
1061 fput(mvdev->saving_migf->filp);
1062 mvdev->saving_migf = NULL;
1063 }
1064 }
1065
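/*
 * Execute a single arc of the VFIO migration state machine. Composite
 * transitions are split into these arcs by vfio_mig_get_next_state(), driven
 * from mlx5vf_pci_set_device_state().
 */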
1066 static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
1069 {
1070 u32 cur = mvdev->mig_state;
1071 int ret;
1072
1073 if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
1074 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1075 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1076 if (ret)
1077 return ERR_PTR(ret);
1078 return NULL;
1079 }
1080
1081 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
1082 ret = mlx5vf_cmd_resume_vhca(mvdev,
1083 MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
1084 if (ret)
1085 return ERR_PTR(ret);
1086 return NULL;
1087 }
1088
1089 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
1090 (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1091 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1092 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
1093 if (ret)
1094 return ERR_PTR(ret);
1095 return NULL;
1096 }
1097
1098 if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
1099 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
1100 ret = mlx5vf_cmd_resume_vhca(mvdev,
1101 MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
1102 if (ret)
1103 return ERR_PTR(ret);
1104 return NULL;
1105 }
1106
1107 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
1108 struct mlx5_vf_migration_file *migf;
1109
1110 migf = mlx5vf_pci_save_device_data(mvdev, false);
1111 if (IS_ERR(migf))
1112 return ERR_CAST(migf);
1113 get_file(migf->filp);
1114 mvdev->saving_migf = migf;
1115 return migf->filp;
1116 }
1117
1118 if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
1119 mlx5vf_disable_fds(mvdev, NULL);
1120 return NULL;
1121 }
1122
1123 if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
1124 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
1125 new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
1126 struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
1127 struct mlx5_vhca_data_buffer *buf;
1128 enum mlx5_vf_migf_state state;
1129 size_t size;
1130
1131 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL,
1132 MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
1133 if (ret)
1134 return ERR_PTR(ret);
1135 buf = mlx5vf_get_data_buffer(migf,
1136 DIV_ROUND_UP(size, PAGE_SIZE), DMA_FROM_DEVICE);
1137 if (IS_ERR(buf))
1138 return ERR_CAST(buf);
1139 /* pre_copy cleanup */
1140 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false);
1141 if (ret) {
1142 mlx5vf_put_data_buffer(buf);
1143 return ERR_PTR(ret);
1144 }
1145 mlx5vf_disable_fds(mvdev, &state);
1146 return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO);
1147 }
1148
1149 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
1150 struct mlx5_vf_migration_file *migf;
1151
1152 migf = mlx5vf_pci_resume_device_data(mvdev);
1153 if (IS_ERR(migf))
1154 return ERR_CAST(migf);
1155 get_file(migf->filp);
1156 mvdev->resuming_migf = migf;
1157 return migf->filp;
1158 }
1159
1160 if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
1161 mlx5vf_disable_fds(mvdev, NULL);
1162 return NULL;
1163 }
1164
1165 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
1166 (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
1167 new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1168 struct mlx5_vf_migration_file *migf;
1169
1170 migf = mlx5vf_pci_save_device_data(mvdev, true);
1171 if (IS_ERR(migf))
1172 return ERR_CAST(migf);
1173 get_file(migf->filp);
1174 mvdev->saving_migf = migf;
1175 return migf->filp;
1176 }
1177
1178 if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
1179 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1180 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1181 if (ret)
1182 return ERR_PTR(ret);
1183 ret = mlx5vf_pci_save_device_inc_data(mvdev);
1184 return ret ? ERR_PTR(ret) : NULL;
1185 }
1186
1187 /*
1188 * vfio_mig_get_next_state() does not use arcs other than the above
1189 */
1190 WARN_ON(true);
1191 return ERR_PTR(-EINVAL);
1192 }
1193
/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset', if one exists.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
1199 {
1200 again:
1201 spin_lock(&mvdev->reset_lock);
1202 if (mvdev->deferred_reset) {
1203 mvdev->deferred_reset = false;
1204 spin_unlock(&mvdev->reset_lock);
1205 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1206 mlx5vf_disable_fds(mvdev, NULL);
1207 goto again;
1208 }
1209 mutex_unlock(&mvdev->state_mutex);
1210 spin_unlock(&mvdev->reset_lock);
1211 }
1212
1213 static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
			    enum vfio_device_mig_state new_state)
1216 {
1217 struct mlx5vf_pci_core_device *mvdev = container_of(
1218 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1219 enum vfio_device_mig_state next_state;
1220 struct file *res = NULL;
1221 int ret;
1222
1223 mutex_lock(&mvdev->state_mutex);
1224 while (new_state != mvdev->mig_state) {
1225 ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
1226 new_state, &next_state);
1227 if (ret) {
1228 res = ERR_PTR(ret);
1229 break;
1230 }
1231 res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
1232 if (IS_ERR(res))
1233 break;
1234 mvdev->mig_state = next_state;
1235 if (WARN_ON(res && new_state != mvdev->mig_state)) {
1236 fput(res);
1237 res = ERR_PTR(-EINVAL);
1238 break;
1239 }
1240 }
1241 mlx5vf_state_mutex_unlock(mvdev);
1242 return res;
1243 }
1244
static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
				    unsigned long *stop_copy_length)
1247 {
1248 struct mlx5vf_pci_core_device *mvdev = container_of(
1249 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1250 size_t state_size;
1251 u64 total_size;
1252 int ret;
1253
1254 mutex_lock(&mvdev->state_mutex);
1255 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
1256 &total_size, 0);
1257 if (!ret)
1258 *stop_copy_length = total_size;
1259 mlx5vf_state_mutex_unlock(mvdev);
1260 return ret;
1261 }
1262
static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
1265 {
1266 struct mlx5vf_pci_core_device *mvdev = container_of(
1267 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1268
1269 mutex_lock(&mvdev->state_mutex);
1270 *curr_state = mvdev->mig_state;
1271 mlx5vf_state_mutex_unlock(mvdev);
1272 return 0;
1273 }
1274
static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
1276 {
1277 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1278
1279 if (!mvdev->migrate_cap)
1280 return;
1281
	/*
	 * As the higher VFIO layers are holding locks across reset, and are
	 * using those same locks together with the mm_lock, we need to prevent
	 * an ABBA deadlock involving the state_mutex and mm_lock.
	 * If the state_mutex has already been taken, defer the cleanup work to
	 * the unlock flow of the other running context.
	 */
1289 spin_lock(&mvdev->reset_lock);
1290 mvdev->deferred_reset = true;
1291 if (!mutex_trylock(&mvdev->state_mutex)) {
1292 spin_unlock(&mvdev->reset_lock);
1293 return;
1294 }
1295 spin_unlock(&mvdev->reset_lock);
1296 mlx5vf_state_mutex_unlock(mvdev);
1297 }
1298
static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
1300 {
1301 struct mlx5vf_pci_core_device *mvdev = container_of(
1302 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1303 struct vfio_pci_core_device *vdev = &mvdev->core_device;
1304 int ret;
1305
1306 ret = vfio_pci_core_enable(vdev);
1307 if (ret)
1308 return ret;
1309
1310 if (mvdev->migrate_cap)
1311 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1312 vfio_pci_core_finish_enable(vdev);
1313 return 0;
1314 }
1315
static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
1317 {
1318 struct mlx5vf_pci_core_device *mvdev = container_of(
1319 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1320
1321 mlx5vf_cmd_close_migratable(mvdev);
1322 vfio_pci_core_close_device(core_vdev);
1323 }
1324
1325 static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
1326 .migration_set_state = mlx5vf_pci_set_device_state,
1327 .migration_get_state = mlx5vf_pci_get_device_state,
1328 .migration_get_data_size = mlx5vf_pci_get_data_size,
1329 };
1330
1331 static const struct vfio_log_ops mlx5vf_pci_log_ops = {
1332 .log_start = mlx5vf_start_page_tracker,
1333 .log_stop = mlx5vf_stop_page_tracker,
1334 .log_read_and_clear = mlx5vf_tracker_read_and_clear,
1335 };
1336
static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
1338 {
1339 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1340 struct mlx5vf_pci_core_device, core_device.vdev);
1341 int ret;
1342
1343 ret = vfio_pci_core_init_dev(core_vdev);
1344 if (ret)
1345 return ret;
1346
1347 mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
1348 &mlx5vf_pci_log_ops);
1349
1350 return 0;
1351 }
1352
static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
1354 {
1355 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1356 struct mlx5vf_pci_core_device, core_device.vdev);
1357
1358 mlx5vf_cmd_remove_migratable(mvdev);
1359 vfio_pci_core_release_dev(core_vdev);
1360 }
1361
1362 static const struct vfio_device_ops mlx5vf_pci_ops = {
1363 .name = "mlx5-vfio-pci",
1364 .init = mlx5vf_pci_init_dev,
1365 .release = mlx5vf_pci_release_dev,
1366 .open_device = mlx5vf_pci_open_device,
1367 .close_device = mlx5vf_pci_close_device,
1368 .ioctl = vfio_pci_core_ioctl,
1369 .device_feature = vfio_pci_core_ioctl_feature,
1370 .read = vfio_pci_core_read,
1371 .write = vfio_pci_core_write,
1372 .mmap = vfio_pci_core_mmap,
1373 .request = vfio_pci_core_request,
1374 .match = vfio_pci_core_match,
1375 .bind_iommufd = vfio_iommufd_physical_bind,
1376 .unbind_iommufd = vfio_iommufd_physical_unbind,
1377 .attach_ioas = vfio_iommufd_physical_attach_ioas,
1378 .detach_ioas = vfio_iommufd_physical_detach_ioas,
1379 };
1380
static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
1383 {
1384 struct mlx5vf_pci_core_device *mvdev;
1385 int ret;
1386
1387 mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
1388 &pdev->dev, &mlx5vf_pci_ops);
1389 if (IS_ERR(mvdev))
1390 return PTR_ERR(mvdev);
1391
1392 dev_set_drvdata(&pdev->dev, &mvdev->core_device);
1393 ret = vfio_pci_core_register_device(&mvdev->core_device);
1394 if (ret)
1395 goto out_put_vdev;
1396 return 0;
1397
1398 out_put_vdev:
1399 vfio_put_device(&mvdev->core_device.vdev);
1400 return ret;
1401 }
1402
static void mlx5vf_pci_remove(struct pci_dev *pdev)
1404 {
1405 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1406
1407 vfio_pci_core_unregister_device(&mvdev->core_device);
1408 vfio_put_device(&mvdev->core_device.vdev);
1409 }
1410
1411 static const struct pci_device_id mlx5vf_pci_table[] = {
1412 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
1413 {}
1414 };
1415
1416 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
1417
1418 static const struct pci_error_handlers mlx5vf_err_handlers = {
1419 .reset_done = mlx5vf_pci_aer_reset_done,
1420 .error_detected = vfio_pci_core_aer_err_detected,
1421 };
1422
1423 static struct pci_driver mlx5vf_pci_driver = {
1424 .name = KBUILD_MODNAME,
1425 .id_table = mlx5vf_pci_table,
1426 .probe = mlx5vf_pci_probe,
1427 .remove = mlx5vf_pci_remove,
1428 .err_handler = &mlx5vf_err_handlers,
1429 .driver_managed_dma = true,
1430 };
1431
1432 module_pci_driver(mlx5vf_pci_driver);
1433
1434 MODULE_IMPORT_NS("IOMMUFD");
1435 MODULE_LICENSE("GPL");
1436 MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
1437 MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
1438 MODULE_DESCRIPTION(
1439 "MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
1440