// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct mlx5vf_pci_core_device,
			    core_device);
}

struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
			  unsigned long offset)
{
	unsigned long cur_offset = 0;
	struct scatterlist *sg;
	unsigned int i;

	/* All accesses are sequential */
	if (offset < buf->last_offset || !buf->last_offset_sg) {
		buf->last_offset = 0;
		buf->last_offset_sg = buf->table.sgt.sgl;
		buf->sg_last_entry = 0;
	}

	cur_offset = buf->last_offset;

	for_each_sg(buf->last_offset_sg, sg,
		    buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
		if (offset < sg->length + cur_offset) {
			buf->last_offset_sg = sg;
			buf->sg_last_entry += i;
			buf->last_offset = cur_offset;
			return nth_page(sg_page(sg),
					(offset - cur_offset) / PAGE_SIZE);
		}
		cur_offset += sg->length;
	}
	return NULL;
}

int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
			       unsigned int npages)
{
	unsigned int to_alloc = npages;
	struct page **page_list;
	unsigned long filled;
	unsigned int to_fill;
	int ret;

	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
	if (!page_list)
		return -ENOMEM;

	do {
		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
						page_list);
		if (!filled) {
			ret = -ENOMEM;
			goto err;
		}
		to_alloc -= filled;
		ret = sg_alloc_append_table_from_pages(
			&buf->table, page_list, filled, 0,
			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
			GFP_KERNEL_ACCOUNT);

		if (ret)
			goto err;
		buf->allocated_length += filled * PAGE_SIZE;
		/* clean input for another bulk allocation */
		memset(page_list, 0, filled * sizeof(*page_list));
		to_fill = min_t(unsigned int, to_alloc,
				PAGE_SIZE / sizeof(*page_list));
	} while (to_alloc > 0);

	kvfree(page_list);
	return 0;

err:
	kvfree(page_list);
	return ret;
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	mutex_lock(&migf->lock);
	migf->state = MLX5_MIGF_STATE_ERROR;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;

	mlx5vf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}

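/*
 * Return the data buffer that covers @pos on the stream, or NULL when the
 * list is empty (*end_of_data is set) or the stream got out of sync.
 * The FD is consumed sequentially, so only the first queued buffer may match;
 * any other position moves the migration file into the error state.
 */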
static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
			      bool *end_of_data)
{
	struct mlx5_vhca_data_buffer *buf;
	bool found = false;

	*end_of_data = false;
	spin_lock_irq(&migf->list_lock);
	if (list_empty(&migf->buf_list)) {
		*end_of_data = true;
		goto end;
	}

	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
			       buf_elm);
	if (pos >= buf->start_pos &&
	    pos < buf->start_pos + buf->length) {
		found = true;
		goto end;
	}

	/*
	 * Since this is a stream-based FD, the data is expected to always be
	 * in the first chunk.
	 */
	migf->state = MLX5_MIGF_STATE_ERROR;

end:
	spin_unlock_irq(&migf->list_lock);
	return found ? buf : NULL;
}

static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
			       char __user **buf, size_t *len, loff_t *pos)
{
	unsigned long offset;
	ssize_t done = 0;
	size_t copy_len;

	copy_len = min_t(size_t,
			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
	while (copy_len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		offset = *pos - vhca_buf->start_pos;
		page_offset = offset % PAGE_SIZE;
		offset -= page_offset;
		page = mlx5vf_get_migration_page(vhca_buf, offset);
		if (!page)
			return -EINVAL;
		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret)
			return -EFAULT;
		*pos += page_len;
		*len -= page_len;
		*buf += page_len;
		done += page_len;
		copy_len -= page_len;
	}

	if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
		spin_lock_irq(&vhca_buf->migf->list_lock);
		list_del_init(&vhca_buf->buf_elm);
		list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
		spin_unlock_irq(&vhca_buf->migf->list_lock);
	}

	return done;
}

static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
				loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
				!list_empty(&migf->buf_list) ||
				migf->state == MLX5_MIGF_STATE_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							 &end_of_data);
		if (first_loop_call) {
			first_loop_call = false;
			/* Temporary end of file as part of PRE_COPY */
			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
				done = -ENOMSG;
				goto out_unlock;
			}

			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
				if (filp->f_flags & O_NONBLOCK) {
					done = -EAGAIN;
					goto out_unlock;
				}
			}
		}

		if (end_of_data)
			goto out_unlock;

		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

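/*
 * The saving FD becomes readable once data buffers are queued or the save
 * flow completed; an error state is reported as readable with EPOLLRDHUP.
 */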
static __poll_t mlx5vf_save_poll(struct file *filp,
				 struct poll_table_struct *wait)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	__poll_t pollflags = 0;

	poll_wait(filp, &migf->poll_wait, wait);

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
	else if (!list_empty(&migf->buf_list) ||
		 migf->state == MLX5_MIGF_STATE_COMPLETE)
		pollflags = EPOLLIN | EPOLLRDNORM;
	mutex_unlock(&migf->lock);

	return pollflags;
}

/*
 * The FD is exposed and the user can use it after receiving an error.
 * Mark migf in error, and wake the user.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
	migf->state = MLX5_MIGF_STATE_ERROR;
	wake_up_interruptible(&migf->poll_wait);
}

static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
{
	size_t size = sizeof(struct mlx5_vf_migration_header) +
		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
	struct mlx5_vf_migration_tag_stop_copy_data data = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;
	int ret;

	header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
	if (IS_ERR(header_buf))
		return PTR_ERR(header_buf);

	header.record_size = cpu_to_le64(sizeof(data));
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page) {
		ret = -EINVAL;
		goto err;
	}
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	header_buf->length = sizeof(header);
	data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length);
	memcpy(to_buff + sizeof(header), &data, sizeof(data));
	header_buf->length += sizeof(data);
	kunmap_local(to_buff);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	migf->pre_copy_initial_bytes = size;
	return 0;
err:
	mlx5vf_put_data_buffer(header_buf);
	return ret;
}

static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf,
				 size_t state_size)
{
	struct mlx5_vhca_data_buffer *buf;
	size_t inc_state_size;
	int ret;

	/* let's be ready for a stop_copy size that might grow by 10 percent */
	if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
		inc_state_size = state_size;

	buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	migf->buf = buf;
	buf = mlx5vf_get_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto err;
	}

	migf->buf_header = buf;
	ret = mlx5vf_add_stop_copy_header(migf);
	if (ret)
		goto err_header;
	return 0;

err_header:
	mlx5vf_put_data_buffer(migf->buf_header);
	migf->buf_header = NULL;
err:
	mlx5vf_put_data_buffer(migf->buf);
	migf->buf = NULL;
	return ret;
}

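/*
 * VFIO_MIG_GET_PRECOPY_INFO: report how many initial and dirty bytes are
 * ready to be read from the pre-copy stream. When the queued data was fully
 * consumed and the device still has dirty state, an extra incremental SAVE
 * is triggered so the next read has something to return.
 */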
static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data = false;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command when the device is suspended, so as
	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query
	 * for extra bytes that can't be read.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns it's guaranteed that there is no
		 * active SAVE command.
		 * As such, the other code below is safe with the proper locks.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	if (migf->pre_copy_initial_bytes > *pos) {
		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
	} else {
		buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data);
		if (buf) {
			info.dirty_bytes = buf->start_pos + buf->length - *pos;
		} else {
			if (!end_of_data) {
				ret = -EINVAL;
				goto err_migf_unlock;
			}
			info.dirty_bytes = inc_length;
		}
	}

	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has a
	 * dirty state; save a new state so it is ready for the next read.
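	 * Any failure below marks the migration file in error before bailing
	 * out.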
	 */
	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	mlx5vf_state_mutex_unlock(mvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static const struct file_operations mlx5vf_save_fops = {
	.owner = THIS_MODULE,
	.read = mlx5vf_save_read,
	.poll = mlx5vf_save_poll,
	.unlocked_ioctl = mlx5vf_precopy_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	if (migf->state == MLX5_MIGF_STATE_ERROR)
		return -ENODEV;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
	if (ret)
		goto err;

	/* Checking whether we have a matching pre-allocated buffer that can fit */
	if (migf->buf && migf->buf->allocated_length >= length) {
		buf = migf->buf;
		migf->buf = NULL;
	} else {
		buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
	if (ret)
		goto err_save;

	return 0;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
	return ret;
}

static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	init_waitqueue_head(&migf->poll_wait);
	init_completion(&migf->save_comp);
	/*
	 * save_comp is being used as a binary semaphore built from
	 * a completion. A normal mutex cannot be used because the lock is
	 * passed between kernel threads and lockdep can't model this.
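	 * Start in the released state so the first SAVE command can take it.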
	 */
	complete(&migf->save_comp);
	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
	if (ret)
		goto out_pd;

	if (track) {
		ret = mlx5vf_prep_stop_copy(migf, length);
		if (ret)
			goto out_pd;
	}

	buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
	if (ret)
		goto out_save;
	return migf;
out_save:
	mlx5vf_free_data_buffer(buf);
out_pd:
	mlx5fv_cmd_clean_migf_resources(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
			      const char __user **buf, size_t *len,
			      loff_t *pos, ssize_t *done)
{
	unsigned long offset;
	size_t page_offset;
	struct page *page;
	size_t page_len;
	u8 *to_buff;
	int ret;

	offset = *pos - vhca_buf->start_pos;
	page_offset = offset % PAGE_SIZE;

	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
	if (!page)
		return -EINVAL;
	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
	kunmap_local(to_buff);
	if (ret)
		return -EFAULT;

	*pos += page_len;
	*done += page_len;
	*buf += page_len;
	*len -= page_len;
	vhca_buf->length += page_len;
	return 0;
}

static int
mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
				   loff_t requested_length,
				   const char __user **buf, size_t *len,
				   loff_t *pos, ssize_t *done)
{
	int ret;

	if (requested_length > MAX_LOAD_SIZE)
		return -ENOMEM;

	if (vhca_buf->allocated_length < requested_length) {
		ret = mlx5vf_add_migration_pages(
			vhca_buf,
			DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
				     PAGE_SIZE));
		if (ret)
			return ret;
	}

	while (*len) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
						    done);
		if (ret)
			return ret;
	}

	return 0;
}

static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
			 struct mlx5_vhca_data_buffer *vhca_buf,
			 size_t image_size, const char __user **buf,
			 size_t *len, loff_t *pos, ssize_t *done,
			 bool *has_work)
{
	size_t copy_len, to_copy;
	int ret;

	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == image_size) {
		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
		migf->max_pos += image_size;
		*has_work = true;
	}

	return 0;
}

static int
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *vhca_buf,
			       const char __user **buf, size_t *len,
			       loff_t *pos, ssize_t *done)
{
	size_t copy_len, to_copy;
	size_t required_data;
	u8 *to_buff;
	int ret;

	required_data = migf->record_size - vhca_buf->length;
	to_copy = min_t(size_t, *len, required_data);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == migf->record_size) {
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
		{
			struct page *page;

			page = mlx5vf_get_migration_page(vhca_buf, 0);
			if (!page)
				return -EINVAL;
			to_buff = kmap_local_page(page);
			migf->stop_copy_prep_size = min_t(u64,
				le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
			kunmap_local(to_buff);
			break;
		}
		default:
			/* Optional tag */
			break;
		}

		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
		migf->max_pos += migf->record_size;
		vhca_buf->length = 0;
	}

	return 0;
}

static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
			  struct mlx5_vhca_data_buffer *vhca_buf,
			  const char __user **buf,
			  size_t *len, loff_t *pos,
			  ssize_t *done, bool *has_work)
{
	struct page *page;
	size_t copy_len;
	u8 *to_buff;
	int ret;

	copy_len = min_t(size_t, *len,
		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
	page = mlx5vf_get_migration_page(vhca_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
	if (ret) {
		ret = -EFAULT;
		goto end;
	}

	*buf += copy_len;
	*pos += copy_len;
	*done += copy_len;
	*len -= copy_len;
	vhca_buf->length += copy_len;
	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
		u64 record_size;
		u32 flags;

		record_size = le64_to_cpup((__le64 *)to_buff);
		if (record_size > MAX_LOAD_SIZE) {
			ret = -ENOMEM;
			goto end;
		}

		migf->record_size = record_size;
		flags = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, flags)));
		migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, tag)));
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_FW_DATA:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
			break;
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
			break;
		default:
			if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
				ret = -EOPNOTSUPP;
				goto end;
			}
			/* We may read and skip this optional record data */
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
		}

		migf->max_pos += vhca_buf->length;
		vhca_buf->length = 0;
		*has_work = true;
	}
end:
	kunmap_local(to_buff);
	return ret;
}

static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

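	/*
	 * The incoming stream is handled by a small state machine: read a
	 * record header, then either its optional data or the device image,
	 * and finally load a complete image into the device before starting
	 * over with the next record.
	 */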
	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
							&buf, &len, pos,
							&done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
			if (vhca_buf_header->allocated_length < migf->record_size) {
				mlx5vf_free_data_buffer(vhca_buf_header);

				migf->buf_header = mlx5vf_alloc_data_buffer(migf,
						migf->record_size, DMA_NONE);
				if (IS_ERR(migf->buf_header)) {
					ret = PTR_ERR(migf->buf_header);
					migf->buf_header = NULL;
					goto out_unlock;
				}

				vhca_buf_header = migf->buf_header;
			}

			vhca_buf_header->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
			break;
		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
			ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
							&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			u64 size = max(migf->record_size,
				       migf->stop_copy_prep_size);

			if (vhca_buf->allocated_length < size) {
				mlx5vf_free_data_buffer(vhca_buf);

				migf->buf = mlx5vf_alloc_data_buffer(migf,
							size, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf)) {
					ret = PTR_ERR(migf->buf);
					migf->buf = NULL;
					goto out_unlock;
				}

				vhca_buf = migf->buf;
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
			ret = mlx5vf_resume_read_image_no_header(vhca_buf,
						requested_length,
						&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, vhca_buf,
						migf->record_size,
						&buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			vhca_buf_header->length = 0;
			/* prep data buf for next image */
			vhca_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}

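/* File operations for the RESUMING side: userspace writes the saved device state */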
static const struct file_operations mlx5vf_resume_fops = {
	.owner = THIS_MODULE,
	.write = mlx5vf_resume_write,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	migf->buf = buf;
	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
		buf = mlx5vf_alloc_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_buf;
		}

		migf->buf_header = buf;
		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
	} else {
		/* Initial state will be to read the image */
		migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
	}

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	return migf;
out_buf:
	mlx5vf_free_data_buffer(migf->buf);
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
{
	if (mvdev->resuming_migf) {
		mlx5vf_disable_fd(mvdev->resuming_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
		fput(mvdev->resuming_migf->filp);
		mvdev->resuming_migf = NULL;
	}
	if (mvdev->saving_migf) {
		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
		cancel_work_sync(&mvdev->saving_migf->async_data.work);
		mlx5vf_disable_fd(mvdev->saving_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
		fput(mvdev->saving_migf->filp);
		mvdev->saving_migf = NULL;
	}
}

static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

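	/* Leaving the P2P quiescent states: let the device initiate DMA again */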
	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, false);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
			ret = mlx5vf_cmd_load_vhca_state(mvdev,
							 mvdev->resuming_migf,
							 mvdev->resuming_migf->buf);
			if (ret)
				return ERR_PTR(ret);
		}
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, true);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		ret = mlx5vf_pci_save_device_inc_data(mvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}

/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset' if one exists.
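 * The reset flow can't wait for state_mutex itself, so it only records the
 * request and the eventual unlocker performs the cleanup here.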
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev);
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}

static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
			    enum vfio_device_mig_state new_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	while (new_state != mvdev->mig_state) {
		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
					      new_state, &next_state);
		if (ret) {
			res = ERR_PTR(ret);
			break;
		}
		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
		if (IS_ERR(res))
			break;
		mvdev->mig_state = next_state;
		if (WARN_ON(res && new_state != mvdev->mig_state)) {
			fput(res);
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	mlx5vf_state_mutex_unlock(mvdev);
	return res;
}

static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
				    unsigned long *stop_copy_length)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	size_t state_size;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
						    &state_size, 0);
	if (!ret)
		*stop_copy_length = state_size;
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	if (!mvdev->migrate_cap)
		return;

	/*
	 * As the higher VFIO layers are holding locks across reset and using
	 * those same locks with the mm_lock, we need to prevent an ABBA
	 * deadlock with the state_mutex and mm_lock.
	 * In case the state_mutex was taken already we defer the cleanup work
	 * to the unlock flow of the other running context.
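	 * mlx5vf_state_mutex_unlock() then observes 'deferred_reset' and
	 * completes the reset once state_mutex is finally dropped.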
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
	}
	spin_unlock(&mvdev->reset_lock);
	mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct vfio_pci_core_device *vdev = &mvdev->core_device;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (mvdev->migrate_cap)
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_close_migratable(mvdev);
	vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
	.migration_set_state = mlx5vf_pci_set_device_state,
	.migration_get_state = mlx5vf_pci_get_device_state,
	.migration_get_data_size = mlx5vf_pci_get_data_size,
};

static const struct vfio_log_ops mlx5vf_pci_log_ops = {
	.log_start = mlx5vf_start_page_tracker,
	.log_stop = mlx5vf_stop_page_tracker,
	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
};

static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);
	int ret;

	ret = vfio_pci_core_init_dev(core_vdev);
	if (ret)
		return ret;

	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
				  &mlx5vf_pci_log_ops);

	return 0;
}

static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_remove_migratable(mvdev);
	vfio_pci_core_release_dev(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name = "mlx5-vfio-pci",
	.init = mlx5vf_pci_init_dev,
	.release = mlx5vf_pci_release_dev,
	.open_device = mlx5vf_pci_open_device,
	.close_device = mlx5vf_pci_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
};

static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
{
	struct mlx5vf_pci_core_device *mvdev;
	int ret;

	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
				  &pdev->dev, &mlx5vf_pci_ops);
	if (IS_ERR(mvdev))
		return PTR_ERR(mvdev);

	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
	ret = vfio_pci_core_register_device(&mvdev->core_device);
	if (ret)
		goto out_put_vdev;
	return 0;

out_put_vdev:
	vfio_put_device(&mvdev->core_device.vdev);
	return ret;
}

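/* Unregister the core device and drop the reference taken at probe time */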
static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	vfio_put_device(&mvdev->core_device.vdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(mlx5vf_pci_driver);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");