// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

#define MAX_CHUNK_SIZE SZ_8M

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct mlx5vf_pci_core_device,
			    core_device);
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	mutex_lock(&migf->lock);
	migf->state = MLX5_MIGF_STATE_ERROR;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;

	mlx5vf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}

static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
			      bool *end_of_data)
{
	struct mlx5_vhca_data_buffer *buf;
	bool found = false;

	*end_of_data = false;
	spin_lock_irq(&migf->list_lock);
	if (list_empty(&migf->buf_list)) {
		*end_of_data = true;
		goto end;
	}

	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
			       buf_elm);
	if (pos >= buf->start_pos &&
	    pos < buf->start_pos + buf->length) {
		found = true;
		goto end;
	}

	/*
	 * Since this is a stream-based FD, the requested data is expected to
	 * always be found in the first chunk.
	 */
	migf->state = MLX5_MIGF_STATE_ERROR;

end:
	spin_unlock_irq(&migf->list_lock);
	return found ? buf : NULL;
}
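/*
 * Called once the user has consumed an entire data buffer. In chunk mode
 * (stop_copy_chunk_num != 0) the buffer is parked back in its per-chunk slot
 * and, if the device already reported the size of the next chunk, the next
 * SAVE is scheduled; otherwise the buffer is recycled onto the avail_list.
 */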
static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
{
	struct mlx5_vf_migration_file *migf = vhca_buf->migf;

	if (vhca_buf->stop_copy_chunk_num) {
		bool is_header = vhca_buf->dma_dir == DMA_NONE;
		u8 chunk_num = vhca_buf->stop_copy_chunk_num;
		size_t next_required_umem_size = 0;

		if (is_header)
			migf->buf_header[chunk_num - 1] = vhca_buf;
		else
			migf->buf[chunk_num - 1] = vhca_buf;

		spin_lock_irq(&migf->list_lock);
		list_del_init(&vhca_buf->buf_elm);
		if (!is_header) {
			next_required_umem_size =
				migf->next_required_umem_size;
			migf->next_required_umem_size = 0;
			migf->num_ready_chunks--;
		}
		spin_unlock_irq(&migf->list_lock);
		if (next_required_umem_size)
			mlx5vf_mig_file_set_save_work(migf, chunk_num,
						      next_required_umem_size);
		return;
	}

	spin_lock_irq(&migf->list_lock);
	list_del_init(&vhca_buf->buf_elm);
	list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
	spin_unlock_irq(&migf->list_lock);
}

static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
			       char __user **buf, size_t *len, loff_t *pos)
{
	unsigned long offset;
	ssize_t done = 0;
	size_t copy_len;

	copy_len = min_t(size_t,
			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
	while (copy_len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		offset = *pos - vhca_buf->start_pos;
		page_offset = offset % PAGE_SIZE;
		offset -= page_offset;
		page = mlx5vf_get_migration_page(vhca_buf, offset);
		if (!page)
			return -EINVAL;
		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret)
			return -EFAULT;
		*pos += page_len;
		*len -= page_len;
		*buf += page_len;
		done += page_len;
		copy_len -= page_len;
	}

	if (*pos >= vhca_buf->start_pos + vhca_buf->length)
		mlx5vf_buf_read_done(vhca_buf);

	return done;
}

static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
				loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
				!list_empty(&migf->buf_list) ||
				migf->state == MLX5_MIGF_STATE_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							 &end_of_data);
		if (first_loop_call) {
			first_loop_call = false;
			/* Temporary end of file as part of PRE_COPY */
			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
				done = -ENOMSG;
				goto out_unlock;
			}

			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
				if (filp->f_flags & O_NONBLOCK) {
					done = -EAGAIN;
					goto out_unlock;
				}
			}
		}

		if (end_of_data)
			goto out_unlock;

		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static __poll_t mlx5vf_save_poll(struct file *filp,
				 struct poll_table_struct *wait)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	__poll_t pollflags = 0;

	poll_wait(filp, &migf->poll_wait, wait);

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
	else if (!list_empty(&migf->buf_list) ||
		 migf->state == MLX5_MIGF_STATE_COMPLETE)
		pollflags = EPOLLIN | EPOLLRDNORM;
	mutex_unlock(&migf->lock);

	return pollflags;
}

/*
 * FD is exposed and user can use it after receiving an error.
 * Mark migf in error, and wake the user.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
	migf->state = MLX5_MIGF_STATE_ERROR;
	wake_up_interruptible(&migf->poll_wait);
}

void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
				   u8 chunk_num, size_t next_required_umem_size)
{
	migf->save_data[chunk_num - 1].next_required_umem_size =
		next_required_umem_size;
	migf->save_data[chunk_num - 1].migf = migf;
	get_file(migf->filp);
	queue_work(migf->mvdev->cb_wq,
		   &migf->save_data[chunk_num - 1].work);
}

static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
				  u8 index, size_t required_length)
{
	u32 npages = DIV_ROUND_UP(required_length, PAGE_SIZE);
	struct mlx5_vhca_data_buffer *buf = migf->buf[index];
	u8 chunk_num;

	WARN_ON(!buf);
	chunk_num = buf->stop_copy_chunk_num;
	buf->migf->buf[index] = NULL;
	/* Checking whether the pre-allocated buffer can fit */
	if (buf->npages >= npages)
		return buf;

	mlx5vf_put_data_buffer(buf);
	buf = mlx5vf_get_data_buffer(buf->migf, npages, DMA_FROM_DEVICE);
	if (IS_ERR(buf))
		return buf;

	buf->stop_copy_chunk_num = chunk_num;
	return buf;
}

static void mlx5vf_mig_file_save_work(struct work_struct *_work)
{
	struct mlx5vf_save_work_data *save_data = container_of(_work,
		struct mlx5vf_save_work_data, work);
	struct mlx5_vf_migration_file *migf = save_data->migf;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;

	mutex_lock(&mvdev->state_mutex);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		goto end;

	buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
						save_data->chunk_num - 1,
						save_data->next_required_umem_size);
	if (IS_ERR(buf))
		goto err;

	if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
		goto err_save;

	goto end;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	fput(migf->filp);
}

static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
				       bool track)
{
	size_t size = sizeof(struct mlx5_vf_migration_header) +
		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
	struct mlx5_vf_migration_tag_stop_copy_data data = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;
	int ret;

	header_buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(size, PAGE_SIZE),
					    DMA_NONE);
	if (IS_ERR(header_buf))
		return PTR_ERR(header_buf);

	header.record_size = cpu_to_le64(sizeof(data));
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page) {
		ret = -EINVAL;
		goto err;
	}
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	header_buf->length = sizeof(header);
	data.stop_copy_size = cpu_to_le64(migf->buf[0]->npages * PAGE_SIZE);
	memcpy(to_buff + sizeof(header), &data, sizeof(data));
	header_buf->length += sizeof(data);
	kunmap_local(to_buff);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	if (track)
		migf->pre_copy_initial_bytes = size;
	return 0;
err:
	mlx5vf_put_data_buffer(header_buf);
	return ret;
}
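/*
 * Pre-allocate the data and header buffers used during the SAVE flow:
 * one pair per chunk when the device works in chunk mode (MAX_NUM_CHUNKS),
 * otherwise a single pair, and queue the optional STOP_COPY_SIZE record so
 * userspace can learn the expected stop-copy size ahead of time.
 */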
static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
				 struct mlx5_vf_migration_file *migf,
				 size_t state_size, u64 full_size,
				 bool track)
{
	struct mlx5_vhca_data_buffer *buf;
	size_t inc_state_size;
	int num_chunks;
	int ret;
	int i;

	if (mvdev->chunk_mode) {
		size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);

		/* from the firmware's perspective, at least a 'state_size' buffer should be set */
		inc_state_size = max(state_size, chunk_size);
	} else {
		if (track) {
			/* be ready for a stop_copy size that might grow by up to 10 percent */
			if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
				inc_state_size = state_size;
		} else {
			inc_state_size = state_size;
		}
	}

	/* do not overflow the device specification max SAVE size */
	inc_state_size = min_t(size_t, inc_state_size,
		(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));

	num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
	for (i = 0; i < num_chunks; i++) {
		buf = mlx5vf_get_data_buffer(
			migf, DIV_ROUND_UP(inc_state_size, PAGE_SIZE),
			DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}

		migf->buf[i] = buf;
		buf = mlx5vf_get_data_buffer(
			migf,
			DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
				     PAGE_SIZE),
			DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}
		migf->buf_header[i] = buf;
		if (mvdev->chunk_mode) {
			migf->buf[i]->stop_copy_chunk_num = i + 1;
			migf->buf_header[i]->stop_copy_chunk_num = i + 1;
			INIT_WORK(&migf->save_data[i].work,
				  mlx5vf_mig_file_save_work);
			migf->save_data[i].chunk_num = i + 1;
		}
	}

	ret = mlx5vf_add_stop_copy_header(migf, track);
	if (ret)
		goto err;
	return 0;

err:
	for (i = 0; i < num_chunks; i++) {
		if (migf->buf[i]) {
			mlx5vf_put_data_buffer(migf->buf[i]);
			migf->buf[i] = NULL;
		}
		if (migf->buf_header[i]) {
			mlx5vf_put_data_buffer(migf->buf_header[i]);
			migf->buf_header[i] = NULL;
		}
	}

	return ret;
}
static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data = false;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command when the device is suspended, so as
	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query
	 * for extra bytes that can't be read.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns it's guaranteed that there is no
		 * active SAVE command.
		 * As such, the other code below is safe with the proper locks.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    NULL, MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	if (migf->pre_copy_initial_bytes > *pos) {
		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
	} else {
		info.dirty_bytes = migf->max_pos - *pos;
		if (!info.dirty_bytes)
			end_of_data = true;
		info.dirty_bytes += inc_length;
	}

	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device still has
	 * dirty state; save a new state so it is ready to be read.
	 */
	buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE),
				     DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	mlx5vf_state_mutex_unlock(mvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}
static const struct file_operations mlx5vf_save_fops = {
	.owner = THIS_MODULE,
	.read = mlx5vf_save_read,
	.poll = mlx5vf_save_poll,
	.unlocked_ioctl = mlx5vf_precopy_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.release = mlx5vf_release_file,
};

static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	if (migf->state == MLX5_MIGF_STATE_ERROR)
		return -ENODEV;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
	if (ret)
		goto err;

	buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto err;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
	if (ret)
		goto err_save;

	return 0;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
	return ret;
}
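/*
 * Create the saving side migration file: allocate the FD, the PD and the
 * per-chunk buffers, then issue the initial SAVE command. With 'track' set
 * this starts a PRE_COPY flow, so the pre-allocated stop-copy buffer is left
 * untouched and a dedicated buffer is used for the pre-copy data.
 */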
static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	u64 full_size;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		kfree(migf);
		return ERR_PTR(ret);
	}

	migf->mvdev = mvdev;
	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	init_waitqueue_head(&migf->poll_wait);
	init_completion(&migf->save_comp);
	/*
	 * save_comp is being used as a binary semaphore built from
	 * a completion. A normal mutex cannot be used because the lock is
	 * passed between kernel threads and lockdep can't model this.
	 */
	complete(&migf->save_comp);
	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);

	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
	if (ret)
		goto out_pd;

	ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
	if (ret)
		goto out_pd;

	if (track) {
		/* leave the allocated buffer ready for the stop-copy phase */
		buf = mlx5vf_alloc_data_buffer(migf, migf->buf[0]->npages,
					       DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_pd;
		}
	} else {
		buf = migf->buf[0];
		migf->buf[0] = NULL;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
	if (ret)
		goto out_save;
	return migf;
out_save:
	mlx5vf_free_data_buffer(buf);
out_pd:
	mlx5fv_cmd_clean_migf_resources(migf);
out:
	fput(migf->filp);
	return ERR_PTR(ret);
}

static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
			      const char __user **buf, size_t *len,
			      loff_t *pos, ssize_t *done)
{
	unsigned long offset;
	size_t page_offset;
	struct page *page;
	size_t page_len;
	u8 *to_buff;
	int ret;

	offset = *pos - vhca_buf->start_pos;
	page_offset = offset % PAGE_SIZE;

	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
	if (!page)
		return -EINVAL;
	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
	kunmap_local(to_buff);
	if (ret)
		return -EFAULT;

	*pos += page_len;
	*done += page_len;
	*buf += page_len;
	*len -= page_len;
	vhca_buf->length += page_len;
	return 0;
}
static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
			 struct mlx5_vhca_data_buffer *vhca_buf,
			 size_t image_size, const char __user **buf,
			 size_t *len, loff_t *pos, ssize_t *done,
			 bool *has_work)
{
	size_t copy_len, to_copy;
	int ret;

	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == image_size) {
		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
		migf->max_pos += image_size;
		*has_work = true;
	}

	return 0;
}

static int
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *vhca_buf,
			       const char __user **buf, size_t *len,
			       loff_t *pos, ssize_t *done)
{
	size_t copy_len, to_copy;
	size_t required_data;
	u8 *to_buff;
	int ret;

	required_data = migf->record_size - vhca_buf->length;
	to_copy = min_t(size_t, *len, required_data);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == migf->record_size) {
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
		{
			struct page *page;

			page = mlx5vf_get_migration_page(vhca_buf, 0);
			if (!page)
				return -EINVAL;
			to_buff = kmap_local_page(page);
			migf->stop_copy_prep_size = min_t(u64,
				le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
			kunmap_local(to_buff);
			break;
		}
		default:
			/* Optional tag */
			break;
		}

		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
		migf->max_pos += migf->record_size;
		vhca_buf->length = 0;
	}

	return 0;
}

static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
			  struct mlx5_vhca_data_buffer *vhca_buf,
			  const char __user **buf,
			  size_t *len, loff_t *pos,
			  ssize_t *done, bool *has_work)
{
	struct page *page;
	size_t copy_len;
	u8 *to_buff;
	int ret;

	copy_len = min_t(size_t, *len,
		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
	page = mlx5vf_get_migration_page(vhca_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
	if (ret) {
		ret = -EFAULT;
		goto end;
	}

	*buf += copy_len;
	*pos += copy_len;
	*done += copy_len;
	*len -= copy_len;
	vhca_buf->length += copy_len;
	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
		u64 record_size;
		u32 flags;

		record_size = le64_to_cpup((__le64 *)to_buff);
		if (record_size > MAX_LOAD_SIZE) {
			ret = -ENOMEM;
			goto end;
		}

		migf->record_size = record_size;
		flags = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, flags)));
		migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, tag)));
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_FW_DATA:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
			break;
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
			break;
		default:
			if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
				ret = -EOPNOTSUPP;
				goto end;
			}
			/* We may read and skip this optional record data */
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
		}

		migf->max_pos += vhca_buf->length;
		vhca_buf->length = 0;
		*has_work = true;
	}
end:
	kunmap_local(to_buff);
	return ret;
}
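/*
 * The resume FD is written as a sequence of records, each made of a
 * mlx5_vf_migration_header followed by its data. The load_state field drives
 * a small state machine: READ_HEADER parses the record header,
 * PREP_HEADER_DATA/PREP_IMAGE grow the staging buffer if needed,
 * READ_HEADER_DATA/READ_IMAGE copy the record payload in, and LOAD_IMAGE
 * hands a complete device image to the firmware.
 */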
static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
							&buf, &len, pos,
							&done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
		{
			u32 npages = DIV_ROUND_UP(migf->record_size, PAGE_SIZE);

			if (vhca_buf_header->npages < npages) {
				mlx5vf_free_data_buffer(vhca_buf_header);

				migf->buf_header[0] = mlx5vf_alloc_data_buffer(
					migf, npages, DMA_NONE);
				if (IS_ERR(migf->buf_header[0])) {
					ret = PTR_ERR(migf->buf_header[0]);
					migf->buf_header[0] = NULL;
					goto out_unlock;
				}

				vhca_buf_header = migf->buf_header[0];
			}

			vhca_buf_header->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
			ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
							     &buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			u64 size = max(migf->record_size,
				       migf->stop_copy_prep_size);
			u32 npages = DIV_ROUND_UP(size, PAGE_SIZE);

			if (vhca_buf->npages < npages) {
				mlx5vf_free_data_buffer(vhca_buf);

				migf->buf[0] = mlx5vf_alloc_data_buffer(
					migf, npages, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf[0])) {
					ret = PTR_ERR(migf->buf[0]);
					migf->buf[0] = NULL;
					goto out_unlock;
				}

				vhca_buf = migf->buf[0];
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, vhca_buf,
						       migf->record_size,
						       &buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			vhca_buf_header->length = 0;
			/* prep data buf for next image */
			vhca_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}
static const struct file_operations mlx5vf_resume_fops = {
	.owner = THIS_MODULE,
	.write = mlx5vf_resume_write,
	.release = mlx5vf_release_file,
};

static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		kfree(migf);
		return ERR_PTR(ret);
	}

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out;

	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	migf->buf[0] = buf;
	buf = mlx5vf_alloc_data_buffer(
		migf,
		DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
			     PAGE_SIZE),
		DMA_NONE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_buf;
	}

	migf->buf_header[0] = buf;
	migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

	return migf;
out_buf:
	mlx5vf_free_data_buffer(migf->buf[0]);
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out:
	fput(migf->filp);
	return ERR_PTR(ret);
}

void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
			enum mlx5_vf_migf_state *last_save_state)
{
	if (mvdev->resuming_migf) {
		mlx5vf_disable_fd(mvdev->resuming_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
		fput(mvdev->resuming_migf->filp);
		mvdev->resuming_migf = NULL;
	}
	if (mvdev->saving_migf) {
		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
		cancel_work_sync(&mvdev->saving_migf->async_data.work);
		if (last_save_state)
			*last_save_state = mvdev->saving_migf->state;
		mlx5vf_disable_fd(mvdev->saving_migf);
		wake_up_interruptible(&mvdev->saving_migf->poll_wait);
		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
		fput(mvdev->saving_migf->filp);
		mvdev->saving_migf = NULL;
	}
}
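/*
 * Execute a single arc of the VFIO device migration state machine while
 * holding state_mutex. Arcs that start a data transfer return the newly
 * created migration FD; all other arcs return NULL on success.
 */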
static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, false);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
		mlx5vf_disable_fds(mvdev, NULL);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
		struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
		struct mlx5_vhca_data_buffer *buf;
		enum mlx5_vf_migf_state state;
		size_t size;

		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL,
					MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
		if (ret)
			return ERR_PTR(ret);
		buf = mlx5vf_get_data_buffer(migf,
				DIV_ROUND_UP(size, PAGE_SIZE), DMA_FROM_DEVICE);
		if (IS_ERR(buf))
			return ERR_CAST(buf);
		/* pre_copy cleanup */
		ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false);
		if (ret) {
			mlx5vf_put_data_buffer(buf);
			return ERR_PTR(ret);
		}
		mlx5vf_disable_fds(mvdev, &state);
		return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO);
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		mlx5vf_disable_fds(mvdev, NULL);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, true);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		ret = mlx5vf_pci_save_device_inc_data(mvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}
/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset' if one exists.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev, NULL);
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}

static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
			    enum vfio_device_mig_state new_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	while (new_state != mvdev->mig_state) {
		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
					      new_state, &next_state);
		if (ret) {
			res = ERR_PTR(ret);
			break;
		}
		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
		if (IS_ERR(res))
			break;
		mvdev->mig_state = next_state;
		if (WARN_ON(res && new_state != mvdev->mig_state)) {
			fput(res);
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	mlx5vf_state_mutex_unlock(mvdev);
	return res;
}

static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
				    unsigned long *stop_copy_length)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	size_t state_size;
	u64 total_size;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
						    &total_size, 0);
	if (!ret)
		*stop_copy_length = total_size;
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}
static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	if (!mvdev->migrate_cap)
		return;

	/*
	 * As the higher VFIO layers are holding locks across reset and using
	 * those same locks with the mm_lock we need to prevent ABBA deadlock
	 * with the state_mutex and mm_lock.
	 * In case the state_mutex was taken already we defer the cleanup work
	 * to the unlock flow of the other running context.
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
	}
	spin_unlock(&mvdev->reset_lock);
	mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct vfio_pci_core_device *vdev = &mvdev->core_device;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (mvdev->migrate_cap)
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_close_migratable(mvdev);
	vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
	.migration_set_state = mlx5vf_pci_set_device_state,
	.migration_get_state = mlx5vf_pci_get_device_state,
	.migration_get_data_size = mlx5vf_pci_get_data_size,
};

static const struct vfio_log_ops mlx5vf_pci_log_ops = {
	.log_start = mlx5vf_start_page_tracker,
	.log_stop = mlx5vf_stop_page_tracker,
	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
};

static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);
	int ret;

	ret = vfio_pci_core_init_dev(core_vdev);
	if (ret)
		return ret;

	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
				  &mlx5vf_pci_log_ops);

	return 0;
}

static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_remove_migratable(mvdev);
	vfio_pci_core_release_dev(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name = "mlx5-vfio-pci",
	.init = mlx5vf_pci_init_dev,
	.release = mlx5vf_pci_release_dev,
	.open_device = mlx5vf_pci_open_device,
	.close_device = mlx5vf_pci_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.match_token_uuid = vfio_pci_core_match_token_uuid,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};

static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
{
	struct mlx5vf_pci_core_device *mvdev;
	int ret;

	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
				  &pdev->dev, &mlx5vf_pci_ops);
	if (IS_ERR(mvdev))
		return PTR_ERR(mvdev);

	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
	ret = vfio_pci_core_register_device(&mvdev->core_device);
	if (ret)
		goto out_put_vdev;
	return 0;

out_put_vdev:
	vfio_put_device(&mvdev->core_device.vdev);
	return ret;
}
static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	vfio_put_device(&mvdev->core_device.vdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(mlx5vf_pci_driver);

MODULE_IMPORT_NS("IOMMUFD");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");