1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved 4 */ 5 6 #include <linux/device.h> 7 #include <linux/eventfd.h> 8 #include <linux/file.h> 9 #include <linux/interrupt.h> 10 #include <linux/iommu.h> 11 #include <linux/module.h> 12 #include <linux/mutex.h> 13 #include <linux/notifier.h> 14 #include <linux/pci.h> 15 #include <linux/pm_runtime.h> 16 #include <linux/types.h> 17 #include <linux/uaccess.h> 18 #include <linux/vfio.h> 19 #include <linux/sched/mm.h> 20 #include <linux/anon_inodes.h> 21 22 #include "cmd.h" 23 24 /* Device specification max LOAD size */ 25 #define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1) 26 27 #define MAX_CHUNK_SIZE SZ_8M 28 29 static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) 30 { 31 struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); 32 33 return container_of(core_device, struct mlx5vf_pci_core_device, 34 core_device); 35 } 36 37 struct page * 38 mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, 39 unsigned long offset) 40 { 41 unsigned long cur_offset = 0; 42 struct scatterlist *sg; 43 unsigned int i; 44 45 /* All accesses are sequential */ 46 if (offset < buf->last_offset || !buf->last_offset_sg) { 47 buf->last_offset = 0; 48 buf->last_offset_sg = buf->table.sgt.sgl; 49 buf->sg_last_entry = 0; 50 } 51 52 cur_offset = buf->last_offset; 53 54 for_each_sg(buf->last_offset_sg, sg, 55 buf->table.sgt.orig_nents - buf->sg_last_entry, i) { 56 if (offset < sg->length + cur_offset) { 57 buf->last_offset_sg = sg; 58 buf->sg_last_entry += i; 59 buf->last_offset = cur_offset; 60 return nth_page(sg_page(sg), 61 (offset - cur_offset) / PAGE_SIZE); 62 } 63 cur_offset += sg->length; 64 } 65 return NULL; 66 } 67 68 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) 69 { 70 mutex_lock(&migf->lock); 71 migf->state = MLX5_MIGF_STATE_ERROR; 72 migf->filp->f_pos = 0; 73 mutex_unlock(&migf->lock); 74 } 75 76 static int mlx5vf_release_file(struct inode *inode, struct file *filp) 77 { 78 struct mlx5_vf_migration_file *migf = filp->private_data; 79 80 mlx5vf_disable_fd(migf); 81 mutex_destroy(&migf->lock); 82 kfree(migf); 83 return 0; 84 } 85 86 static struct mlx5_vhca_data_buffer * 87 mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos, 88 bool *end_of_data) 89 { 90 struct mlx5_vhca_data_buffer *buf; 91 bool found = false; 92 93 *end_of_data = false; 94 spin_lock_irq(&migf->list_lock); 95 if (list_empty(&migf->buf_list)) { 96 *end_of_data = true; 97 goto end; 98 } 99 100 buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer, 101 buf_elm); 102 if (pos >= buf->start_pos && 103 pos < buf->start_pos + buf->length) { 104 found = true; 105 goto end; 106 } 107 108 /* 109 * As we use a stream based FD we may expect having the data always 110 * on first chunk 111 */ 112 migf->state = MLX5_MIGF_STATE_ERROR; 113 114 end: 115 spin_unlock_irq(&migf->list_lock); 116 return found ? buf : NULL; 117 } 118 119 static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf) 120 { 121 struct mlx5_vf_migration_file *migf = vhca_buf->migf; 122 123 if (vhca_buf->stop_copy_chunk_num) { 124 bool is_header = vhca_buf->dma_dir == DMA_NONE; 125 u8 chunk_num = vhca_buf->stop_copy_chunk_num; 126 size_t next_required_umem_size = 0; 127 128 if (is_header) 129 migf->buf_header[chunk_num - 1] = vhca_buf; 130 else 131 migf->buf[chunk_num - 1] = vhca_buf; 132 133 spin_lock_irq(&migf->list_lock); 134 list_del_init(&vhca_buf->buf_elm); 135 if (!is_header) { 136 next_required_umem_size = 137 migf->next_required_umem_size; 138 migf->next_required_umem_size = 0; 139 migf->num_ready_chunks--; 140 } 141 spin_unlock_irq(&migf->list_lock); 142 if (next_required_umem_size) 143 mlx5vf_mig_file_set_save_work(migf, chunk_num, 144 next_required_umem_size); 145 return; 146 } 147 148 spin_lock_irq(&migf->list_lock); 149 list_del_init(&vhca_buf->buf_elm); 150 list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); 151 spin_unlock_irq(&migf->list_lock); 152 } 153 154 static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, 155 char __user **buf, size_t *len, loff_t *pos) 156 { 157 unsigned long offset; 158 ssize_t done = 0; 159 size_t copy_len; 160 161 copy_len = min_t(size_t, 162 vhca_buf->start_pos + vhca_buf->length - *pos, *len); 163 while (copy_len) { 164 size_t page_offset; 165 struct page *page; 166 size_t page_len; 167 u8 *from_buff; 168 int ret; 169 170 offset = *pos - vhca_buf->start_pos; 171 page_offset = offset % PAGE_SIZE; 172 offset -= page_offset; 173 page = mlx5vf_get_migration_page(vhca_buf, offset); 174 if (!page) 175 return -EINVAL; 176 page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset); 177 from_buff = kmap_local_page(page); 178 ret = copy_to_user(*buf, from_buff + page_offset, page_len); 179 kunmap_local(from_buff); 180 if (ret) 181 return -EFAULT; 182 *pos += page_len; 183 *len -= page_len; 184 *buf += page_len; 185 done += page_len; 186 copy_len -= page_len; 187 } 188 189 if (*pos >= vhca_buf->start_pos + vhca_buf->length) 190 mlx5vf_buf_read_done(vhca_buf); 191 192 return done; 193 } 194 195 static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, 196 loff_t *pos) 197 { 198 struct mlx5_vf_migration_file *migf = filp->private_data; 199 struct mlx5_vhca_data_buffer *vhca_buf; 200 bool first_loop_call = true; 201 bool end_of_data; 202 ssize_t done = 0; 203 204 if (pos) 205 return -ESPIPE; 206 pos = &filp->f_pos; 207 208 if (!(filp->f_flags & O_NONBLOCK)) { 209 if (wait_event_interruptible(migf->poll_wait, 210 !list_empty(&migf->buf_list) || 211 migf->state == MLX5_MIGF_STATE_ERROR || 212 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR || 213 migf->state == MLX5_MIGF_STATE_PRE_COPY || 214 migf->state == MLX5_MIGF_STATE_COMPLETE)) 215 return -ERESTARTSYS; 216 } 217 218 mutex_lock(&migf->lock); 219 if (migf->state == MLX5_MIGF_STATE_ERROR) { 220 done = -ENODEV; 221 goto out_unlock; 222 } 223 224 while (len) { 225 ssize_t count; 226 227 vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos, 228 &end_of_data); 229 if (first_loop_call) { 230 first_loop_call = false; 231 /* Temporary end of file as part of PRE_COPY */ 232 if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY || 233 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) { 234 done = -ENOMSG; 235 goto out_unlock; 236 } 237 238 if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) { 239 if (filp->f_flags & O_NONBLOCK) { 240 done = -EAGAIN; 241 goto out_unlock; 242 } 243 } 244 } 245 246 if (end_of_data) 247 goto out_unlock; 248 249 if (!vhca_buf) { 250 done = -EINVAL; 251 goto out_unlock; 252 } 253 254 count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos); 255 if (count < 0) { 256 done = count; 257 goto out_unlock; 258 } 259 done += count; 260 } 261 262 out_unlock: 263 mutex_unlock(&migf->lock); 264 return done; 265 } 266 267 static __poll_t mlx5vf_save_poll(struct file *filp, 268 struct poll_table_struct *wait) 269 { 270 struct mlx5_vf_migration_file *migf = filp->private_data; 271 __poll_t pollflags = 0; 272 273 poll_wait(filp, &migf->poll_wait, wait); 274 275 mutex_lock(&migf->lock); 276 if (migf->state == MLX5_MIGF_STATE_ERROR) 277 pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; 278 else if (!list_empty(&migf->buf_list) || 279 migf->state == MLX5_MIGF_STATE_COMPLETE) 280 pollflags = EPOLLIN | EPOLLRDNORM; 281 mutex_unlock(&migf->lock); 282 283 return pollflags; 284 } 285 286 /* 287 * FD is exposed and user can use it after receiving an error. 288 * Mark migf in error, and wake the user. 289 */ 290 static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) 291 { 292 migf->state = MLX5_MIGF_STATE_ERROR; 293 wake_up_interruptible(&migf->poll_wait); 294 } 295 296 void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf, 297 u8 chunk_num, size_t next_required_umem_size) 298 { 299 migf->save_data[chunk_num - 1].next_required_umem_size = 300 next_required_umem_size; 301 migf->save_data[chunk_num - 1].migf = migf; 302 get_file(migf->filp); 303 queue_work(migf->mvdev->cb_wq, 304 &migf->save_data[chunk_num - 1].work); 305 } 306 307 static struct mlx5_vhca_data_buffer * 308 mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf, 309 u8 index, size_t required_length) 310 { 311 struct mlx5_vhca_data_buffer *buf = migf->buf[index]; 312 u8 chunk_num; 313 314 WARN_ON(!buf); 315 chunk_num = buf->stop_copy_chunk_num; 316 buf->migf->buf[index] = NULL; 317 /* Checking whether the pre-allocated buffer can fit */ 318 if (buf->allocated_length >= required_length) 319 return buf; 320 321 mlx5vf_put_data_buffer(buf); 322 buf = mlx5vf_get_data_buffer(buf->migf, required_length, 323 DMA_FROM_DEVICE); 324 if (IS_ERR(buf)) 325 return buf; 326 327 buf->stop_copy_chunk_num = chunk_num; 328 return buf; 329 } 330 331 static void mlx5vf_mig_file_save_work(struct work_struct *_work) 332 { 333 struct mlx5vf_save_work_data *save_data = container_of(_work, 334 struct mlx5vf_save_work_data, work); 335 struct mlx5_vf_migration_file *migf = save_data->migf; 336 struct mlx5vf_pci_core_device *mvdev = migf->mvdev; 337 struct mlx5_vhca_data_buffer *buf; 338 339 mutex_lock(&mvdev->state_mutex); 340 if (migf->state == MLX5_MIGF_STATE_ERROR) 341 goto end; 342 343 buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 344 save_data->chunk_num - 1, 345 save_data->next_required_umem_size); 346 if (IS_ERR(buf)) 347 goto err; 348 349 if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false)) 350 goto err_save; 351 352 goto end; 353 354 err_save: 355 mlx5vf_put_data_buffer(buf); 356 err: 357 mlx5vf_mark_err(migf); 358 end: 359 mlx5vf_state_mutex_unlock(mvdev); 360 fput(migf->filp); 361 } 362 363 static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf, 364 bool track) 365 { 366 size_t size = sizeof(struct mlx5_vf_migration_header) + 367 sizeof(struct mlx5_vf_migration_tag_stop_copy_data); 368 struct mlx5_vf_migration_tag_stop_copy_data data = {}; 369 struct mlx5_vhca_data_buffer *header_buf = NULL; 370 struct mlx5_vf_migration_header header = {}; 371 unsigned long flags; 372 struct page *page; 373 u8 *to_buff; 374 int ret; 375 376 header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE); 377 if (IS_ERR(header_buf)) 378 return PTR_ERR(header_buf); 379 380 header.record_size = cpu_to_le64(sizeof(data)); 381 header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL); 382 header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE); 383 page = mlx5vf_get_migration_page(header_buf, 0); 384 if (!page) { 385 ret = -EINVAL; 386 goto err; 387 } 388 to_buff = kmap_local_page(page); 389 memcpy(to_buff, &header, sizeof(header)); 390 header_buf->length = sizeof(header); 391 data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length); 392 memcpy(to_buff + sizeof(header), &data, sizeof(data)); 393 header_buf->length += sizeof(data); 394 kunmap_local(to_buff); 395 header_buf->start_pos = header_buf->migf->max_pos; 396 migf->max_pos += header_buf->length; 397 spin_lock_irqsave(&migf->list_lock, flags); 398 list_add_tail(&header_buf->buf_elm, &migf->buf_list); 399 spin_unlock_irqrestore(&migf->list_lock, flags); 400 if (track) 401 migf->pre_copy_initial_bytes = size; 402 return 0; 403 err: 404 mlx5vf_put_data_buffer(header_buf); 405 return ret; 406 } 407 408 static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev, 409 struct mlx5_vf_migration_file *migf, 410 size_t state_size, u64 full_size, 411 bool track) 412 { 413 struct mlx5_vhca_data_buffer *buf; 414 size_t inc_state_size; 415 int num_chunks; 416 int ret; 417 int i; 418 419 if (mvdev->chunk_mode) { 420 size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size); 421 422 /* from firmware perspective at least 'state_size' buffer should be set */ 423 inc_state_size = max(state_size, chunk_size); 424 } else { 425 if (track) { 426 /* let's be ready for stop_copy size that might grow by 10 percents */ 427 if (check_add_overflow(state_size, state_size / 10, &inc_state_size)) 428 inc_state_size = state_size; 429 } else { 430 inc_state_size = state_size; 431 } 432 } 433 434 /* let's not overflow the device specification max SAVE size */ 435 inc_state_size = min_t(size_t, inc_state_size, 436 (BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE)); 437 438 num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1; 439 for (i = 0; i < num_chunks; i++) { 440 buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE); 441 if (IS_ERR(buf)) { 442 ret = PTR_ERR(buf); 443 goto err; 444 } 445 446 migf->buf[i] = buf; 447 buf = mlx5vf_get_data_buffer(migf, 448 sizeof(struct mlx5_vf_migration_header), DMA_NONE); 449 if (IS_ERR(buf)) { 450 ret = PTR_ERR(buf); 451 goto err; 452 } 453 migf->buf_header[i] = buf; 454 if (mvdev->chunk_mode) { 455 migf->buf[i]->stop_copy_chunk_num = i + 1; 456 migf->buf_header[i]->stop_copy_chunk_num = i + 1; 457 INIT_WORK(&migf->save_data[i].work, 458 mlx5vf_mig_file_save_work); 459 migf->save_data[i].chunk_num = i + 1; 460 } 461 } 462 463 ret = mlx5vf_add_stop_copy_header(migf, track); 464 if (ret) 465 goto err; 466 return 0; 467 468 err: 469 for (i = 0; i < num_chunks; i++) { 470 if (migf->buf[i]) { 471 mlx5vf_put_data_buffer(migf->buf[i]); 472 migf->buf[i] = NULL; 473 } 474 if (migf->buf_header[i]) { 475 mlx5vf_put_data_buffer(migf->buf_header[i]); 476 migf->buf_header[i] = NULL; 477 } 478 } 479 480 return ret; 481 } 482 483 static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, 484 unsigned long arg) 485 { 486 struct mlx5_vf_migration_file *migf = filp->private_data; 487 struct mlx5vf_pci_core_device *mvdev = migf->mvdev; 488 struct mlx5_vhca_data_buffer *buf; 489 struct vfio_precopy_info info = {}; 490 loff_t *pos = &filp->f_pos; 491 unsigned long minsz; 492 size_t inc_length = 0; 493 bool end_of_data = false; 494 int ret; 495 496 if (cmd != VFIO_MIG_GET_PRECOPY_INFO) 497 return -ENOTTY; 498 499 minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); 500 501 if (copy_from_user(&info, (void __user *)arg, minsz)) 502 return -EFAULT; 503 504 if (info.argsz < minsz) 505 return -EINVAL; 506 507 mutex_lock(&mvdev->state_mutex); 508 if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && 509 mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { 510 ret = -EINVAL; 511 goto err_state_unlock; 512 } 513 514 /* 515 * We can't issue a SAVE command when the device is suspended, so as 516 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra 517 * bytes that can't be read. 518 */ 519 if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) { 520 /* 521 * Once the query returns it's guaranteed that there is no 522 * active SAVE command. 523 * As so, the other code below is safe with the proper locks. 524 */ 525 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length, 526 NULL, MLX5VF_QUERY_INC); 527 if (ret) 528 goto err_state_unlock; 529 } 530 531 mutex_lock(&migf->lock); 532 if (migf->state == MLX5_MIGF_STATE_ERROR) { 533 ret = -ENODEV; 534 goto err_migf_unlock; 535 } 536 537 if (migf->pre_copy_initial_bytes > *pos) { 538 info.initial_bytes = migf->pre_copy_initial_bytes - *pos; 539 } else { 540 info.dirty_bytes = migf->max_pos - *pos; 541 if (!info.dirty_bytes) 542 end_of_data = true; 543 info.dirty_bytes += inc_length; 544 } 545 546 if (!end_of_data || !inc_length) { 547 mutex_unlock(&migf->lock); 548 goto done; 549 } 550 551 mutex_unlock(&migf->lock); 552 /* 553 * We finished transferring the current state and the device has a 554 * dirty state, save a new state to be ready for. 555 */ 556 buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE); 557 if (IS_ERR(buf)) { 558 ret = PTR_ERR(buf); 559 mlx5vf_mark_err(migf); 560 goto err_state_unlock; 561 } 562 563 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true); 564 if (ret) { 565 mlx5vf_mark_err(migf); 566 mlx5vf_put_data_buffer(buf); 567 goto err_state_unlock; 568 } 569 570 done: 571 mlx5vf_state_mutex_unlock(mvdev); 572 if (copy_to_user((void __user *)arg, &info, minsz)) 573 return -EFAULT; 574 return 0; 575 576 err_migf_unlock: 577 mutex_unlock(&migf->lock); 578 err_state_unlock: 579 mlx5vf_state_mutex_unlock(mvdev); 580 return ret; 581 } 582 583 static const struct file_operations mlx5vf_save_fops = { 584 .owner = THIS_MODULE, 585 .read = mlx5vf_save_read, 586 .poll = mlx5vf_save_poll, 587 .unlocked_ioctl = mlx5vf_precopy_ioctl, 588 .compat_ioctl = compat_ptr_ioctl, 589 .release = mlx5vf_release_file, 590 }; 591 592 static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) 593 { 594 struct mlx5_vf_migration_file *migf = mvdev->saving_migf; 595 struct mlx5_vhca_data_buffer *buf; 596 size_t length; 597 int ret; 598 599 if (migf->state == MLX5_MIGF_STATE_ERROR) 600 return -ENODEV; 601 602 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL, 603 MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL); 604 if (ret) 605 goto err; 606 607 buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length); 608 if (IS_ERR(buf)) { 609 ret = PTR_ERR(buf); 610 goto err; 611 } 612 613 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); 614 if (ret) 615 goto err_save; 616 617 return 0; 618 619 err_save: 620 mlx5vf_put_data_buffer(buf); 621 err: 622 mlx5vf_mark_err(migf); 623 return ret; 624 } 625 626 static struct mlx5_vf_migration_file * 627 mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) 628 { 629 struct mlx5_vf_migration_file *migf; 630 struct mlx5_vhca_data_buffer *buf; 631 size_t length; 632 u64 full_size; 633 int ret; 634 635 migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); 636 if (!migf) 637 return ERR_PTR(-ENOMEM); 638 639 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf, 640 O_RDONLY); 641 if (IS_ERR(migf->filp)) { 642 ret = PTR_ERR(migf->filp); 643 kfree(migf); 644 return ERR_PTR(ret); 645 } 646 647 migf->mvdev = mvdev; 648 stream_open(migf->filp->f_inode, migf->filp); 649 mutex_init(&migf->lock); 650 init_waitqueue_head(&migf->poll_wait); 651 init_completion(&migf->save_comp); 652 /* 653 * save_comp is being used as a binary semaphore built from 654 * a completion. A normal mutex cannot be used because the lock is 655 * passed between kernel threads and lockdep can't model this. 656 */ 657 complete(&migf->save_comp); 658 mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); 659 INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); 660 INIT_LIST_HEAD(&migf->buf_list); 661 INIT_LIST_HEAD(&migf->avail_list); 662 spin_lock_init(&migf->list_lock); 663 664 ret = mlx5vf_cmd_alloc_pd(migf); 665 if (ret) 666 goto out; 667 668 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0); 669 if (ret) 670 goto out_pd; 671 672 ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track); 673 if (ret) 674 goto out_pd; 675 676 if (track) { 677 /* leave the allocated buffer ready for the stop-copy phase */ 678 buf = mlx5vf_alloc_data_buffer(migf, 679 migf->buf[0]->allocated_length, DMA_FROM_DEVICE); 680 if (IS_ERR(buf)) { 681 ret = PTR_ERR(buf); 682 goto out_pd; 683 } 684 } else { 685 buf = migf->buf[0]; 686 migf->buf[0] = NULL; 687 } 688 689 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track); 690 if (ret) 691 goto out_save; 692 return migf; 693 out_save: 694 mlx5vf_free_data_buffer(buf); 695 out_pd: 696 mlx5fv_cmd_clean_migf_resources(migf); 697 out: 698 fput(migf->filp); 699 return ERR_PTR(ret); 700 } 701 702 static int 703 mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf, 704 const char __user **buf, size_t *len, 705 loff_t *pos, ssize_t *done) 706 { 707 unsigned long offset; 708 size_t page_offset; 709 struct page *page; 710 size_t page_len; 711 u8 *to_buff; 712 int ret; 713 714 offset = *pos - vhca_buf->start_pos; 715 page_offset = offset % PAGE_SIZE; 716 717 page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset); 718 if (!page) 719 return -EINVAL; 720 page_len = min_t(size_t, *len, PAGE_SIZE - page_offset); 721 to_buff = kmap_local_page(page); 722 ret = copy_from_user(to_buff + page_offset, *buf, page_len); 723 kunmap_local(to_buff); 724 if (ret) 725 return -EFAULT; 726 727 *pos += page_len; 728 *done += page_len; 729 *buf += page_len; 730 *len -= page_len; 731 vhca_buf->length += page_len; 732 return 0; 733 } 734 735 static ssize_t 736 mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf, 737 struct mlx5_vhca_data_buffer *vhca_buf, 738 size_t image_size, const char __user **buf, 739 size_t *len, loff_t *pos, ssize_t *done, 740 bool *has_work) 741 { 742 size_t copy_len, to_copy; 743 int ret; 744 745 to_copy = min_t(size_t, *len, image_size - vhca_buf->length); 746 copy_len = to_copy; 747 while (to_copy) { 748 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos, 749 done); 750 if (ret) 751 return ret; 752 } 753 754 *len -= copy_len; 755 if (vhca_buf->length == image_size) { 756 migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE; 757 migf->max_pos += image_size; 758 *has_work = true; 759 } 760 761 return 0; 762 } 763 764 static int 765 mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf, 766 struct mlx5_vhca_data_buffer *vhca_buf, 767 const char __user **buf, size_t *len, 768 loff_t *pos, ssize_t *done) 769 { 770 size_t copy_len, to_copy; 771 size_t required_data; 772 u8 *to_buff; 773 int ret; 774 775 required_data = migf->record_size - vhca_buf->length; 776 to_copy = min_t(size_t, *len, required_data); 777 copy_len = to_copy; 778 while (to_copy) { 779 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos, 780 done); 781 if (ret) 782 return ret; 783 } 784 785 *len -= copy_len; 786 if (vhca_buf->length == migf->record_size) { 787 switch (migf->record_tag) { 788 case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE: 789 { 790 struct page *page; 791 792 page = mlx5vf_get_migration_page(vhca_buf, 0); 793 if (!page) 794 return -EINVAL; 795 to_buff = kmap_local_page(page); 796 migf->stop_copy_prep_size = min_t(u64, 797 le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE); 798 kunmap_local(to_buff); 799 break; 800 } 801 default: 802 /* Optional tag */ 803 break; 804 } 805 806 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; 807 migf->max_pos += migf->record_size; 808 vhca_buf->length = 0; 809 } 810 811 return 0; 812 } 813 814 static int 815 mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf, 816 struct mlx5_vhca_data_buffer *vhca_buf, 817 const char __user **buf, 818 size_t *len, loff_t *pos, 819 ssize_t *done, bool *has_work) 820 { 821 struct page *page; 822 size_t copy_len; 823 u8 *to_buff; 824 int ret; 825 826 copy_len = min_t(size_t, *len, 827 sizeof(struct mlx5_vf_migration_header) - vhca_buf->length); 828 page = mlx5vf_get_migration_page(vhca_buf, 0); 829 if (!page) 830 return -EINVAL; 831 to_buff = kmap_local_page(page); 832 ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len); 833 if (ret) { 834 ret = -EFAULT; 835 goto end; 836 } 837 838 *buf += copy_len; 839 *pos += copy_len; 840 *done += copy_len; 841 *len -= copy_len; 842 vhca_buf->length += copy_len; 843 if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) { 844 u64 record_size; 845 u32 flags; 846 847 record_size = le64_to_cpup((__le64 *)to_buff); 848 if (record_size > MAX_LOAD_SIZE) { 849 ret = -ENOMEM; 850 goto end; 851 } 852 853 migf->record_size = record_size; 854 flags = le32_to_cpup((__le32 *)(to_buff + 855 offsetof(struct mlx5_vf_migration_header, flags))); 856 migf->record_tag = le32_to_cpup((__le32 *)(to_buff + 857 offsetof(struct mlx5_vf_migration_header, tag))); 858 switch (migf->record_tag) { 859 case MLX5_MIGF_HEADER_TAG_FW_DATA: 860 migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE; 861 break; 862 case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE: 863 migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA; 864 break; 865 default: 866 if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) { 867 ret = -EOPNOTSUPP; 868 goto end; 869 } 870 /* We may read and skip this optional record data */ 871 migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA; 872 } 873 874 migf->max_pos += vhca_buf->length; 875 vhca_buf->length = 0; 876 *has_work = true; 877 } 878 end: 879 kunmap_local(to_buff); 880 return ret; 881 } 882 883 static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, 884 size_t len, loff_t *pos) 885 { 886 struct mlx5_vf_migration_file *migf = filp->private_data; 887 struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0]; 888 struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0]; 889 loff_t requested_length; 890 bool has_work = false; 891 ssize_t done = 0; 892 int ret = 0; 893 894 if (pos) 895 return -ESPIPE; 896 pos = &filp->f_pos; 897 898 if (*pos < 0 || 899 check_add_overflow((loff_t)len, *pos, &requested_length)) 900 return -EINVAL; 901 902 mutex_lock(&migf->mvdev->state_mutex); 903 mutex_lock(&migf->lock); 904 if (migf->state == MLX5_MIGF_STATE_ERROR) { 905 ret = -ENODEV; 906 goto out_unlock; 907 } 908 909 while (len || has_work) { 910 has_work = false; 911 switch (migf->load_state) { 912 case MLX5_VF_LOAD_STATE_READ_HEADER: 913 ret = mlx5vf_resume_read_header(migf, vhca_buf_header, 914 &buf, &len, pos, 915 &done, &has_work); 916 if (ret) 917 goto out_unlock; 918 break; 919 case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA: 920 if (vhca_buf_header->allocated_length < migf->record_size) { 921 mlx5vf_free_data_buffer(vhca_buf_header); 922 923 migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf, 924 migf->record_size, DMA_NONE); 925 if (IS_ERR(migf->buf_header[0])) { 926 ret = PTR_ERR(migf->buf_header[0]); 927 migf->buf_header[0] = NULL; 928 goto out_unlock; 929 } 930 931 vhca_buf_header = migf->buf_header[0]; 932 } 933 934 vhca_buf_header->start_pos = migf->max_pos; 935 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA; 936 break; 937 case MLX5_VF_LOAD_STATE_READ_HEADER_DATA: 938 ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header, 939 &buf, &len, pos, &done); 940 if (ret) 941 goto out_unlock; 942 break; 943 case MLX5_VF_LOAD_STATE_PREP_IMAGE: 944 { 945 u64 size = max(migf->record_size, 946 migf->stop_copy_prep_size); 947 948 if (vhca_buf->allocated_length < size) { 949 mlx5vf_free_data_buffer(vhca_buf); 950 951 migf->buf[0] = mlx5vf_alloc_data_buffer(migf, 952 size, DMA_TO_DEVICE); 953 if (IS_ERR(migf->buf[0])) { 954 ret = PTR_ERR(migf->buf[0]); 955 migf->buf[0] = NULL; 956 goto out_unlock; 957 } 958 959 vhca_buf = migf->buf[0]; 960 } 961 962 vhca_buf->start_pos = migf->max_pos; 963 migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE; 964 break; 965 } 966 case MLX5_VF_LOAD_STATE_READ_IMAGE: 967 ret = mlx5vf_resume_read_image(migf, vhca_buf, 968 migf->record_size, 969 &buf, &len, pos, &done, &has_work); 970 if (ret) 971 goto out_unlock; 972 break; 973 case MLX5_VF_LOAD_STATE_LOAD_IMAGE: 974 ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf); 975 if (ret) 976 goto out_unlock; 977 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; 978 979 /* prep header buf for next image */ 980 vhca_buf_header->length = 0; 981 /* prep data buf for next image */ 982 vhca_buf->length = 0; 983 984 break; 985 default: 986 break; 987 } 988 } 989 990 out_unlock: 991 if (ret) 992 migf->state = MLX5_MIGF_STATE_ERROR; 993 mutex_unlock(&migf->lock); 994 mlx5vf_state_mutex_unlock(migf->mvdev); 995 return ret ? ret : done; 996 } 997 998 static const struct file_operations mlx5vf_resume_fops = { 999 .owner = THIS_MODULE, 1000 .write = mlx5vf_resume_write, 1001 .release = mlx5vf_release_file, 1002 }; 1003 1004 static struct mlx5_vf_migration_file * 1005 mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) 1006 { 1007 struct mlx5_vf_migration_file *migf; 1008 struct mlx5_vhca_data_buffer *buf; 1009 int ret; 1010 1011 migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); 1012 if (!migf) 1013 return ERR_PTR(-ENOMEM); 1014 1015 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf, 1016 O_WRONLY); 1017 if (IS_ERR(migf->filp)) { 1018 ret = PTR_ERR(migf->filp); 1019 kfree(migf); 1020 return ERR_PTR(ret); 1021 } 1022 1023 stream_open(migf->filp->f_inode, migf->filp); 1024 mutex_init(&migf->lock); 1025 INIT_LIST_HEAD(&migf->buf_list); 1026 INIT_LIST_HEAD(&migf->avail_list); 1027 spin_lock_init(&migf->list_lock); 1028 migf->mvdev = mvdev; 1029 ret = mlx5vf_cmd_alloc_pd(migf); 1030 if (ret) 1031 goto out; 1032 1033 buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE); 1034 if (IS_ERR(buf)) { 1035 ret = PTR_ERR(buf); 1036 goto out_pd; 1037 } 1038 1039 migf->buf[0] = buf; 1040 buf = mlx5vf_alloc_data_buffer(migf, 1041 sizeof(struct mlx5_vf_migration_header), DMA_NONE); 1042 if (IS_ERR(buf)) { 1043 ret = PTR_ERR(buf); 1044 goto out_buf; 1045 } 1046 1047 migf->buf_header[0] = buf; 1048 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; 1049 1050 return migf; 1051 out_buf: 1052 mlx5vf_free_data_buffer(migf->buf[0]); 1053 out_pd: 1054 mlx5vf_cmd_dealloc_pd(migf); 1055 out: 1056 fput(migf->filp); 1057 return ERR_PTR(ret); 1058 } 1059 1060 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev, 1061 enum mlx5_vf_migf_state *last_save_state) 1062 { 1063 if (mvdev->resuming_migf) { 1064 mlx5vf_disable_fd(mvdev->resuming_migf); 1065 mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf); 1066 fput(mvdev->resuming_migf->filp); 1067 mvdev->resuming_migf = NULL; 1068 } 1069 if (mvdev->saving_migf) { 1070 mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); 1071 cancel_work_sync(&mvdev->saving_migf->async_data.work); 1072 if (last_save_state) 1073 *last_save_state = mvdev->saving_migf->state; 1074 mlx5vf_disable_fd(mvdev->saving_migf); 1075 wake_up_interruptible(&mvdev->saving_migf->poll_wait); 1076 mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); 1077 fput(mvdev->saving_migf->filp); 1078 mvdev->saving_migf = NULL; 1079 } 1080 } 1081 1082 static struct file * 1083 mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, 1084 u32 new) 1085 { 1086 u32 cur = mvdev->mig_state; 1087 int ret; 1088 1089 if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) { 1090 ret = mlx5vf_cmd_suspend_vhca(mvdev, 1091 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER); 1092 if (ret) 1093 return ERR_PTR(ret); 1094 return NULL; 1095 } 1096 1097 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) { 1098 ret = mlx5vf_cmd_resume_vhca(mvdev, 1099 MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER); 1100 if (ret) 1101 return ERR_PTR(ret); 1102 return NULL; 1103 } 1104 1105 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) || 1106 (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { 1107 ret = mlx5vf_cmd_suspend_vhca(mvdev, 1108 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR); 1109 if (ret) 1110 return ERR_PTR(ret); 1111 return NULL; 1112 } 1113 1114 if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) || 1115 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) { 1116 ret = mlx5vf_cmd_resume_vhca(mvdev, 1117 MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR); 1118 if (ret) 1119 return ERR_PTR(ret); 1120 return NULL; 1121 } 1122 1123 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { 1124 struct mlx5_vf_migration_file *migf; 1125 1126 migf = mlx5vf_pci_save_device_data(mvdev, false); 1127 if (IS_ERR(migf)) 1128 return ERR_CAST(migf); 1129 get_file(migf->filp); 1130 mvdev->saving_migf = migf; 1131 return migf->filp; 1132 } 1133 1134 if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) { 1135 mlx5vf_disable_fds(mvdev, NULL); 1136 return NULL; 1137 } 1138 1139 if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || 1140 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && 1141 new == VFIO_DEVICE_STATE_RUNNING_P2P)) { 1142 struct mlx5_vf_migration_file *migf = mvdev->saving_migf; 1143 struct mlx5_vhca_data_buffer *buf; 1144 enum mlx5_vf_migf_state state; 1145 size_t size; 1146 1147 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL, 1148 MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP); 1149 if (ret) 1150 return ERR_PTR(ret); 1151 buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE); 1152 if (IS_ERR(buf)) 1153 return ERR_CAST(buf); 1154 /* pre_copy cleanup */ 1155 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false); 1156 if (ret) { 1157 mlx5vf_put_data_buffer(buf); 1158 return ERR_PTR(ret); 1159 } 1160 mlx5vf_disable_fds(mvdev, &state); 1161 return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO); 1162 } 1163 1164 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { 1165 struct mlx5_vf_migration_file *migf; 1166 1167 migf = mlx5vf_pci_resume_device_data(mvdev); 1168 if (IS_ERR(migf)) 1169 return ERR_CAST(migf); 1170 get_file(migf->filp); 1171 mvdev->resuming_migf = migf; 1172 return migf->filp; 1173 } 1174 1175 if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { 1176 mlx5vf_disable_fds(mvdev, NULL); 1177 return NULL; 1178 } 1179 1180 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) || 1181 (cur == VFIO_DEVICE_STATE_RUNNING_P2P && 1182 new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { 1183 struct mlx5_vf_migration_file *migf; 1184 1185 migf = mlx5vf_pci_save_device_data(mvdev, true); 1186 if (IS_ERR(migf)) 1187 return ERR_CAST(migf); 1188 get_file(migf->filp); 1189 mvdev->saving_migf = migf; 1190 return migf->filp; 1191 } 1192 1193 if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) { 1194 ret = mlx5vf_cmd_suspend_vhca(mvdev, 1195 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER); 1196 if (ret) 1197 return ERR_PTR(ret); 1198 ret = mlx5vf_pci_save_device_inc_data(mvdev); 1199 return ret ? ERR_PTR(ret) : NULL; 1200 } 1201 1202 /* 1203 * vfio_mig_get_next_state() does not use arcs other than the above 1204 */ 1205 WARN_ON(true); 1206 return ERR_PTR(-EINVAL); 1207 } 1208 1209 /* 1210 * This function is called in all state_mutex unlock cases to 1211 * handle a 'deferred_reset' if exists. 1212 */ 1213 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev) 1214 { 1215 again: 1216 spin_lock(&mvdev->reset_lock); 1217 if (mvdev->deferred_reset) { 1218 mvdev->deferred_reset = false; 1219 spin_unlock(&mvdev->reset_lock); 1220 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 1221 mlx5vf_disable_fds(mvdev, NULL); 1222 goto again; 1223 } 1224 mutex_unlock(&mvdev->state_mutex); 1225 spin_unlock(&mvdev->reset_lock); 1226 } 1227 1228 static struct file * 1229 mlx5vf_pci_set_device_state(struct vfio_device *vdev, 1230 enum vfio_device_mig_state new_state) 1231 { 1232 struct mlx5vf_pci_core_device *mvdev = container_of( 1233 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1234 enum vfio_device_mig_state next_state; 1235 struct file *res = NULL; 1236 int ret; 1237 1238 mutex_lock(&mvdev->state_mutex); 1239 while (new_state != mvdev->mig_state) { 1240 ret = vfio_mig_get_next_state(vdev, mvdev->mig_state, 1241 new_state, &next_state); 1242 if (ret) { 1243 res = ERR_PTR(ret); 1244 break; 1245 } 1246 res = mlx5vf_pci_step_device_state_locked(mvdev, next_state); 1247 if (IS_ERR(res)) 1248 break; 1249 mvdev->mig_state = next_state; 1250 if (WARN_ON(res && new_state != mvdev->mig_state)) { 1251 fput(res); 1252 res = ERR_PTR(-EINVAL); 1253 break; 1254 } 1255 } 1256 mlx5vf_state_mutex_unlock(mvdev); 1257 return res; 1258 } 1259 1260 static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, 1261 unsigned long *stop_copy_length) 1262 { 1263 struct mlx5vf_pci_core_device *mvdev = container_of( 1264 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1265 size_t state_size; 1266 u64 total_size; 1267 int ret; 1268 1269 mutex_lock(&mvdev->state_mutex); 1270 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size, 1271 &total_size, 0); 1272 if (!ret) 1273 *stop_copy_length = total_size; 1274 mlx5vf_state_mutex_unlock(mvdev); 1275 return ret; 1276 } 1277 1278 static int mlx5vf_pci_get_device_state(struct vfio_device *vdev, 1279 enum vfio_device_mig_state *curr_state) 1280 { 1281 struct mlx5vf_pci_core_device *mvdev = container_of( 1282 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1283 1284 mutex_lock(&mvdev->state_mutex); 1285 *curr_state = mvdev->mig_state; 1286 mlx5vf_state_mutex_unlock(mvdev); 1287 return 0; 1288 } 1289 1290 static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev) 1291 { 1292 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev); 1293 1294 if (!mvdev->migrate_cap) 1295 return; 1296 1297 /* 1298 * As the higher VFIO layers are holding locks across reset and using 1299 * those same locks with the mm_lock we need to prevent ABBA deadlock 1300 * with the state_mutex and mm_lock. 1301 * In case the state_mutex was taken already we defer the cleanup work 1302 * to the unlock flow of the other running context. 1303 */ 1304 spin_lock(&mvdev->reset_lock); 1305 mvdev->deferred_reset = true; 1306 if (!mutex_trylock(&mvdev->state_mutex)) { 1307 spin_unlock(&mvdev->reset_lock); 1308 return; 1309 } 1310 spin_unlock(&mvdev->reset_lock); 1311 mlx5vf_state_mutex_unlock(mvdev); 1312 } 1313 1314 static int mlx5vf_pci_open_device(struct vfio_device *core_vdev) 1315 { 1316 struct mlx5vf_pci_core_device *mvdev = container_of( 1317 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1318 struct vfio_pci_core_device *vdev = &mvdev->core_device; 1319 int ret; 1320 1321 ret = vfio_pci_core_enable(vdev); 1322 if (ret) 1323 return ret; 1324 1325 if (mvdev->migrate_cap) 1326 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 1327 vfio_pci_core_finish_enable(vdev); 1328 return 0; 1329 } 1330 1331 static void mlx5vf_pci_close_device(struct vfio_device *core_vdev) 1332 { 1333 struct mlx5vf_pci_core_device *mvdev = container_of( 1334 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1335 1336 mlx5vf_cmd_close_migratable(mvdev); 1337 vfio_pci_core_close_device(core_vdev); 1338 } 1339 1340 static const struct vfio_migration_ops mlx5vf_pci_mig_ops = { 1341 .migration_set_state = mlx5vf_pci_set_device_state, 1342 .migration_get_state = mlx5vf_pci_get_device_state, 1343 .migration_get_data_size = mlx5vf_pci_get_data_size, 1344 }; 1345 1346 static const struct vfio_log_ops mlx5vf_pci_log_ops = { 1347 .log_start = mlx5vf_start_page_tracker, 1348 .log_stop = mlx5vf_stop_page_tracker, 1349 .log_read_and_clear = mlx5vf_tracker_read_and_clear, 1350 }; 1351 1352 static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev) 1353 { 1354 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev, 1355 struct mlx5vf_pci_core_device, core_device.vdev); 1356 int ret; 1357 1358 ret = vfio_pci_core_init_dev(core_vdev); 1359 if (ret) 1360 return ret; 1361 1362 mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops, 1363 &mlx5vf_pci_log_ops); 1364 1365 return 0; 1366 } 1367 1368 static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev) 1369 { 1370 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev, 1371 struct mlx5vf_pci_core_device, core_device.vdev); 1372 1373 mlx5vf_cmd_remove_migratable(mvdev); 1374 vfio_pci_core_release_dev(core_vdev); 1375 } 1376 1377 static const struct vfio_device_ops mlx5vf_pci_ops = { 1378 .name = "mlx5-vfio-pci", 1379 .init = mlx5vf_pci_init_dev, 1380 .release = mlx5vf_pci_release_dev, 1381 .open_device = mlx5vf_pci_open_device, 1382 .close_device = mlx5vf_pci_close_device, 1383 .ioctl = vfio_pci_core_ioctl, 1384 .device_feature = vfio_pci_core_ioctl_feature, 1385 .read = vfio_pci_core_read, 1386 .write = vfio_pci_core_write, 1387 .mmap = vfio_pci_core_mmap, 1388 .request = vfio_pci_core_request, 1389 .match = vfio_pci_core_match, 1390 .bind_iommufd = vfio_iommufd_physical_bind, 1391 .unbind_iommufd = vfio_iommufd_physical_unbind, 1392 .attach_ioas = vfio_iommufd_physical_attach_ioas, 1393 .detach_ioas = vfio_iommufd_physical_detach_ioas, 1394 }; 1395 1396 static int mlx5vf_pci_probe(struct pci_dev *pdev, 1397 const struct pci_device_id *id) 1398 { 1399 struct mlx5vf_pci_core_device *mvdev; 1400 int ret; 1401 1402 mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev, 1403 &pdev->dev, &mlx5vf_pci_ops); 1404 if (IS_ERR(mvdev)) 1405 return PTR_ERR(mvdev); 1406 1407 dev_set_drvdata(&pdev->dev, &mvdev->core_device); 1408 ret = vfio_pci_core_register_device(&mvdev->core_device); 1409 if (ret) 1410 goto out_put_vdev; 1411 return 0; 1412 1413 out_put_vdev: 1414 vfio_put_device(&mvdev->core_device.vdev); 1415 return ret; 1416 } 1417 1418 static void mlx5vf_pci_remove(struct pci_dev *pdev) 1419 { 1420 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev); 1421 1422 vfio_pci_core_unregister_device(&mvdev->core_device); 1423 vfio_put_device(&mvdev->core_device.vdev); 1424 } 1425 1426 static const struct pci_device_id mlx5vf_pci_table[] = { 1427 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */ 1428 {} 1429 }; 1430 1431 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table); 1432 1433 static const struct pci_error_handlers mlx5vf_err_handlers = { 1434 .reset_done = mlx5vf_pci_aer_reset_done, 1435 .error_detected = vfio_pci_core_aer_err_detected, 1436 }; 1437 1438 static struct pci_driver mlx5vf_pci_driver = { 1439 .name = KBUILD_MODNAME, 1440 .id_table = mlx5vf_pci_table, 1441 .probe = mlx5vf_pci_probe, 1442 .remove = mlx5vf_pci_remove, 1443 .err_handler = &mlx5vf_err_handlers, 1444 .driver_managed_dma = true, 1445 }; 1446 1447 module_pci_driver(mlx5vf_pci_driver); 1448 1449 MODULE_IMPORT_NS(IOMMUFD); 1450 MODULE_LICENSE("GPL"); 1451 MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>"); 1452 MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>"); 1453 MODULE_DESCRIPTION( 1454 "MLX5 VFIO PCI - User Level meta-driver for MLX5 device family"); 1455