// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

#define MAX_CHUNK_SIZE SZ_8M

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct mlx5vf_pci_core_device,
			    core_device);
}

struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
			  unsigned long offset)
{
	unsigned long cur_offset = 0;
	struct scatterlist *sg;
	unsigned int i;

	/* All accesses are sequential */
	if (offset < buf->last_offset || !buf->last_offset_sg) {
		buf->last_offset = 0;
		buf->last_offset_sg = buf->table.sgt.sgl;
		buf->sg_last_entry = 0;
	}

	cur_offset = buf->last_offset;

	for_each_sg(buf->last_offset_sg, sg,
		    buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
		if (offset < sg->length + cur_offset) {
			buf->last_offset_sg = sg;
			buf->sg_last_entry += i;
			buf->last_offset = cur_offset;
			return nth_page(sg_page(sg),
					(offset - cur_offset) / PAGE_SIZE);
		}
		cur_offset += sg->length;
	}
	return NULL;
}

int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
			       unsigned int npages)
{
	unsigned int to_alloc = npages;
	struct page **page_list;
	unsigned long filled;
	unsigned int to_fill;
	int ret;

	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
	if (!page_list)
		return -ENOMEM;

	do {
		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
						page_list);
		if (!filled) {
			ret = -ENOMEM;
			goto err;
		}
		to_alloc -= filled;
		ret = sg_alloc_append_table_from_pages(
			&buf->table, page_list, filled, 0,
			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
			GFP_KERNEL_ACCOUNT);

		if (ret)
			goto err;
		buf->allocated_length += filled * PAGE_SIZE;
		/* clean input for another bulk allocation */
		memset(page_list, 0, filled * sizeof(*page_list));
		to_fill = min_t(unsigned int, to_alloc,
				PAGE_SIZE / sizeof(*page_list));
	} while (to_alloc > 0);

	kvfree(page_list);
	return 0;

err:
	kvfree(page_list);
	return ret;
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	mutex_lock(&migf->lock);
	migf->state = MLX5_MIGF_STATE_ERROR;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;

	mlx5vf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}
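/*
 * Look up the buffer holding stream position 'pos'. Data is produced into
 * migf->buf_list in order, so a valid read position is expected to fall
 * within the first queued buffer; any other position marks the file in
 * error. *end_of_data is set when no buffer is queued at all.
 */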
static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
			      bool *end_of_data)
{
	struct mlx5_vhca_data_buffer *buf;
	bool found = false;

	*end_of_data = false;
	spin_lock_irq(&migf->list_lock);
	if (list_empty(&migf->buf_list)) {
		*end_of_data = true;
		goto end;
	}

	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
			       buf_elm);
	if (pos >= buf->start_pos &&
	    pos < buf->start_pos + buf->length) {
		found = true;
		goto end;
	}

	/*
	 * Since this is a stream-based FD, the data is always expected to be
	 * found on the first chunk.
	 */
	migf->state = MLX5_MIGF_STATE_ERROR;

end:
	spin_unlock_irq(&migf->list_lock);
	return found ? buf : NULL;
}

static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
{
	struct mlx5_vf_migration_file *migf = vhca_buf->migf;

	if (vhca_buf->stop_copy_chunk_num) {
		bool is_header = vhca_buf->dma_dir == DMA_NONE;
		u8 chunk_num = vhca_buf->stop_copy_chunk_num;
		size_t next_required_umem_size = 0;

		if (is_header)
			migf->buf_header[chunk_num - 1] = vhca_buf;
		else
			migf->buf[chunk_num - 1] = vhca_buf;

		spin_lock_irq(&migf->list_lock);
		list_del_init(&vhca_buf->buf_elm);
		if (!is_header) {
			next_required_umem_size =
				migf->next_required_umem_size;
			migf->next_required_umem_size = 0;
			migf->num_ready_chunks--;
		}
		spin_unlock_irq(&migf->list_lock);
		if (next_required_umem_size)
			mlx5vf_mig_file_set_save_work(migf, chunk_num,
						      next_required_umem_size);
		return;
	}

	spin_lock_irq(&migf->list_lock);
	list_del_init(&vhca_buf->buf_elm);
	list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
	spin_unlock_irq(&migf->list_lock);
}

static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
			       char __user **buf, size_t *len, loff_t *pos)
{
	unsigned long offset;
	ssize_t done = 0;
	size_t copy_len;

	copy_len = min_t(size_t,
			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
	while (copy_len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		offset = *pos - vhca_buf->start_pos;
		page_offset = offset % PAGE_SIZE;
		offset -= page_offset;
		page = mlx5vf_get_migration_page(vhca_buf, offset);
		if (!page)
			return -EINVAL;
		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret)
			return -EFAULT;
		*pos += page_len;
		*len -= page_len;
		*buf += page_len;
		done += page_len;
		copy_len -= page_len;
	}

	if (*pos >= vhca_buf->start_pos + vhca_buf->length)
		mlx5vf_buf_read_done(vhca_buf);

	return done;
}

static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
				loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
			!list_empty(&migf->buf_list) ||
			migf->state == MLX5_MIGF_STATE_ERROR ||
			migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
			migf->state == MLX5_MIGF_STATE_PRE_COPY ||
			migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

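	/* Serialize concurrent readers of the migration file */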
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							 &end_of_data);
		if (first_loop_call) {
			first_loop_call = false;
			/* Temporary end of file as part of PRE_COPY */
			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
				done = -ENOMSG;
				goto out_unlock;
			}

			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
				if (filp->f_flags & O_NONBLOCK) {
					done = -EAGAIN;
					goto out_unlock;
				}
			}
		}

		if (end_of_data)
			goto out_unlock;

		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static __poll_t mlx5vf_save_poll(struct file *filp,
				 struct poll_table_struct *wait)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	__poll_t pollflags = 0;

	poll_wait(filp, &migf->poll_wait, wait);

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
	else if (!list_empty(&migf->buf_list) ||
		 migf->state == MLX5_MIGF_STATE_COMPLETE)
		pollflags = EPOLLIN | EPOLLRDNORM;
	mutex_unlock(&migf->lock);

	return pollflags;
}

/*
 * FD is exposed and user can use it after receiving an error.
 * Mark migf in error, and wake the user.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
	migf->state = MLX5_MIGF_STATE_ERROR;
	wake_up_interruptible(&migf->poll_wait);
}

void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
				   u8 chunk_num, size_t next_required_umem_size)
{
	migf->save_data[chunk_num - 1].next_required_umem_size =
		next_required_umem_size;
	migf->save_data[chunk_num - 1].migf = migf;
	get_file(migf->filp);
	queue_work(migf->mvdev->cb_wq,
		   &migf->save_data[chunk_num - 1].work);
}

static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
				  u8 index, size_t required_length)
{
	struct mlx5_vhca_data_buffer *buf = migf->buf[index];
	u8 chunk_num;

	WARN_ON(!buf);
	chunk_num = buf->stop_copy_chunk_num;
	buf->migf->buf[index] = NULL;
	/* Checking whether the pre-allocated buffer can fit */
	if (buf->allocated_length >= required_length)
		return buf;

	mlx5vf_put_data_buffer(buf);
	buf = mlx5vf_get_data_buffer(buf->migf, required_length,
				     DMA_FROM_DEVICE);
	if (IS_ERR(buf))
		return buf;

	buf->stop_copy_chunk_num = chunk_num;
	return buf;
}
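/*
 * Worker for chunk mode: once userspace consumed a ready chunk and the
 * device reported more dirty data, issue the next SAVE into a (possibly
 * re-sized) pre-allocated buffer. Runs on mvdev->cb_wq with a reference
 * on the migration file held.
 */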
static void mlx5vf_mig_file_save_work(struct work_struct *_work)
{
	struct mlx5vf_save_work_data *save_data = container_of(_work,
		struct mlx5vf_save_work_data, work);
	struct mlx5_vf_migration_file *migf = save_data->migf;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;

	mutex_lock(&mvdev->state_mutex);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		goto end;

	buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
						save_data->chunk_num - 1,
						save_data->next_required_umem_size);
	if (IS_ERR(buf))
		goto err;

	if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
		goto err_save;

	goto end;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	fput(migf->filp);
}

static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
				       bool track)
{
	size_t size = sizeof(struct mlx5_vf_migration_header) +
		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
	struct mlx5_vf_migration_tag_stop_copy_data data = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;
	int ret;

	header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
	if (IS_ERR(header_buf))
		return PTR_ERR(header_buf);

	header.record_size = cpu_to_le64(sizeof(data));
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page) {
		ret = -EINVAL;
		goto err;
	}
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	header_buf->length = sizeof(header);
	data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
	memcpy(to_buff + sizeof(header), &data, sizeof(data));
	header_buf->length += sizeof(data);
	kunmap_local(to_buff);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	if (track)
		migf->pre_copy_initial_bytes = size;
	return 0;
err:
	mlx5vf_put_data_buffer(header_buf);
	return ret;
}
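/*
 * Pre-allocate the data and header buffers used during stop-copy. In chunk
 * mode up to MAX_NUM_CHUNKS chunks are prepared and each gets its own save
 * work item; otherwise a single buffer is sized from the queried state size
 * (padded by 10 percent when pre-copy tracking may still grow it).
 */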
static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
				 struct mlx5_vf_migration_file *migf,
				 size_t state_size, u64 full_size,
				 bool track)
{
	struct mlx5_vhca_data_buffer *buf;
	size_t inc_state_size;
	int num_chunks;
	int ret;
	int i;

	if (mvdev->chunk_mode) {
		size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);

		/* from the firmware's perspective, a buffer of at least 'state_size' must be set */
		inc_state_size = max(state_size, chunk_size);
	} else {
		if (track) {
			/* let's be ready for a stop_copy size that might grow by 10 percent */
			if (check_add_overflow(state_size, state_size / 10,
					       &inc_state_size))
				inc_state_size = state_size;
		} else {
			inc_state_size = state_size;
		}
	}

	/* let's not overflow the device specification max SAVE size */
	inc_state_size = min_t(size_t, inc_state_size,
		(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));

	num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
	for (i = 0; i < num_chunks; i++) {
		buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}

		migf->buf[i] = buf;
		buf = mlx5vf_get_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}
		migf->buf_header[i] = buf;
		if (mvdev->chunk_mode) {
			migf->buf[i]->stop_copy_chunk_num = i + 1;
			migf->buf_header[i]->stop_copy_chunk_num = i + 1;
			INIT_WORK(&migf->save_data[i].work,
				  mlx5vf_mig_file_save_work);
			migf->save_data[i].chunk_num = i + 1;
		}
	}

	ret = mlx5vf_add_stop_copy_header(migf, track);
	if (ret)
		goto err;
	return 0;

err:
	for (i = 0; i < num_chunks; i++) {
		if (migf->buf[i]) {
			mlx5vf_put_data_buffer(migf->buf[i]);
			migf->buf[i] = NULL;
		}
		if (migf->buf_header[i]) {
			mlx5vf_put_data_buffer(migf->buf_header[i]);
			migf->buf_header[i] = NULL;
		}
	}

	return ret;
}

static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data = false;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command when the device is suspended, so as
	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query
	 * for extra bytes that can't be read.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns it's guaranteed that there is no
		 * active SAVE command.
		 * As such, the other code below is safe with the proper locks.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    NULL, MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	if (migf->pre_copy_initial_bytes > *pos) {
		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
	} else {
		info.dirty_bytes = migf->max_pos - *pos;
		if (!info.dirty_bytes)
			end_of_data = true;
		info.dirty_bytes += inc_length;
	}

	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has a
	 * dirty state; save a new state to be ready for it.
	 */
	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	mlx5vf_state_mutex_unlock(mvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static const struct file_operations mlx5vf_save_fops = {
	.owner = THIS_MODULE,
	.read = mlx5vf_save_read,
	.poll = mlx5vf_save_poll,
	.unlocked_ioctl = mlx5vf_precopy_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	if (migf->state == MLX5_MIGF_STATE_ERROR)
		return -ENODEV;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
	if (ret)
		goto err;

	buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto err;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
	if (ret)
		goto err_save;

	return 0;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
	return ret;
}

static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	u64 full_size;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	init_waitqueue_head(&migf->poll_wait);
	init_completion(&migf->save_comp);
	/*
	 * save_comp is being used as a binary semaphore built from
	 * a completion. A normal mutex cannot be used because the lock is
	 * passed between kernel threads and lockdep can't model this.
	 */
	complete(&migf->save_comp);
	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
	if (ret)
		goto out_pd;

	ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
	if (ret)
		goto out_pd;

	if (track) {
		/* leave the allocated buffer ready for the stop-copy phase */
		buf = mlx5vf_alloc_data_buffer(migf,
			migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_pd;
		}
	} else {
		buf = migf->buf[0];
		migf->buf[0] = NULL;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
	if (ret)
		goto out_save;
	return migf;
out_save:
	mlx5vf_free_data_buffer(buf);
out_pd:
	mlx5fv_cmd_clean_migf_resources(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
			      const char __user **buf, size_t *len,
			      loff_t *pos, ssize_t *done)
{
	unsigned long offset;
	size_t page_offset;
	struct page *page;
	size_t page_len;
	u8 *to_buff;
	int ret;

	offset = *pos - vhca_buf->start_pos;
	page_offset = offset % PAGE_SIZE;

	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
	if (!page)
		return -EINVAL;
	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
	kunmap_local(to_buff);
	if (ret)
		return -EFAULT;

	*pos += page_len;
	*done += page_len;
	*buf += page_len;
	*len -= page_len;
	vhca_buf->length += page_len;
	return 0;
}
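/*
 * Without the header format (pre-copy not supported), the stream is a single
 * raw device image; grow the buffer up front to the full requested length.
 */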
static int
mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
				   loff_t requested_length,
				   const char __user **buf, size_t *len,
				   loff_t *pos, ssize_t *done)
{
	int ret;

	if (requested_length > MAX_LOAD_SIZE)
		return -ENOMEM;

	if (vhca_buf->allocated_length < requested_length) {
		ret = mlx5vf_add_migration_pages(
			vhca_buf,
			DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
				     PAGE_SIZE));
		if (ret)
			return ret;
	}

	while (*len) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
						    done);
		if (ret)
			return ret;
	}

	return 0;
}

static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
			 struct mlx5_vhca_data_buffer *vhca_buf,
			 size_t image_size, const char __user **buf,
			 size_t *len, loff_t *pos, ssize_t *done,
			 bool *has_work)
{
	size_t copy_len, to_copy;
	int ret;

	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == image_size) {
		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
		migf->max_pos += image_size;
		*has_work = true;
	}

	return 0;
}

static int
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *vhca_buf,
			       const char __user **buf, size_t *len,
			       loff_t *pos, ssize_t *done)
{
	size_t copy_len, to_copy;
	size_t required_data;
	u8 *to_buff;
	int ret;

	required_data = migf->record_size - vhca_buf->length;
	to_copy = min_t(size_t, *len, required_data);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == migf->record_size) {
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
		{
			struct page *page;

			page = mlx5vf_get_migration_page(vhca_buf, 0);
			if (!page)
				return -EINVAL;
			to_buff = kmap_local_page(page);
			migf->stop_copy_prep_size = min_t(u64,
				le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
			kunmap_local(to_buff);
			break;
		}
		default:
			/* Optional tag */
			break;
		}

		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
		migf->max_pos += migf->record_size;
		vhca_buf->length = 0;
	}

	return 0;
}

static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
			  struct mlx5_vhca_data_buffer *vhca_buf,
			  const char __user **buf,
			  size_t *len, loff_t *pos,
			  ssize_t *done, bool *has_work)
{
	struct page *page;
	size_t copy_len;
	u8 *to_buff;
	int ret;

	copy_len = min_t(size_t, *len,
		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
	page = mlx5vf_get_migration_page(vhca_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
	if (ret) {
		ret = -EFAULT;
		goto end;
	}

	*buf += copy_len;
	*pos += copy_len;
	*done += copy_len;
	*len -= copy_len;
	vhca_buf->length += copy_len;
	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
		u64 record_size;
		u32 flags;

		record_size = le64_to_cpup((__le64 *)to_buff);
		if (record_size > MAX_LOAD_SIZE) {
			ret = -ENOMEM;
			goto end;
		}

		migf->record_size = record_size;
		flags = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, flags)));
		migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, tag)));
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_FW_DATA:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
			break;
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
			break;
		default:
			if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
				ret = -EOPNOTSUPP;
				goto end;
			}
			/* We may read and skip this optional record data */
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
		}

		migf->max_pos += vhca_buf->length;
		vhca_buf->length = 0;
		*has_work = true;
	}
end:
	kunmap_local(to_buff);
	return ret;
}
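/*
 * The resume stream is parsed by a small state machine: each record begins
 * with a mlx5_vf_migration_header (record_size, flags, tag). A FW_DATA
 * record moves through PREP_IMAGE/READ_IMAGE/LOAD_IMAGE and back to
 * READ_HEADER; other records go through PREP_HEADER_DATA/READ_HEADER_DATA,
 * where known tags are parsed and unknown optional ones are skipped.
 * READ_IMAGE_NO_HEADER serves the legacy stream with no headers at all.
 */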
static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
							&buf, &len, pos,
							&done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
			if (vhca_buf_header->allocated_length < migf->record_size) {
				mlx5vf_free_data_buffer(vhca_buf_header);

				migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
						migf->record_size, DMA_NONE);
				if (IS_ERR(migf->buf_header[0])) {
					ret = PTR_ERR(migf->buf_header[0]);
					migf->buf_header[0] = NULL;
					goto out_unlock;
				}

				vhca_buf_header = migf->buf_header[0];
			}

			vhca_buf_header->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
			break;
		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
			ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
							&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			u64 size = max(migf->record_size,
				       migf->stop_copy_prep_size);

			if (vhca_buf->allocated_length < size) {
				mlx5vf_free_data_buffer(vhca_buf);

				migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
							size, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf[0])) {
					ret = PTR_ERR(migf->buf[0]);
					migf->buf[0] = NULL;
					goto out_unlock;
				}

				vhca_buf = migf->buf[0];
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
			ret = mlx5vf_resume_read_image_no_header(vhca_buf,
						requested_length,
						&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, vhca_buf,
						migf->record_size,
						&buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			vhca_buf_header->length = 0;
			/* prep data buf for next image */
			vhca_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}

static const struct file_operations mlx5vf_resume_fops = {
	.owner = THIS_MODULE,
	.write = mlx5vf_resume_write,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};
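/*
 * Create the file backing the RESUMING state. With pre-copy support the
 * stream is header-based, so start in READ_HEADER; otherwise expect a raw
 * image and start in READ_IMAGE_NO_HEADER.
 */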
static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	migf->buf[0] = buf;
	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
		buf = mlx5vf_alloc_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_buf;
		}

		migf->buf_header[0] = buf;
		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
	} else {
		/* Initial state will be to read the image */
		migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
	}

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	return migf;
out_buf:
	mlx5vf_free_data_buffer(migf->buf[0]);
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
{
	if (mvdev->resuming_migf) {
		mlx5vf_disable_fd(mvdev->resuming_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
		fput(mvdev->resuming_migf->filp);
		mvdev->resuming_migf = NULL;
	}
	if (mvdev->saving_migf) {
		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
		cancel_work_sync(&mvdev->saving_migf->async_data.work);
		mlx5vf_disable_fd(mvdev->saving_migf);
		wake_up_interruptible(&mvdev->saving_migf->poll_wait);
		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
		fput(mvdev->saving_migf->filp);
		mvdev->saving_migf = NULL;
	}
}
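/*
 * Implement a single arc of the VFIO migration state machine. Each legal
 * (cur, new) pair maps to one device command and/or migration-file setup;
 * vfio_mig_get_next_state() guarantees only these arcs are requested.
 */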
static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, false);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
			ret = mlx5vf_cmd_load_vhca_state(mvdev,
							 mvdev->resuming_migf,
							 mvdev->resuming_migf->buf[0]);
			if (ret)
				return ERR_PTR(ret);
		}
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, true);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		ret = mlx5vf_pci_save_device_inc_data(mvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}

/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset', if one exists.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev);
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}

static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
			    enum vfio_device_mig_state new_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	while (new_state != mvdev->mig_state) {
		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
					      new_state, &next_state);
		if (ret) {
			res = ERR_PTR(ret);
			break;
		}
		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
		if (IS_ERR(res))
			break;
		mvdev->mig_state = next_state;
		if (WARN_ON(res && new_state != mvdev->mig_state)) {
			fput(res);
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	mlx5vf_state_mutex_unlock(mvdev);
	return res;
}

static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
				    unsigned long *stop_copy_length)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	size_t state_size;
	u64 total_size;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
						    &total_size, 0);
	if (!ret)
		*stop_copy_length = total_size;
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	if (!mvdev->migrate_cap)
		return;

	/*
	 * As the higher VFIO layers are holding locks across reset and using
	 * those same locks with the mm_lock, we need to prevent an ABBA
	 * deadlock with the state_mutex and mm_lock.
	 * In case the state_mutex was taken already, we defer the cleanup work
	 * to the unlock flow of the other running context.
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
	}
	spin_unlock(&mvdev->reset_lock);
	mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct vfio_pci_core_device *vdev = &mvdev->core_device;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (mvdev->migrate_cap)
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_close_migratable(mvdev);
	vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
	.migration_set_state = mlx5vf_pci_set_device_state,
	.migration_get_state = mlx5vf_pci_get_device_state,
	.migration_get_data_size = mlx5vf_pci_get_data_size,
};

static const struct vfio_log_ops mlx5vf_pci_log_ops = {
	.log_start = mlx5vf_start_page_tracker,
	.log_stop = mlx5vf_stop_page_tracker,
	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
};

static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);
	int ret;

	ret = vfio_pci_core_init_dev(core_vdev);
	if (ret)
		return ret;

	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
				  &mlx5vf_pci_log_ops);

	return 0;
}

static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_remove_migratable(mvdev);
	vfio_pci_core_release_dev(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name = "mlx5-vfio-pci",
	.init = mlx5vf_pci_init_dev,
	.release = mlx5vf_pci_release_dev,
	.open_device = mlx5vf_pci_open_device,
	.close_device = mlx5vf_pci_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
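/*
 * Note: drvdata is set before device registration since mlx5vf_drvdata()
 * relies on it in the reset_done and remove paths.
 */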
static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
{
	struct mlx5vf_pci_core_device *mvdev;
	int ret;

	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
				  &pdev->dev, &mlx5vf_pci_ops);
	if (IS_ERR(mvdev))
		return PTR_ERR(mvdev);

	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
	ret = vfio_pci_core_register_device(&mvdev->core_device);
	if (ret)
		goto out_put_vdev;
	return 0;

out_put_vdev:
	vfio_put_device(&mvdev->core_device.vdev);
	return ret;
}

static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	vfio_put_device(&mvdev->core_device.vdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(mlx5vf_pci_driver);

MODULE_IMPORT_NS(IOMMUFD);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");