// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

#define MAX_CHUNK_SIZE SZ_8M

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct mlx5vf_pci_core_device,
			    core_device);
}

struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
			  unsigned long offset)
{
	unsigned long cur_offset = 0;
	struct scatterlist *sg;
	unsigned int i;

	/* All accesses are sequential */
	if (offset < buf->last_offset || !buf->last_offset_sg) {
		buf->last_offset = 0;
		buf->last_offset_sg = buf->table.sgt.sgl;
		buf->sg_last_entry = 0;
	}

	cur_offset = buf->last_offset;

	for_each_sg(buf->last_offset_sg, sg,
		    buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
		if (offset < sg->length + cur_offset) {
			buf->last_offset_sg = sg;
			buf->sg_last_entry += i;
			buf->last_offset = cur_offset;
			return nth_page(sg_page(sg),
					(offset - cur_offset) / PAGE_SIZE);
		}
		cur_offset += sg->length;
	}
	return NULL;
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	mutex_lock(&migf->lock);
	migf->state = MLX5_MIGF_STATE_ERROR;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;

	mlx5vf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}

static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
			      bool *end_of_data)
{
	struct mlx5_vhca_data_buffer *buf;
	bool found = false;

	*end_of_data = false;
	spin_lock_irq(&migf->list_lock);
	if (list_empty(&migf->buf_list)) {
		*end_of_data = true;
		goto end;
	}

	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
			       buf_elm);
	if (pos >= buf->start_pos &&
	    pos < buf->start_pos + buf->length) {
		found = true;
		goto end;
	}

	/*
	 * As we use a stream based FD we may expect having the data always
	 * on first chunk
	 */
	migf->state = MLX5_MIGF_STATE_ERROR;

end:
	spin_unlock_irq(&migf->list_lock);
	return found ? buf : NULL;
}

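/*
 * Called once userspace has fully consumed a data buffer. Stop-copy chunk
 * buffers are parked back into their per-chunk slot and, when the device has
 * already reported the size of the next chunk, the matching SAVE work is
 * queued. Non-chunk buffers are simply recycled onto the avail_list.
 */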
static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
{
	struct mlx5_vf_migration_file *migf = vhca_buf->migf;

	if (vhca_buf->stop_copy_chunk_num) {
		bool is_header = vhca_buf->dma_dir == DMA_NONE;
		u8 chunk_num = vhca_buf->stop_copy_chunk_num;
		size_t next_required_umem_size = 0;

		if (is_header)
			migf->buf_header[chunk_num - 1] = vhca_buf;
		else
			migf->buf[chunk_num - 1] = vhca_buf;

		spin_lock_irq(&migf->list_lock);
		list_del_init(&vhca_buf->buf_elm);
		if (!is_header) {
			next_required_umem_size =
				migf->next_required_umem_size;
			migf->next_required_umem_size = 0;
			migf->num_ready_chunks--;
		}
		spin_unlock_irq(&migf->list_lock);
		if (next_required_umem_size)
			mlx5vf_mig_file_set_save_work(migf, chunk_num,
						      next_required_umem_size);
		return;
	}

	spin_lock_irq(&migf->list_lock);
	list_del_init(&vhca_buf->buf_elm);
	list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
	spin_unlock_irq(&migf->list_lock);
}

static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
			       char __user **buf, size_t *len, loff_t *pos)
{
	unsigned long offset;
	ssize_t done = 0;
	size_t copy_len;

	copy_len = min_t(size_t,
			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
	while (copy_len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		offset = *pos - vhca_buf->start_pos;
		page_offset = offset % PAGE_SIZE;
		offset -= page_offset;
		page = mlx5vf_get_migration_page(vhca_buf, offset);
		if (!page)
			return -EINVAL;
		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret)
			return -EFAULT;
		*pos += page_len;
		*len -= page_len;
		*buf += page_len;
		done += page_len;
		copy_len -= page_len;
	}

	if (*pos >= vhca_buf->start_pos + vhca_buf->length)
		mlx5vf_buf_read_done(vhca_buf);

	return done;
}

static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
				loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
				!list_empty(&migf->buf_list) ||
				migf->state == MLX5_MIGF_STATE_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							 &end_of_data);
		if (first_loop_call) {
			first_loop_call = false;
			/* Temporary end of file as part of PRE_COPY */
			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
				done = -ENOMSG;
				goto out_unlock;
			}

			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
				if (filp->f_flags & O_NONBLOCK) {
					done = -EAGAIN;
					goto out_unlock;
				}
			}
		}

		if (end_of_data)
			goto out_unlock;

		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static __poll_t mlx5vf_save_poll(struct file *filp,
				 struct poll_table_struct *wait)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	__poll_t pollflags = 0;

	poll_wait(filp, &migf->poll_wait, wait);

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
	else if (!list_empty(&migf->buf_list) ||
		 migf->state == MLX5_MIGF_STATE_COMPLETE)
		pollflags = EPOLLIN | EPOLLRDNORM;
	mutex_unlock(&migf->lock);

	return pollflags;
}

/*
 * FD is exposed and user can use it after receiving an error.
 * Mark migf in error, and wake the user.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
	migf->state = MLX5_MIGF_STATE_ERROR;
	wake_up_interruptible(&migf->poll_wait);
}

void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
				   u8 chunk_num, size_t next_required_umem_size)
{
	migf->save_data[chunk_num - 1].next_required_umem_size =
		next_required_umem_size;
	migf->save_data[chunk_num - 1].migf = migf;
	get_file(migf->filp);
	queue_work(migf->mvdev->cb_wq,
		   &migf->save_data[chunk_num - 1].work);
}

static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
				  u8 index, size_t required_length)
{
	struct mlx5_vhca_data_buffer *buf = migf->buf[index];
	u8 chunk_num;

	WARN_ON(!buf);
	chunk_num = buf->stop_copy_chunk_num;
	buf->migf->buf[index] = NULL;
	/* Checking whether the pre-allocated buffer can fit */
	if (buf->allocated_length >= required_length)
		return buf;

	mlx5vf_put_data_buffer(buf);
	buf = mlx5vf_get_data_buffer(buf->migf, required_length,
				     DMA_FROM_DEVICE);
	if (IS_ERR(buf))
		return buf;

	buf->stop_copy_chunk_num = chunk_num;
	return buf;
}

static void mlx5vf_mig_file_save_work(struct work_struct *_work)
{
	struct mlx5vf_save_work_data *save_data = container_of(_work,
		struct mlx5vf_save_work_data, work);
	struct mlx5_vf_migration_file *migf = save_data->migf;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;

	mutex_lock(&mvdev->state_mutex);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		goto end;

	buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
						save_data->chunk_num - 1,
						save_data->next_required_umem_size);
	if (IS_ERR(buf))
		goto err;

	if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
		goto err_save;

	goto end;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	fput(migf->filp);
}

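/*
 * Prepend an optional, tagged record to the save stream that advertises the
 * expected stop_copy size, letting the resuming side pre-allocate a large
 * enough load buffer before the device image itself arrives.
 */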
static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
				       bool track)
{
	size_t size = sizeof(struct mlx5_vf_migration_header) +
		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
	struct mlx5_vf_migration_tag_stop_copy_data data = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;
	int ret;

	header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
	if (IS_ERR(header_buf))
		return PTR_ERR(header_buf);

	header.record_size = cpu_to_le64(sizeof(data));
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page) {
		ret = -EINVAL;
		goto err;
	}
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	header_buf->length = sizeof(header);
	data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
	memcpy(to_buff + sizeof(header), &data, sizeof(data));
	header_buf->length += sizeof(data);
	kunmap_local(to_buff);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	if (track)
		migf->pre_copy_initial_bytes = size;
	return 0;
err:
	mlx5vf_put_data_buffer(header_buf);
	return ret;
}

static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
				 struct mlx5_vf_migration_file *migf,
				 size_t state_size, u64 full_size,
				 bool track)
{
	struct mlx5_vhca_data_buffer *buf;
	size_t inc_state_size;
	int num_chunks;
	int ret;
	int i;

	if (mvdev->chunk_mode) {
		size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);

		/* from firmware perspective at least 'state_size' buffer should be set */
		inc_state_size = max(state_size, chunk_size);
	} else {
		if (track) {
			/* let's be ready for stop_copy size that might grow by 10 percents */
			if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
				inc_state_size = state_size;
		} else {
			inc_state_size = state_size;
		}
	}

	/* let's not overflow the device specification max SAVE size */
	inc_state_size = min_t(size_t, inc_state_size,
		(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));

	num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
	for (i = 0; i < num_chunks; i++) {
		buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}

		migf->buf[i] = buf;
		buf = mlx5vf_get_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}
		migf->buf_header[i] = buf;
		if (mvdev->chunk_mode) {
			migf->buf[i]->stop_copy_chunk_num = i + 1;
			migf->buf_header[i]->stop_copy_chunk_num = i + 1;
			INIT_WORK(&migf->save_data[i].work,
				  mlx5vf_mig_file_save_work);
			migf->save_data[i].chunk_num = i + 1;
		}
	}

	ret = mlx5vf_add_stop_copy_header(migf, track);
	if (ret)
		goto err;
	return 0;

err:
	for (i = 0; i < num_chunks; i++) {
		if (migf->buf[i]) {
			mlx5vf_put_data_buffer(migf->buf[i]);
			migf->buf[i] = NULL;
		}
		if (migf->buf_header[i]) {
			mlx5vf_put_data_buffer(migf->buf_header[i]);
			migf->buf_header[i] = NULL;
		}
	}

	return ret;
}

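/*
 * VFIO_MIG_GET_PRECOPY_INFO handler: reports how much of the initial
 * pre-copy data and of the device's incremental state is still pending and,
 * once the already queued data has been fully read, kicks off another
 * incremental SAVE so the next portion becomes available on the FD.
 */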
static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data = false;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command when the device is suspended, so as
	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
	 * bytes that can't be read.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns it's guaranteed that there is no
		 * active SAVE command.
		 * As so, the other code below is safe with the proper locks.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    NULL, MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	if (migf->pre_copy_initial_bytes > *pos) {
		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
	} else {
		info.dirty_bytes = migf->max_pos - *pos;
		if (!info.dirty_bytes)
			end_of_data = true;
		info.dirty_bytes += inc_length;
	}

	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has a
	 * dirty state, save a new state to be ready for.
	 */
	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	mlx5vf_state_mutex_unlock(mvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static const struct file_operations mlx5vf_save_fops = {
	.owner = THIS_MODULE,
	.read = mlx5vf_save_read,
	.poll = mlx5vf_save_poll,
	.unlocked_ioctl = mlx5vf_precopy_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	if (migf->state == MLX5_MIGF_STATE_ERROR)
		return -ENODEV;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
	if (ret)
		goto err;

	buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto err;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
	if (ret)
		goto err_save;

	return 0;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
	return ret;
}

static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	u64 full_size;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	init_waitqueue_head(&migf->poll_wait);
	init_completion(&migf->save_comp);
	/*
	 * save_comp is being used as a binary semaphore built from
	 * a completion. A normal mutex cannot be used because the lock is
	 * passed between kernel threads and lockdep can't model this.
	 */
	complete(&migf->save_comp);
	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
	if (ret)
		goto out_pd;

	ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
	if (ret)
		goto out_pd;

	if (track) {
		/* leave the allocated buffer ready for the stop-copy phase */
		buf = mlx5vf_alloc_data_buffer(migf,
			migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_pd;
		}
	} else {
		buf = migf->buf[0];
		migf->buf[0] = NULL;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
	if (ret)
		goto out_save;
	return migf;
out_save:
	mlx5vf_free_data_buffer(buf);
out_pd:
	mlx5fv_cmd_clean_migf_resources(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
			      const char __user **buf, size_t *len,
			      loff_t *pos, ssize_t *done)
{
	unsigned long offset;
	size_t page_offset;
	struct page *page;
	size_t page_len;
	u8 *to_buff;
	int ret;

	offset = *pos - vhca_buf->start_pos;
	page_offset = offset % PAGE_SIZE;

	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
	if (!page)
		return -EINVAL;
	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
	kunmap_local(to_buff);
	if (ret)
		return -EFAULT;

	*pos += page_len;
	*done += page_len;
	*buf += page_len;
	*len -= page_len;
	vhca_buf->length += page_len;
	return 0;
}

static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
			 struct mlx5_vhca_data_buffer *vhca_buf,
			 size_t image_size, const char __user **buf,
			 size_t *len, loff_t *pos, ssize_t *done,
			 bool *has_work)
{
	size_t copy_len, to_copy;
	int ret;

	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == image_size) {
		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
		migf->max_pos += image_size;
		*has_work = true;
	}

	return 0;
}

static int
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *vhca_buf,
			       const char __user **buf, size_t *len,
			       loff_t *pos, ssize_t *done)
{
	size_t copy_len, to_copy;
	size_t required_data;
	u8 *to_buff;
	int ret;

	required_data = migf->record_size - vhca_buf->length;
	to_copy = min_t(size_t, *len, required_data);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == migf->record_size) {
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
		{
			struct page *page;

			page = mlx5vf_get_migration_page(vhca_buf, 0);
			if (!page)
				return -EINVAL;
			to_buff = kmap_local_page(page);
			migf->stop_copy_prep_size = min_t(u64,
				le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
			kunmap_local(to_buff);
			break;
		}
		default:
			/* Optional tag */
			break;
		}

		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
		migf->max_pos += migf->record_size;
		vhca_buf->length = 0;
	}

	return 0;
}

static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
			  struct mlx5_vhca_data_buffer *vhca_buf,
			  const char __user **buf,
			  size_t *len, loff_t *pos,
			  ssize_t *done, bool *has_work)
{
	struct page *page;
	size_t copy_len;
	u8 *to_buff;
	int ret;

	copy_len = min_t(size_t, *len,
		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
	page = mlx5vf_get_migration_page(vhca_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
	if (ret) {
		ret = -EFAULT;
		goto end;
	}

	*buf += copy_len;
	*pos += copy_len;
	*done += copy_len;
	*len -= copy_len;
	vhca_buf->length += copy_len;
	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
		u64 record_size;
		u32 flags;

		record_size = le64_to_cpup((__le64 *)to_buff);
		if (record_size > MAX_LOAD_SIZE) {
			ret = -ENOMEM;
			goto end;
		}

		migf->record_size = record_size;
		flags = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, flags)));
		migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, tag)));
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_FW_DATA:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
			break;
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
			break;
		default:
			if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
				ret = -EOPNOTSUPP;
				goto end;
			}
			/* We may read and skip this optional record data */
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
		}

		migf->max_pos += vhca_buf->length;
		vhca_buf->length = 0;
		*has_work = true;
	}
end:
	kunmap_local(to_buff);
	return ret;
}

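/*
 * The resume FD write() path is driven by a small state machine
 * (migf->load_state): each record starts with a header, optionally followed
 * by header data, then the image payload which is staged into a data buffer
 * and finally pushed to the device via mlx5vf_cmd_load_vhca_state() before
 * the next record is parsed.
 */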
static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
							&buf, &len, pos,
							&done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
			if (vhca_buf_header->allocated_length < migf->record_size) {
				mlx5vf_free_data_buffer(vhca_buf_header);

				migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
						migf->record_size, DMA_NONE);
				if (IS_ERR(migf->buf_header[0])) {
					ret = PTR_ERR(migf->buf_header[0]);
					migf->buf_header[0] = NULL;
					goto out_unlock;
				}

				vhca_buf_header = migf->buf_header[0];
			}

			vhca_buf_header->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
			break;
		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
			ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
							&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			u64 size = max(migf->record_size,
				       migf->stop_copy_prep_size);

			if (vhca_buf->allocated_length < size) {
				mlx5vf_free_data_buffer(vhca_buf);

				migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
							size, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf[0])) {
					ret = PTR_ERR(migf->buf[0]);
					migf->buf[0] = NULL;
					goto out_unlock;
				}

				vhca_buf = migf->buf[0];
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, vhca_buf,
						migf->record_size,
						&buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			vhca_buf_header->length = 0;
			/* prep data buf for next image */
			vhca_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}

static const struct file_operations mlx5vf_resume_fops = {
	.owner = THIS_MODULE,
	.write = mlx5vf_resume_write,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	migf->buf[0] = buf;
	buf = mlx5vf_alloc_data_buffer(migf,
		sizeof(struct mlx5_vf_migration_header), DMA_NONE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_buf;
	}

	migf->buf_header[0] = buf;
	migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	return migf;
out_buf:
	mlx5vf_free_data_buffer(migf->buf[0]);
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
			enum mlx5_vf_migf_state *last_save_state)
{
	if (mvdev->resuming_migf) {
		mlx5vf_disable_fd(mvdev->resuming_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
		fput(mvdev->resuming_migf->filp);
		mvdev->resuming_migf = NULL;
	}
	if (mvdev->saving_migf) {
		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
		cancel_work_sync(&mvdev->saving_migf->async_data.work);
		if (last_save_state)
			*last_save_state = mvdev->saving_migf->state;
		mlx5vf_disable_fd(mvdev->saving_migf);
		wake_up_interruptible(&mvdev->saving_migf->poll_wait);
		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
		fput(mvdev->saving_migf->filp);
		mvdev->saving_migf = NULL;
	}
}

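/*
 * Execute a single step of the migration state machine. 'new' is always an
 * adjacent state as computed by vfio_mig_get_next_state(); each arc either
 * issues the matching device command or creates/tears down a migration FD.
 */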
static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, false);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
		mlx5vf_disable_fds(mvdev, NULL);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
		struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
		struct mlx5_vhca_data_buffer *buf;
		enum mlx5_vf_migf_state state;
		size_t size;

		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL,
					MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
		if (ret)
			return ERR_PTR(ret);
		buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE);
		if (IS_ERR(buf))
			return ERR_CAST(buf);
		/* pre_copy cleanup */
		ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false);
		if (ret) {
			mlx5vf_put_data_buffer(buf);
			return ERR_PTR(ret);
		}
		mlx5vf_disable_fds(mvdev, &state);
		return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO);
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		mlx5vf_disable_fds(mvdev, NULL);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, true);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		ret = mlx5vf_pci_save_device_inc_data(mvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}

1215 */ 1216 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev) 1217 { 1218 again: 1219 spin_lock(&mvdev->reset_lock); 1220 if (mvdev->deferred_reset) { 1221 mvdev->deferred_reset = false; 1222 spin_unlock(&mvdev->reset_lock); 1223 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 1224 mlx5vf_disable_fds(mvdev, NULL); 1225 goto again; 1226 } 1227 mutex_unlock(&mvdev->state_mutex); 1228 spin_unlock(&mvdev->reset_lock); 1229 } 1230 1231 static struct file * 1232 mlx5vf_pci_set_device_state(struct vfio_device *vdev, 1233 enum vfio_device_mig_state new_state) 1234 { 1235 struct mlx5vf_pci_core_device *mvdev = container_of( 1236 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1237 enum vfio_device_mig_state next_state; 1238 struct file *res = NULL; 1239 int ret; 1240 1241 mutex_lock(&mvdev->state_mutex); 1242 while (new_state != mvdev->mig_state) { 1243 ret = vfio_mig_get_next_state(vdev, mvdev->mig_state, 1244 new_state, &next_state); 1245 if (ret) { 1246 res = ERR_PTR(ret); 1247 break; 1248 } 1249 res = mlx5vf_pci_step_device_state_locked(mvdev, next_state); 1250 if (IS_ERR(res)) 1251 break; 1252 mvdev->mig_state = next_state; 1253 if (WARN_ON(res && new_state != mvdev->mig_state)) { 1254 fput(res); 1255 res = ERR_PTR(-EINVAL); 1256 break; 1257 } 1258 } 1259 mlx5vf_state_mutex_unlock(mvdev); 1260 return res; 1261 } 1262 1263 static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, 1264 unsigned long *stop_copy_length) 1265 { 1266 struct mlx5vf_pci_core_device *mvdev = container_of( 1267 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1268 size_t state_size; 1269 u64 total_size; 1270 int ret; 1271 1272 mutex_lock(&mvdev->state_mutex); 1273 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size, 1274 &total_size, 0); 1275 if (!ret) 1276 *stop_copy_length = total_size; 1277 mlx5vf_state_mutex_unlock(mvdev); 1278 return ret; 1279 } 1280 1281 static int mlx5vf_pci_get_device_state(struct vfio_device *vdev, 1282 enum vfio_device_mig_state *curr_state) 1283 { 1284 struct mlx5vf_pci_core_device *mvdev = container_of( 1285 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1286 1287 mutex_lock(&mvdev->state_mutex); 1288 *curr_state = mvdev->mig_state; 1289 mlx5vf_state_mutex_unlock(mvdev); 1290 return 0; 1291 } 1292 1293 static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev) 1294 { 1295 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev); 1296 1297 if (!mvdev->migrate_cap) 1298 return; 1299 1300 /* 1301 * As the higher VFIO layers are holding locks across reset and using 1302 * those same locks with the mm_lock we need to prevent ABBA deadlock 1303 * with the state_mutex and mm_lock. 1304 * In case the state_mutex was taken already we defer the cleanup work 1305 * to the unlock flow of the other running context. 
1306 */ 1307 spin_lock(&mvdev->reset_lock); 1308 mvdev->deferred_reset = true; 1309 if (!mutex_trylock(&mvdev->state_mutex)) { 1310 spin_unlock(&mvdev->reset_lock); 1311 return; 1312 } 1313 spin_unlock(&mvdev->reset_lock); 1314 mlx5vf_state_mutex_unlock(mvdev); 1315 } 1316 1317 static int mlx5vf_pci_open_device(struct vfio_device *core_vdev) 1318 { 1319 struct mlx5vf_pci_core_device *mvdev = container_of( 1320 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1321 struct vfio_pci_core_device *vdev = &mvdev->core_device; 1322 int ret; 1323 1324 ret = vfio_pci_core_enable(vdev); 1325 if (ret) 1326 return ret; 1327 1328 if (mvdev->migrate_cap) 1329 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 1330 vfio_pci_core_finish_enable(vdev); 1331 return 0; 1332 } 1333 1334 static void mlx5vf_pci_close_device(struct vfio_device *core_vdev) 1335 { 1336 struct mlx5vf_pci_core_device *mvdev = container_of( 1337 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1338 1339 mlx5vf_cmd_close_migratable(mvdev); 1340 vfio_pci_core_close_device(core_vdev); 1341 } 1342 1343 static const struct vfio_migration_ops mlx5vf_pci_mig_ops = { 1344 .migration_set_state = mlx5vf_pci_set_device_state, 1345 .migration_get_state = mlx5vf_pci_get_device_state, 1346 .migration_get_data_size = mlx5vf_pci_get_data_size, 1347 }; 1348 1349 static const struct vfio_log_ops mlx5vf_pci_log_ops = { 1350 .log_start = mlx5vf_start_page_tracker, 1351 .log_stop = mlx5vf_stop_page_tracker, 1352 .log_read_and_clear = mlx5vf_tracker_read_and_clear, 1353 }; 1354 1355 static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev) 1356 { 1357 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev, 1358 struct mlx5vf_pci_core_device, core_device.vdev); 1359 int ret; 1360 1361 ret = vfio_pci_core_init_dev(core_vdev); 1362 if (ret) 1363 return ret; 1364 1365 mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops, 1366 &mlx5vf_pci_log_ops); 1367 1368 return 0; 1369 } 1370 1371 static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev) 1372 { 1373 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev, 1374 struct mlx5vf_pci_core_device, core_device.vdev); 1375 1376 mlx5vf_cmd_remove_migratable(mvdev); 1377 vfio_pci_core_release_dev(core_vdev); 1378 } 1379 1380 static const struct vfio_device_ops mlx5vf_pci_ops = { 1381 .name = "mlx5-vfio-pci", 1382 .init = mlx5vf_pci_init_dev, 1383 .release = mlx5vf_pci_release_dev, 1384 .open_device = mlx5vf_pci_open_device, 1385 .close_device = mlx5vf_pci_close_device, 1386 .ioctl = vfio_pci_core_ioctl, 1387 .device_feature = vfio_pci_core_ioctl_feature, 1388 .read = vfio_pci_core_read, 1389 .write = vfio_pci_core_write, 1390 .mmap = vfio_pci_core_mmap, 1391 .request = vfio_pci_core_request, 1392 .match = vfio_pci_core_match, 1393 .bind_iommufd = vfio_iommufd_physical_bind, 1394 .unbind_iommufd = vfio_iommufd_physical_unbind, 1395 .attach_ioas = vfio_iommufd_physical_attach_ioas, 1396 .detach_ioas = vfio_iommufd_physical_detach_ioas, 1397 }; 1398 1399 static int mlx5vf_pci_probe(struct pci_dev *pdev, 1400 const struct pci_device_id *id) 1401 { 1402 struct mlx5vf_pci_core_device *mvdev; 1403 int ret; 1404 1405 mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev, 1406 &pdev->dev, &mlx5vf_pci_ops); 1407 if (IS_ERR(mvdev)) 1408 return PTR_ERR(mvdev); 1409 1410 dev_set_drvdata(&pdev->dev, &mvdev->core_device); 1411 ret = vfio_pci_core_register_device(&mvdev->core_device); 1412 if (ret) 1413 goto out_put_vdev; 1414 return 0; 1415 1416 out_put_vdev: 1417 
static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	vfio_put_device(&mvdev->core_device.vdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(mlx5vf_pci_driver);

MODULE_IMPORT_NS(IOMMUFD);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");