// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

#define MAX_CHUNK_SIZE SZ_8M

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct mlx5vf_pci_core_device,
			    core_device);
}

struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
			  unsigned long offset)
{
	unsigned long cur_offset = 0;
	struct scatterlist *sg;
	unsigned int i;

	/* All accesses are sequential */
	if (offset < buf->last_offset || !buf->last_offset_sg) {
		buf->last_offset = 0;
		buf->last_offset_sg = buf->table.sgt.sgl;
		buf->sg_last_entry = 0;
	}

	cur_offset = buf->last_offset;

	for_each_sg(buf->last_offset_sg, sg,
		    buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
		if (offset < sg->length + cur_offset) {
			buf->last_offset_sg = sg;
			buf->sg_last_entry += i;
			buf->last_offset = cur_offset;
			return nth_page(sg_page(sg),
					(offset - cur_offset) / PAGE_SIZE);
		}
		cur_offset += sg->length;
	}
	return NULL;
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	mutex_lock(&migf->lock);
	migf->state = MLX5_MIGF_STATE_ERROR;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;

	mlx5vf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}

static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
			      bool *end_of_data)
{
	struct mlx5_vhca_data_buffer *buf;
	bool found = false;

	*end_of_data = false;
	spin_lock_irq(&migf->list_lock);
	if (list_empty(&migf->buf_list)) {
		*end_of_data = true;
		goto end;
	}

	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
			       buf_elm);
	if (pos >= buf->start_pos &&
	    pos < buf->start_pos + buf->length) {
		found = true;
		goto end;
	}

	/*
	 * Since this is a stream-based FD, the data is always expected to be
	 * in the first chunk.
	 */
	migf->state = MLX5_MIGF_STATE_ERROR;

end:
	spin_unlock_irq(&migf->list_lock);
	return found ? buf : NULL;
}
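
/*
 * Userspace has consumed a whole buffer: stop-copy chunk buffers are handed
 * back to migf->buf[]/migf->buf_header[] for reuse (queuing the next SAVE if
 * its required size is already known), other buffers are recycled onto the
 * avail_list.
 */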
static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
{
	struct mlx5_vf_migration_file *migf = vhca_buf->migf;

	if (vhca_buf->stop_copy_chunk_num) {
		bool is_header = vhca_buf->dma_dir == DMA_NONE;
		u8 chunk_num = vhca_buf->stop_copy_chunk_num;
		size_t next_required_umem_size = 0;

		if (is_header)
			migf->buf_header[chunk_num - 1] = vhca_buf;
		else
			migf->buf[chunk_num - 1] = vhca_buf;

		spin_lock_irq(&migf->list_lock);
		list_del_init(&vhca_buf->buf_elm);
		if (!is_header) {
			next_required_umem_size =
				migf->next_required_umem_size;
			migf->next_required_umem_size = 0;
			migf->num_ready_chunks--;
		}
		spin_unlock_irq(&migf->list_lock);
		if (next_required_umem_size)
			mlx5vf_mig_file_set_save_work(migf, chunk_num,
						      next_required_umem_size);
		return;
	}

	spin_lock_irq(&migf->list_lock);
	list_del_init(&vhca_buf->buf_elm);
	list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
	spin_unlock_irq(&migf->list_lock);
}

static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
			       char __user **buf, size_t *len, loff_t *pos)
{
	unsigned long offset;
	ssize_t done = 0;
	size_t copy_len;

	copy_len = min_t(size_t,
			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
	while (copy_len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		offset = *pos - vhca_buf->start_pos;
		page_offset = offset % PAGE_SIZE;
		offset -= page_offset;
		page = mlx5vf_get_migration_page(vhca_buf, offset);
		if (!page)
			return -EINVAL;
		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret)
			return -EFAULT;
		*pos += page_len;
		*len -= page_len;
		*buf += page_len;
		done += page_len;
		copy_len -= page_len;
	}

	if (*pos >= vhca_buf->start_pos + vhca_buf->length)
		mlx5vf_buf_read_done(vhca_buf);

	return done;
}

static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
				loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
				!list_empty(&migf->buf_list) ||
				migf->state == MLX5_MIGF_STATE_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							 &end_of_data);
		if (first_loop_call) {
			first_loop_call = false;
			/* Temporary end of file as part of PRE_COPY */
			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
				done = -ENOMSG;
				goto out_unlock;
			}

			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
				if (filp->f_flags & O_NONBLOCK) {
					done = -EAGAIN;
					goto out_unlock;
				}
			}
		}

		if (end_of_data)
			goto out_unlock;

		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static __poll_t mlx5vf_save_poll(struct file *filp,
				 struct poll_table_struct *wait)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	__poll_t pollflags = 0;

	poll_wait(filp, &migf->poll_wait, wait);

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
	else if (!list_empty(&migf->buf_list) ||
		 migf->state == MLX5_MIGF_STATE_COMPLETE)
		pollflags = EPOLLIN | EPOLLRDNORM;
	mutex_unlock(&migf->lock);

	return pollflags;
}

/*
 * FD is exposed and user can use it after receiving an error.
 * Mark migf in error, and wake the user.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
	migf->state = MLX5_MIGF_STATE_ERROR;
	wake_up_interruptible(&migf->poll_wait);
}

void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
				   u8 chunk_num, size_t next_required_umem_size)
{
	migf->save_data[chunk_num - 1].next_required_umem_size =
		next_required_umem_size;
	migf->save_data[chunk_num - 1].migf = migf;
	get_file(migf->filp);
	queue_work(migf->mvdev->cb_wq,
		   &migf->save_data[chunk_num - 1].work);
}

static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
				  u8 index, size_t required_length)
{
	struct mlx5_vhca_data_buffer *buf = migf->buf[index];
	u8 chunk_num;

	WARN_ON(!buf);
	chunk_num = buf->stop_copy_chunk_num;
	buf->migf->buf[index] = NULL;
	/* Checking whether the pre-allocated buffer can fit */
	if (buf->allocated_length >= required_length)
		return buf;

	mlx5vf_put_data_buffer(buf);
	buf = mlx5vf_get_data_buffer(buf->migf, required_length,
				     DMA_FROM_DEVICE);
	if (IS_ERR(buf))
		return buf;

	buf->stop_copy_chunk_num = chunk_num;
	return buf;
}

static void mlx5vf_mig_file_save_work(struct work_struct *_work)
{
	struct mlx5vf_save_work_data *save_data = container_of(_work,
		struct mlx5vf_save_work_data, work);
	struct mlx5_vf_migration_file *migf = save_data->migf;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;

	mutex_lock(&mvdev->state_mutex);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		goto end;

	buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
						save_data->chunk_num - 1,
						save_data->next_required_umem_size);
	if (IS_ERR(buf))
		goto err;

	if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
		goto err_save;

	goto end;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	fput(migf->filp);
}
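
/*
 * Add an optional header record that advertises the expected stop_copy size
 * (the pre-allocated first data buffer length) to userspace.
 */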
static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
				       bool track)
{
	size_t size = sizeof(struct mlx5_vf_migration_header) +
		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
	struct mlx5_vf_migration_tag_stop_copy_data data = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;
	int ret;

	header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
	if (IS_ERR(header_buf))
		return PTR_ERR(header_buf);

	header.record_size = cpu_to_le64(sizeof(data));
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page) {
		ret = -EINVAL;
		goto err;
	}
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	header_buf->length = sizeof(header);
	data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
	memcpy(to_buff + sizeof(header), &data, sizeof(data));
	header_buf->length += sizeof(data);
	kunmap_local(to_buff);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	if (track)
		migf->pre_copy_initial_bytes = size;
	return 0;
err:
	mlx5vf_put_data_buffer(header_buf);
	return ret;
}

static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
				 struct mlx5_vf_migration_file *migf,
				 size_t state_size, u64 full_size,
				 bool track)
{
	struct mlx5_vhca_data_buffer *buf;
	size_t inc_state_size;
	int num_chunks;
	int ret;
	int i;

	if (mvdev->chunk_mode) {
		size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);

		/* From the firmware's perspective, the buffer must be at least 'state_size' */
		inc_state_size = max(state_size, chunk_size);
	} else {
		if (track) {
			/* Be ready for a stop_copy size that might grow by 10 percent */
			if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
				inc_state_size = state_size;
		} else {
			inc_state_size = state_size;
		}
	}

	/* Don't exceed the device specification max SAVE size */
	inc_state_size = min_t(size_t, inc_state_size,
		(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));

	num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
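	/*
	 * Pre-allocate a data buffer and a header buffer per chunk; in chunk
	 * mode each pair is tagged with its chunk number and gets a dedicated
	 * save work item.
	 */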
	for (i = 0; i < num_chunks; i++) {
		buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}

		migf->buf[i] = buf;
		buf = mlx5vf_get_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}
		migf->buf_header[i] = buf;
		if (mvdev->chunk_mode) {
			migf->buf[i]->stop_copy_chunk_num = i + 1;
			migf->buf_header[i]->stop_copy_chunk_num = i + 1;
			INIT_WORK(&migf->save_data[i].work,
				  mlx5vf_mig_file_save_work);
			migf->save_data[i].chunk_num = i + 1;
		}
	}

	ret = mlx5vf_add_stop_copy_header(migf, track);
	if (ret)
		goto err;
	return 0;

err:
	for (i = 0; i < num_chunks; i++) {
		if (migf->buf[i]) {
			mlx5vf_put_data_buffer(migf->buf[i]);
			migf->buf[i] = NULL;
		}
		if (migf->buf_header[i]) {
			mlx5vf_put_data_buffer(migf->buf_header[i]);
			migf->buf_header[i] = NULL;
		}
	}

	return ret;
}

static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data = false;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command when the device is suspended, so while
	 * in VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query for
	 * extra bytes that can't be read.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns it's guaranteed that there is no
		 * active SAVE command.
		 * As such, the other code below is safe with the proper locks.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    NULL, MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	if (migf->pre_copy_initial_bytes > *pos) {
		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
	} else {
		info.dirty_bytes = migf->max_pos - *pos;
		if (!info.dirty_bytes)
			end_of_data = true;
		info.dirty_bytes += inc_length;
	}

	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has a
	 * dirty state; save a new state so it is ready to be read.
	 */
	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	mlx5vf_state_mutex_unlock(mvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static const struct file_operations mlx5vf_save_fops = {
	.owner = THIS_MODULE,
	.read = mlx5vf_save_read,
	.poll = mlx5vf_save_poll,
	.unlocked_ioctl = mlx5vf_precopy_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.release = mlx5vf_release_file,
};

static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	if (migf->state == MLX5_MIGF_STATE_ERROR)
		return -ENODEV;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
	if (ret)
		goto err;

	buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto err;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
	if (ret)
		goto err_save;

	return 0;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
	return ret;
}

static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	u64 full_size;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	init_waitqueue_head(&migf->poll_wait);
	init_completion(&migf->save_comp);
	/*
	 * save_comp is being used as a binary semaphore built from
	 * a completion. A normal mutex cannot be used because the lock is
	 * passed between kernel threads and lockdep can't model this.
	 */
	complete(&migf->save_comp);
	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
	if (ret)
		goto out_pd;

	ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
	if (ret)
		goto out_pd;

	if (track) {
		/* leave the allocated buffer ready for the stop-copy phase */
		buf = mlx5vf_alloc_data_buffer(migf,
			migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_pd;
		}
	} else {
		buf = migf->buf[0];
		migf->buf[0] = NULL;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
	if (ret)
		goto out_save;
	return migf;
out_save:
	mlx5vf_free_data_buffer(buf);
out_pd:
	mlx5fv_cmd_clean_migf_resources(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
			      const char __user **buf, size_t *len,
			      loff_t *pos, ssize_t *done)
{
	unsigned long offset;
	size_t page_offset;
	struct page *page;
	size_t page_len;
	u8 *to_buff;
	int ret;

	offset = *pos - vhca_buf->start_pos;
	page_offset = offset % PAGE_SIZE;

	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
	if (!page)
		return -EINVAL;
	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
	kunmap_local(to_buff);
	if (ret)
		return -EFAULT;

	*pos += page_len;
	*done += page_len;
	*buf += page_len;
	*len -= page_len;
	vhca_buf->length += page_len;
	return 0;
}

static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
			 struct mlx5_vhca_data_buffer *vhca_buf,
			 size_t image_size, const char __user **buf,
			 size_t *len, loff_t *pos, ssize_t *done,
			 bool *has_work)
{
	size_t copy_len, to_copy;
	int ret;

	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == image_size) {
		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
		migf->max_pos += image_size;
		*has_work = true;
	}

	return 0;
}

static int
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *vhca_buf,
			       const char __user **buf, size_t *len,
			       loff_t *pos, ssize_t *done)
{
	size_t copy_len, to_copy;
	size_t required_data;
	u8 *to_buff;
	int ret;

	required_data = migf->record_size - vhca_buf->length;
	to_copy = min_t(size_t, *len, required_data);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == migf->record_size) {
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
		{
			struct page *page;

			page = mlx5vf_get_migration_page(vhca_buf, 0);
			if (!page)
				return -EINVAL;
			to_buff = kmap_local_page(page);
			migf->stop_copy_prep_size = min_t(u64,
				le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
			kunmap_local(to_buff);
			break;
		}
		default:
			/* Optional tag */
			break;
		}

		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
		migf->max_pos += migf->record_size;
		vhca_buf->length = 0;
	}

	return 0;
}

static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
			  struct mlx5_vhca_data_buffer *vhca_buf,
			  const char __user **buf,
			  size_t *len, loff_t *pos,
			  ssize_t *done, bool *has_work)
{
	struct page *page;
	size_t copy_len;
	u8 *to_buff;
	int ret;

	copy_len = min_t(size_t, *len,
		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
	page = mlx5vf_get_migration_page(vhca_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
	if (ret) {
		ret = -EFAULT;
		goto end;
	}

	*buf += copy_len;
	*pos += copy_len;
	*done += copy_len;
	*len -= copy_len;
	vhca_buf->length += copy_len;
	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
		u64 record_size;
		u32 flags;

		record_size = le64_to_cpup((__le64 *)to_buff);
		if (record_size > MAX_LOAD_SIZE) {
			ret = -ENOMEM;
			goto end;
		}

		migf->record_size = record_size;
		flags = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, flags)));
		migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, tag)));
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_FW_DATA:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
			break;
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
			break;
		default:
			if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
				ret = -EOPNOTSUPP;
				goto end;
			}
			/* We may read and skip this optional record data */
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
		}

		migf->max_pos += vhca_buf->length;
		vhca_buf->length = 0;
		*has_work = true;
	}
end:
	kunmap_local(to_buff);
	return ret;
}

static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
							&buf, &len, pos,
							&done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
			if (vhca_buf_header->allocated_length < migf->record_size) {
				mlx5vf_free_data_buffer(vhca_buf_header);

				migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
						migf->record_size, DMA_NONE);
				if (IS_ERR(migf->buf_header[0])) {
					ret = PTR_ERR(migf->buf_header[0]);
					migf->buf_header[0] = NULL;
					goto out_unlock;
				}

				vhca_buf_header = migf->buf_header[0];
			}

			vhca_buf_header->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
			break;
		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
			ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
						&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			u64 size = max(migf->record_size,
				       migf->stop_copy_prep_size);

			if (vhca_buf->allocated_length < size) {
				mlx5vf_free_data_buffer(vhca_buf);

				migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
							size, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf[0])) {
					ret = PTR_ERR(migf->buf[0]);
					migf->buf[0] = NULL;
					goto out_unlock;
				}

				vhca_buf = migf->buf[0];
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, vhca_buf,
						migf->record_size,
						&buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			vhca_buf_header->length = 0;
			/* prep data buf for next image */
			vhca_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}

static const struct file_operations mlx5vf_resume_fops = {
	.owner = THIS_MODULE,
	.write = mlx5vf_resume_write,
	.release = mlx5vf_release_file,
};

static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	migf->buf[0] = buf;
	buf = mlx5vf_alloc_data_buffer(migf,
		sizeof(struct mlx5_vf_migration_header), DMA_NONE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_buf;
	}

	migf->buf_header[0] = buf;
	migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	return migf;
out_buf:
	mlx5vf_free_data_buffer(migf->buf[0]);
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
			enum mlx5_vf_migf_state *last_save_state)
{
	if (mvdev->resuming_migf) {
		mlx5vf_disable_fd(mvdev->resuming_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
		fput(mvdev->resuming_migf->filp);
		mvdev->resuming_migf = NULL;
	}
	if (mvdev->saving_migf) {
		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
		cancel_work_sync(&mvdev->saving_migf->async_data.work);
		if (last_save_state)
			*last_save_state = mvdev->saving_migf->state;
		mlx5vf_disable_fd(mvdev->saving_migf);
		wake_up_interruptible(&mvdev->saving_migf->poll_wait);
		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
		fput(mvdev->saving_migf->filp);
		mvdev->saving_migf = NULL;
	}
}

static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, false);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
		mlx5vf_disable_fds(mvdev, NULL);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
		struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
		struct mlx5_vhca_data_buffer *buf;
		enum mlx5_vf_migf_state state;
		size_t size;

		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL,
					MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
		if (ret)
			return ERR_PTR(ret);
		buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE);
		if (IS_ERR(buf))
			return ERR_CAST(buf);
		/* pre_copy cleanup */
		ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false);
		if (ret) {
			mlx5vf_put_data_buffer(buf);
			return ERR_PTR(ret);
		}
		mlx5vf_disable_fds(mvdev, &state);
		return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO);
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		mlx5vf_disable_fds(mvdev, NULL);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, true);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		ret = mlx5vf_pci_save_device_inc_data(mvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}

/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset', if one exists.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev, NULL);
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}

static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
			    enum vfio_device_mig_state new_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	while (new_state != mvdev->mig_state) {
		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
					      new_state, &next_state);
		if (ret) {
			res = ERR_PTR(ret);
			break;
		}
		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
		if (IS_ERR(res))
			break;
		mvdev->mig_state = next_state;
		if (WARN_ON(res && new_state != mvdev->mig_state)) {
			fput(res);
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	mlx5vf_state_mutex_unlock(mvdev);
	return res;
}

static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
				    unsigned long *stop_copy_length)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	size_t state_size;
	u64 total_size;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
						    &total_size, 0);
	if (!ret)
		*stop_copy_length = total_size;
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	if (!mvdev->migrate_cap)
		return;

	/*
	 * As the higher VFIO layers are holding locks across reset and using
	 * those same locks with the mm_lock we need to prevent ABBA deadlock
	 * with the state_mutex and mm_lock.
	 * In case the state_mutex was taken already we defer the cleanup work
	 * to the unlock flow of the other running context.
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
	}
	spin_unlock(&mvdev->reset_lock);
	mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct vfio_pci_core_device *vdev = &mvdev->core_device;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (mvdev->migrate_cap)
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_close_migratable(mvdev);
	vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
	.migration_set_state = mlx5vf_pci_set_device_state,
	.migration_get_state = mlx5vf_pci_get_device_state,
	.migration_get_data_size = mlx5vf_pci_get_data_size,
};

static const struct vfio_log_ops mlx5vf_pci_log_ops = {
	.log_start = mlx5vf_start_page_tracker,
	.log_stop = mlx5vf_stop_page_tracker,
	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
};

static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);
	int ret;

	ret = vfio_pci_core_init_dev(core_vdev);
	if (ret)
		return ret;

	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
				  &mlx5vf_pci_log_ops);

	return 0;
}

static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_remove_migratable(mvdev);
	vfio_pci_core_release_dev(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name = "mlx5-vfio-pci",
	.init = mlx5vf_pci_init_dev,
	.release = mlx5vf_pci_release_dev,
	.open_device = mlx5vf_pci_open_device,
	.close_device = mlx5vf_pci_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};

static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
{
	struct mlx5vf_pci_core_device *mvdev;
	int ret;

	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
				  &pdev->dev, &mlx5vf_pci_ops);
	if (IS_ERR(mvdev))
		return PTR_ERR(mvdev);

	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
	ret = vfio_pci_core_register_device(&mvdev->core_device);
	if (ret)
		goto out_put_vdev;
	return 0;

out_put_vdev:
	vfio_put_device(&mvdev->core_device.vdev);
	return ret;
}

static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	vfio_put_device(&mvdev->core_device.vdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(mlx5vf_pci_driver);

MODULE_IMPORT_NS(IOMMUFD);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");