1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2025 Intel Corporation 4 */ 5 6 #include <linux/anon_inodes.h> 7 #include <linux/delay.h> 8 #include <linux/file.h> 9 #include <linux/module.h> 10 #include <linux/pci.h> 11 #include <linux/sizes.h> 12 #include <linux/types.h> 13 #include <linux/vfio.h> 14 #include <linux/vfio_pci_core.h> 15 16 #include <drm/intel/xe_sriov_vfio.h> 17 #include <drm/intel/pciids.h> 18 19 struct xe_vfio_pci_migration_file { 20 struct file *filp; 21 /* serializes accesses to migration data */ 22 struct mutex lock; 23 struct xe_vfio_pci_core_device *xe_vdev; 24 u8 disabled:1; 25 }; 26 27 struct xe_vfio_pci_core_device { 28 struct vfio_pci_core_device core_device; 29 struct xe_device *xe; 30 /* PF internal control uses vfid index starting from 1 */ 31 unsigned int vfid; 32 u8 deferred_reset:1; 33 /* protects migration state */ 34 struct mutex state_mutex; 35 enum vfio_device_mig_state mig_state; 36 /* protects the reset_done flow */ 37 spinlock_t reset_lock; 38 struct xe_vfio_pci_migration_file *migf; 39 }; 40 41 #define xe_vdev_to_dev(xe_vdev) (&(xe_vdev)->core_device.pdev->dev) 42 43 static void xe_vfio_pci_disable_file(struct xe_vfio_pci_migration_file *migf) 44 { 45 mutex_lock(&migf->lock); 46 migf->disabled = true; 47 mutex_unlock(&migf->lock); 48 } 49 50 static void xe_vfio_pci_put_file(struct xe_vfio_pci_core_device *xe_vdev) 51 { 52 xe_vfio_pci_disable_file(xe_vdev->migf); 53 fput(xe_vdev->migf->filp); 54 xe_vdev->migf = NULL; 55 } 56 57 static void xe_vfio_pci_reset(struct xe_vfio_pci_core_device *xe_vdev) 58 { 59 if (xe_vdev->migf) 60 xe_vfio_pci_put_file(xe_vdev); 61 62 xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 63 } 64 65 static void xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device *xe_vdev) 66 { 67 mutex_lock(&xe_vdev->state_mutex); 68 } 69 70 /* 71 * This function is called in all state_mutex unlock cases to 72 * handle a 'deferred_reset' if exists. 73 */ 74 static void xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device *xe_vdev) 75 { 76 again: 77 spin_lock(&xe_vdev->reset_lock); 78 if (xe_vdev->deferred_reset) { 79 xe_vdev->deferred_reset = false; 80 spin_unlock(&xe_vdev->reset_lock); 81 xe_vfio_pci_reset(xe_vdev); 82 goto again; 83 } 84 mutex_unlock(&xe_vdev->state_mutex); 85 spin_unlock(&xe_vdev->reset_lock); 86 } 87 88 static void xe_vfio_pci_reset_prepare(struct pci_dev *pdev) 89 { 90 struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); 91 int ret; 92 93 if (!pdev->is_virtfn) 94 return; 95 96 ret = xe_sriov_vfio_flr_prepare(xe_vdev->xe, xe_vdev->vfid); 97 if (ret) 98 dev_err(&pdev->dev, "Failed to prepare FLR: %d\n", ret); 99 } 100 101 static void xe_vfio_pci_reset_done(struct pci_dev *pdev) 102 { 103 struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); 104 int ret; 105 106 if (!pdev->is_virtfn) 107 return; 108 109 /* 110 * VF FLR requires additional processing done by PF driver. 111 * The processing is done after FLR is already finished from PCIe 112 * perspective. 113 * In order to avoid a scenario where VF is used while PF processing 114 * is still in progress, additional synchronization point is needed. 115 */ 116 ret = xe_sriov_vfio_wait_flr_done(xe_vdev->xe, xe_vdev->vfid); 117 if (ret) 118 dev_err(&pdev->dev, "Failed to wait for FLR: %d\n", ret); 119 120 if (!xe_vdev->vfid) 121 return; 122 123 /* 124 * As the higher VFIO layers are holding locks across reset and using 125 * those same locks with the mm_lock we need to prevent ABBA deadlock 126 * with the state_mutex and mm_lock. 127 * In case the state_mutex was taken already we defer the cleanup work 128 * to the unlock flow of the other running context. 129 */ 130 spin_lock(&xe_vdev->reset_lock); 131 xe_vdev->deferred_reset = true; 132 if (!mutex_trylock(&xe_vdev->state_mutex)) { 133 spin_unlock(&xe_vdev->reset_lock); 134 return; 135 } 136 spin_unlock(&xe_vdev->reset_lock); 137 xe_vfio_pci_state_mutex_unlock(xe_vdev); 138 139 xe_vfio_pci_reset(xe_vdev); 140 } 141 142 static const struct pci_error_handlers xe_vfio_pci_err_handlers = { 143 .reset_prepare = xe_vfio_pci_reset_prepare, 144 .reset_done = xe_vfio_pci_reset_done, 145 .error_detected = vfio_pci_core_aer_err_detected, 146 }; 147 148 static int xe_vfio_pci_open_device(struct vfio_device *core_vdev) 149 { 150 struct xe_vfio_pci_core_device *xe_vdev = 151 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 152 struct vfio_pci_core_device *vdev = &xe_vdev->core_device; 153 int ret; 154 155 ret = vfio_pci_core_enable(vdev); 156 if (ret) 157 return ret; 158 159 xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 160 161 vfio_pci_core_finish_enable(vdev); 162 163 return 0; 164 } 165 166 static void xe_vfio_pci_close_device(struct vfio_device *core_vdev) 167 { 168 struct xe_vfio_pci_core_device *xe_vdev = 169 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 170 171 xe_vfio_pci_state_mutex_lock(xe_vdev); 172 xe_vfio_pci_reset(xe_vdev); 173 xe_vfio_pci_state_mutex_unlock(xe_vdev); 174 vfio_pci_core_close_device(core_vdev); 175 } 176 177 static int xe_vfio_pci_release_file(struct inode *inode, struct file *filp) 178 { 179 struct xe_vfio_pci_migration_file *migf = filp->private_data; 180 181 mutex_destroy(&migf->lock); 182 kfree(migf); 183 184 return 0; 185 } 186 187 static ssize_t xe_vfio_pci_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) 188 { 189 struct xe_vfio_pci_migration_file *migf = filp->private_data; 190 ssize_t ret; 191 192 if (pos) 193 return -ESPIPE; 194 195 mutex_lock(&migf->lock); 196 if (migf->disabled) { 197 mutex_unlock(&migf->lock); 198 return -ENODEV; 199 } 200 201 ret = xe_sriov_vfio_data_read(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len); 202 mutex_unlock(&migf->lock); 203 204 return ret; 205 } 206 207 static const struct file_operations xe_vfio_pci_save_fops = { 208 .owner = THIS_MODULE, 209 .read = xe_vfio_pci_save_read, 210 .release = xe_vfio_pci_release_file, 211 .llseek = noop_llseek, 212 }; 213 214 static ssize_t xe_vfio_pci_resume_write(struct file *filp, const char __user *buf, 215 size_t len, loff_t *pos) 216 { 217 struct xe_vfio_pci_migration_file *migf = filp->private_data; 218 ssize_t ret; 219 220 if (pos) 221 return -ESPIPE; 222 223 mutex_lock(&migf->lock); 224 if (migf->disabled) { 225 mutex_unlock(&migf->lock); 226 return -ENODEV; 227 } 228 229 ret = xe_sriov_vfio_data_write(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len); 230 mutex_unlock(&migf->lock); 231 232 return ret; 233 } 234 235 static const struct file_operations xe_vfio_pci_resume_fops = { 236 .owner = THIS_MODULE, 237 .write = xe_vfio_pci_resume_write, 238 .release = xe_vfio_pci_release_file, 239 .llseek = noop_llseek, 240 }; 241 242 static const char *vfio_dev_state_str(u32 state) 243 { 244 switch (state) { 245 case VFIO_DEVICE_STATE_RUNNING: return "running"; 246 case VFIO_DEVICE_STATE_RUNNING_P2P: return "running_p2p"; 247 case VFIO_DEVICE_STATE_STOP_COPY: return "stopcopy"; 248 case VFIO_DEVICE_STATE_STOP: return "stop"; 249 case VFIO_DEVICE_STATE_RESUMING: return "resuming"; 250 case VFIO_DEVICE_STATE_ERROR: return "error"; 251 default: return ""; 252 } 253 } 254 255 enum xe_vfio_pci_file_type { 256 XE_VFIO_FILE_SAVE = 0, 257 XE_VFIO_FILE_RESUME, 258 }; 259 260 static struct xe_vfio_pci_migration_file * 261 xe_vfio_pci_alloc_file(struct xe_vfio_pci_core_device *xe_vdev, 262 enum xe_vfio_pci_file_type type) 263 { 264 struct xe_vfio_pci_migration_file *migf; 265 const struct file_operations *fops; 266 int flags; 267 int ret; 268 269 migf = kzalloc_obj(*migf, GFP_KERNEL_ACCOUNT); 270 if (!migf) 271 return ERR_PTR(-ENOMEM); 272 273 fops = type == XE_VFIO_FILE_SAVE ? &xe_vfio_pci_save_fops : &xe_vfio_pci_resume_fops; 274 flags = type == XE_VFIO_FILE_SAVE ? O_RDONLY : O_WRONLY; 275 migf->filp = anon_inode_getfile("xe_vfio_mig", fops, migf, flags); 276 if (IS_ERR(migf->filp)) { 277 ret = PTR_ERR(migf->filp); 278 kfree(migf); 279 return ERR_PTR(ret); 280 } 281 282 mutex_init(&migf->lock); 283 migf->xe_vdev = xe_vdev; 284 xe_vdev->migf = migf; 285 286 stream_open(migf->filp->f_inode, migf->filp); 287 288 return migf; 289 } 290 291 static struct file * 292 xe_vfio_set_state(struct xe_vfio_pci_core_device *xe_vdev, u32 new) 293 { 294 u32 cur = xe_vdev->mig_state; 295 int ret; 296 297 dev_dbg(xe_vdev_to_dev(xe_vdev), 298 "state: %s->%s\n", vfio_dev_state_str(cur), vfio_dev_state_str(new)); 299 300 /* 301 * "STOP" handling is reused for "RUNNING_P2P", as the device doesn't 302 * have the capability to selectively block outgoing p2p DMA transfers. 303 * While the device is allowing BAR accesses when the VF is stopped, it 304 * is not processing any new workload requests, effectively stopping 305 * any outgoing DMA transfers (not just p2p). 306 * Any VRAM / MMIO accesses occurring during "RUNNING_P2P" are kept and 307 * will be migrated to target VF during stop-copy. 308 */ 309 if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { 310 ret = xe_sriov_vfio_suspend_device(xe_vdev->xe, xe_vdev->vfid); 311 if (ret) 312 goto err; 313 314 return NULL; 315 } 316 317 if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) || 318 (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P)) 319 return NULL; 320 321 if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { 322 ret = xe_sriov_vfio_resume_device(xe_vdev->xe, xe_vdev->vfid); 323 if (ret) 324 goto err; 325 326 return NULL; 327 } 328 329 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { 330 struct xe_vfio_pci_migration_file *migf; 331 332 migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_SAVE); 333 if (IS_ERR(migf)) { 334 ret = PTR_ERR(migf); 335 goto err; 336 } 337 get_file(migf->filp); 338 339 ret = xe_sriov_vfio_stop_copy_enter(xe_vdev->xe, xe_vdev->vfid); 340 if (ret) { 341 fput(migf->filp); 342 goto err; 343 } 344 345 return migf->filp; 346 } 347 348 if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) { 349 if (xe_vdev->migf) 350 xe_vfio_pci_put_file(xe_vdev); 351 352 ret = xe_sriov_vfio_stop_copy_exit(xe_vdev->xe, xe_vdev->vfid); 353 if (ret) 354 goto err; 355 356 return NULL; 357 } 358 359 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { 360 struct xe_vfio_pci_migration_file *migf; 361 362 migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_RESUME); 363 if (IS_ERR(migf)) { 364 ret = PTR_ERR(migf); 365 goto err; 366 } 367 get_file(migf->filp); 368 369 ret = xe_sriov_vfio_resume_data_enter(xe_vdev->xe, xe_vdev->vfid); 370 if (ret) { 371 fput(migf->filp); 372 goto err; 373 } 374 375 return migf->filp; 376 } 377 378 if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { 379 if (xe_vdev->migf) 380 xe_vfio_pci_put_file(xe_vdev); 381 382 ret = xe_sriov_vfio_resume_data_exit(xe_vdev->xe, xe_vdev->vfid); 383 if (ret) 384 goto err; 385 386 return NULL; 387 } 388 389 WARN(true, "Unknown state transition %d->%d", cur, new); 390 return ERR_PTR(-EINVAL); 391 392 err: 393 dev_dbg(xe_vdev_to_dev(xe_vdev), 394 "Failed to transition state: %s->%s err=%d\n", 395 vfio_dev_state_str(cur), vfio_dev_state_str(new), ret); 396 return ERR_PTR(ret); 397 } 398 399 static struct file * 400 xe_vfio_pci_set_device_state(struct vfio_device *core_vdev, 401 enum vfio_device_mig_state new_state) 402 { 403 struct xe_vfio_pci_core_device *xe_vdev = 404 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 405 enum vfio_device_mig_state next_state; 406 struct file *f = NULL; 407 int ret; 408 409 xe_vfio_pci_state_mutex_lock(xe_vdev); 410 while (new_state != xe_vdev->mig_state) { 411 ret = vfio_mig_get_next_state(core_vdev, xe_vdev->mig_state, 412 new_state, &next_state); 413 if (ret) { 414 xe_sriov_vfio_error(xe_vdev->xe, xe_vdev->vfid); 415 f = ERR_PTR(ret); 416 break; 417 } 418 f = xe_vfio_set_state(xe_vdev, next_state); 419 if (IS_ERR(f)) 420 break; 421 422 xe_vdev->mig_state = next_state; 423 424 /* Multiple state transitions with non-NULL file in the middle */ 425 if (f && new_state != xe_vdev->mig_state) { 426 fput(f); 427 f = ERR_PTR(-EINVAL); 428 break; 429 } 430 } 431 xe_vfio_pci_state_mutex_unlock(xe_vdev); 432 433 return f; 434 } 435 436 static int xe_vfio_pci_get_device_state(struct vfio_device *core_vdev, 437 enum vfio_device_mig_state *curr_state) 438 { 439 struct xe_vfio_pci_core_device *xe_vdev = 440 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 441 442 xe_vfio_pci_state_mutex_lock(xe_vdev); 443 *curr_state = xe_vdev->mig_state; 444 xe_vfio_pci_state_mutex_unlock(xe_vdev); 445 446 return 0; 447 } 448 449 static int xe_vfio_pci_get_data_size(struct vfio_device *vdev, 450 unsigned long *stop_copy_length) 451 { 452 struct xe_vfio_pci_core_device *xe_vdev = 453 container_of(vdev, struct xe_vfio_pci_core_device, core_device.vdev); 454 455 xe_vfio_pci_state_mutex_lock(xe_vdev); 456 *stop_copy_length = xe_sriov_vfio_stop_copy_size(xe_vdev->xe, xe_vdev->vfid); 457 xe_vfio_pci_state_mutex_unlock(xe_vdev); 458 459 return 0; 460 } 461 462 static const struct vfio_migration_ops xe_vfio_pci_migration_ops = { 463 .migration_set_state = xe_vfio_pci_set_device_state, 464 .migration_get_state = xe_vfio_pci_get_device_state, 465 .migration_get_data_size = xe_vfio_pci_get_data_size, 466 }; 467 468 static void xe_vfio_pci_migration_init(struct xe_vfio_pci_core_device *xe_vdev) 469 { 470 struct vfio_device *core_vdev = &xe_vdev->core_device.vdev; 471 472 if (!xe_sriov_vfio_migration_supported(xe_vdev->xe)) 473 return; 474 475 core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; 476 core_vdev->mig_ops = &xe_vfio_pci_migration_ops; 477 } 478 479 static int xe_vfio_pci_vf_init(struct xe_vfio_pci_core_device *xe_vdev) 480 { 481 struct vfio_device *core_vdev = &xe_vdev->core_device.vdev; 482 struct pci_dev *pdev = to_pci_dev(core_vdev->dev); 483 struct xe_device *xe = xe_sriov_vfio_get_pf(pdev); 484 485 if (!pdev->is_virtfn) 486 return 0; 487 if (!xe) 488 return -ENODEV; 489 xe_vdev->xe = xe; 490 491 /* PF internal control uses vfid index starting from 1 */ 492 xe_vdev->vfid = pci_iov_vf_id(pdev) + 1; 493 494 xe_vfio_pci_migration_init(xe_vdev); 495 496 return 0; 497 } 498 499 static int xe_vfio_pci_init_dev(struct vfio_device *core_vdev) 500 { 501 struct xe_vfio_pci_core_device *xe_vdev = 502 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 503 int ret; 504 505 mutex_init(&xe_vdev->state_mutex); 506 spin_lock_init(&xe_vdev->reset_lock); 507 508 ret = xe_vfio_pci_vf_init(xe_vdev); 509 if (ret) 510 return ret; 511 512 return vfio_pci_core_init_dev(core_vdev); 513 } 514 515 static void xe_vfio_pci_release_dev(struct vfio_device *core_vdev) 516 { 517 struct xe_vfio_pci_core_device *xe_vdev = 518 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 519 520 mutex_destroy(&xe_vdev->state_mutex); 521 vfio_pci_core_release_dev(core_vdev); 522 } 523 524 static const struct vfio_device_ops xe_vfio_pci_ops = { 525 .name = "xe-vfio-pci", 526 .init = xe_vfio_pci_init_dev, 527 .release = xe_vfio_pci_release_dev, 528 .open_device = xe_vfio_pci_open_device, 529 .close_device = xe_vfio_pci_close_device, 530 .ioctl = vfio_pci_core_ioctl, 531 .get_region_info_caps = vfio_pci_ioctl_get_region_info, 532 .device_feature = vfio_pci_core_ioctl_feature, 533 .read = vfio_pci_core_read, 534 .write = vfio_pci_core_write, 535 .mmap = vfio_pci_core_mmap, 536 .request = vfio_pci_core_request, 537 .match = vfio_pci_core_match, 538 .match_token_uuid = vfio_pci_core_match_token_uuid, 539 .bind_iommufd = vfio_iommufd_physical_bind, 540 .unbind_iommufd = vfio_iommufd_physical_unbind, 541 .attach_ioas = vfio_iommufd_physical_attach_ioas, 542 .detach_ioas = vfio_iommufd_physical_detach_ioas, 543 }; 544 545 static int xe_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) 546 { 547 struct xe_vfio_pci_core_device *xe_vdev; 548 int ret; 549 550 xe_vdev = vfio_alloc_device(xe_vfio_pci_core_device, core_device.vdev, &pdev->dev, 551 &xe_vfio_pci_ops); 552 if (IS_ERR(xe_vdev)) 553 return PTR_ERR(xe_vdev); 554 555 dev_set_drvdata(&pdev->dev, &xe_vdev->core_device); 556 557 ret = vfio_pci_core_register_device(&xe_vdev->core_device); 558 if (ret) { 559 vfio_put_device(&xe_vdev->core_device.vdev); 560 return ret; 561 } 562 563 return 0; 564 } 565 566 static void xe_vfio_pci_remove(struct pci_dev *pdev) 567 { 568 struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); 569 570 vfio_pci_core_unregister_device(&xe_vdev->core_device); 571 vfio_put_device(&xe_vdev->core_device.vdev); 572 } 573 574 #define INTEL_PCI_VFIO_DEVICE(_id) { \ 575 PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, (_id)) \ 576 } 577 578 static const struct pci_device_id xe_vfio_pci_table[] = { 579 INTEL_PTL_IDS(INTEL_PCI_VFIO_DEVICE), 580 INTEL_WCL_IDS(INTEL_PCI_VFIO_DEVICE), 581 INTEL_BMG_IDS(INTEL_PCI_VFIO_DEVICE), 582 {} 583 }; 584 MODULE_DEVICE_TABLE(pci, xe_vfio_pci_table); 585 586 static struct pci_driver xe_vfio_pci_driver = { 587 .name = "xe-vfio-pci", 588 .id_table = xe_vfio_pci_table, 589 .probe = xe_vfio_pci_probe, 590 .remove = xe_vfio_pci_remove, 591 .err_handler = &xe_vfio_pci_err_handlers, 592 .driver_managed_dma = true, 593 }; 594 module_pci_driver(xe_vfio_pci_driver); 595 596 MODULE_LICENSE("GPL"); 597 MODULE_AUTHOR("Michał Winiarski <michal.winiarski@intel.com>"); 598 MODULE_DESCRIPTION("VFIO PCI driver with migration support for Intel Graphics"); 599