1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2025 Intel Corporation 4 */ 5 6 #include <linux/anon_inodes.h> 7 #include <linux/delay.h> 8 #include <linux/file.h> 9 #include <linux/module.h> 10 #include <linux/pci.h> 11 #include <linux/sizes.h> 12 #include <linux/types.h> 13 #include <linux/vfio.h> 14 #include <linux/vfio_pci_core.h> 15 16 #include <drm/intel/xe_sriov_vfio.h> 17 #include <drm/intel/pciids.h> 18 19 struct xe_vfio_pci_migration_file { 20 struct file *filp; 21 /* serializes accesses to migration data */ 22 struct mutex lock; 23 struct xe_vfio_pci_core_device *xe_vdev; 24 u8 disabled:1; 25 }; 26 27 struct xe_vfio_pci_core_device { 28 struct vfio_pci_core_device core_device; 29 struct xe_device *xe; 30 /* PF internal control uses vfid index starting from 1 */ 31 unsigned int vfid; 32 u8 deferred_reset:1; 33 /* protects migration state */ 34 struct mutex state_mutex; 35 enum vfio_device_mig_state mig_state; 36 /* protects the reset_done flow */ 37 spinlock_t reset_lock; 38 struct xe_vfio_pci_migration_file *migf; 39 }; 40 41 #define xe_vdev_to_dev(xe_vdev) (&(xe_vdev)->core_device.pdev->dev) 42 43 static void xe_vfio_pci_disable_file(struct xe_vfio_pci_migration_file *migf) 44 { 45 mutex_lock(&migf->lock); 46 migf->disabled = true; 47 mutex_unlock(&migf->lock); 48 } 49 50 static void xe_vfio_pci_put_file(struct xe_vfio_pci_core_device *xe_vdev) 51 { 52 xe_vfio_pci_disable_file(xe_vdev->migf); 53 fput(xe_vdev->migf->filp); 54 xe_vdev->migf = NULL; 55 } 56 57 static void xe_vfio_pci_reset(struct xe_vfio_pci_core_device *xe_vdev) 58 { 59 if (xe_vdev->migf) 60 xe_vfio_pci_put_file(xe_vdev); 61 62 xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 63 } 64 65 static void xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device *xe_vdev) 66 { 67 mutex_lock(&xe_vdev->state_mutex); 68 } 69 70 /* 71 * This function is called in all state_mutex unlock cases to 72 * handle a 'deferred_reset' if exists. 73 */ 74 static void xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device *xe_vdev) 75 { 76 again: 77 spin_lock(&xe_vdev->reset_lock); 78 if (xe_vdev->deferred_reset) { 79 xe_vdev->deferred_reset = false; 80 spin_unlock(&xe_vdev->reset_lock); 81 xe_vfio_pci_reset(xe_vdev); 82 goto again; 83 } 84 mutex_unlock(&xe_vdev->state_mutex); 85 spin_unlock(&xe_vdev->reset_lock); 86 } 87 88 static void xe_vfio_pci_reset_prepare(struct pci_dev *pdev) 89 { 90 struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); 91 int ret; 92 93 if (!pdev->is_virtfn) 94 return; 95 96 ret = xe_sriov_vfio_flr_prepare(xe_vdev->xe, xe_vdev->vfid); 97 if (ret) 98 dev_err(&pdev->dev, "Failed to prepare FLR: %d\n", ret); 99 } 100 101 static void xe_vfio_pci_reset_done(struct pci_dev *pdev) 102 { 103 struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); 104 int ret; 105 106 if (!pdev->is_virtfn) 107 return; 108 109 /* 110 * VF FLR requires additional processing done by PF driver. 111 * The processing is done after FLR is already finished from PCIe 112 * perspective. 113 * In order to avoid a scenario where VF is used while PF processing 114 * is still in progress, additional synchronization point is needed. 115 */ 116 ret = xe_sriov_vfio_wait_flr_done(xe_vdev->xe, xe_vdev->vfid); 117 if (ret) 118 dev_err(&pdev->dev, "Failed to wait for FLR: %d\n", ret); 119 120 if (!xe_vdev->vfid) 121 return; 122 123 /* 124 * As the higher VFIO layers are holding locks across reset and using 125 * those same locks with the mm_lock we need to prevent ABBA deadlock 126 * with the state_mutex and mm_lock. 127 * In case the state_mutex was taken already we defer the cleanup work 128 * to the unlock flow of the other running context. 129 */ 130 spin_lock(&xe_vdev->reset_lock); 131 xe_vdev->deferred_reset = true; 132 if (!mutex_trylock(&xe_vdev->state_mutex)) { 133 spin_unlock(&xe_vdev->reset_lock); 134 return; 135 } 136 spin_unlock(&xe_vdev->reset_lock); 137 xe_vfio_pci_state_mutex_unlock(xe_vdev); 138 139 xe_vfio_pci_reset(xe_vdev); 140 } 141 142 static const struct pci_error_handlers xe_vfio_pci_err_handlers = { 143 .reset_prepare = xe_vfio_pci_reset_prepare, 144 .reset_done = xe_vfio_pci_reset_done, 145 .error_detected = vfio_pci_core_aer_err_detected, 146 }; 147 148 static int xe_vfio_pci_open_device(struct vfio_device *core_vdev) 149 { 150 struct xe_vfio_pci_core_device *xe_vdev = 151 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 152 struct vfio_pci_core_device *vdev = &xe_vdev->core_device; 153 int ret; 154 155 ret = vfio_pci_core_enable(vdev); 156 if (ret) 157 return ret; 158 159 xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 160 161 vfio_pci_core_finish_enable(vdev); 162 163 return 0; 164 } 165 166 static void xe_vfio_pci_close_device(struct vfio_device *core_vdev) 167 { 168 struct xe_vfio_pci_core_device *xe_vdev = 169 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 170 171 xe_vfio_pci_state_mutex_lock(xe_vdev); 172 xe_vfio_pci_reset(xe_vdev); 173 xe_vfio_pci_state_mutex_unlock(xe_vdev); 174 vfio_pci_core_close_device(core_vdev); 175 } 176 177 static int xe_vfio_pci_release_file(struct inode *inode, struct file *filp) 178 { 179 struct xe_vfio_pci_migration_file *migf = filp->private_data; 180 181 mutex_destroy(&migf->lock); 182 kfree(migf); 183 184 return 0; 185 } 186 187 static ssize_t xe_vfio_pci_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) 188 { 189 struct xe_vfio_pci_migration_file *migf = filp->private_data; 190 ssize_t ret; 191 192 if (pos) 193 return -ESPIPE; 194 195 mutex_lock(&migf->lock); 196 if (migf->disabled) { 197 mutex_unlock(&migf->lock); 198 return -ENODEV; 199 } 200 201 ret = xe_sriov_vfio_data_read(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len); 202 mutex_unlock(&migf->lock); 203 204 return ret; 205 } 206 207 static const struct file_operations xe_vfio_pci_save_fops = { 208 .owner = THIS_MODULE, 209 .read = xe_vfio_pci_save_read, 210 .release = xe_vfio_pci_release_file, 211 .llseek = noop_llseek, 212 }; 213 214 static ssize_t xe_vfio_pci_resume_write(struct file *filp, const char __user *buf, 215 size_t len, loff_t *pos) 216 { 217 struct xe_vfio_pci_migration_file *migf = filp->private_data; 218 ssize_t ret; 219 220 if (pos) 221 return -ESPIPE; 222 223 mutex_lock(&migf->lock); 224 if (migf->disabled) { 225 mutex_unlock(&migf->lock); 226 return -ENODEV; 227 } 228 229 ret = xe_sriov_vfio_data_write(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len); 230 mutex_unlock(&migf->lock); 231 232 return ret; 233 } 234 235 static const struct file_operations xe_vfio_pci_resume_fops = { 236 .owner = THIS_MODULE, 237 .write = xe_vfio_pci_resume_write, 238 .release = xe_vfio_pci_release_file, 239 .llseek = noop_llseek, 240 }; 241 242 static const char *vfio_dev_state_str(u32 state) 243 { 244 switch (state) { 245 case VFIO_DEVICE_STATE_RUNNING: return "running"; 246 case VFIO_DEVICE_STATE_RUNNING_P2P: return "running_p2p"; 247 case VFIO_DEVICE_STATE_STOP_COPY: return "stopcopy"; 248 case VFIO_DEVICE_STATE_STOP: return "stop"; 249 case VFIO_DEVICE_STATE_RESUMING: return "resuming"; 250 case VFIO_DEVICE_STATE_ERROR: return "error"; 251 default: return ""; 252 } 253 } 254 255 enum xe_vfio_pci_file_type { 256 XE_VFIO_FILE_SAVE = 0, 257 XE_VFIO_FILE_RESUME, 258 }; 259 260 static struct xe_vfio_pci_migration_file * 261 xe_vfio_pci_alloc_file(struct xe_vfio_pci_core_device *xe_vdev, 262 enum xe_vfio_pci_file_type type) 263 { 264 struct xe_vfio_pci_migration_file *migf; 265 const struct file_operations *fops; 266 int flags; 267 int ret; 268 269 migf = kzalloc_obj(*migf, GFP_KERNEL_ACCOUNT); 270 if (!migf) 271 return ERR_PTR(-ENOMEM); 272 273 fops = type == XE_VFIO_FILE_SAVE ? &xe_vfio_pci_save_fops : &xe_vfio_pci_resume_fops; 274 flags = type == XE_VFIO_FILE_SAVE ? O_RDONLY : O_WRONLY; 275 migf->filp = anon_inode_getfile("xe_vfio_mig", fops, migf, flags); 276 if (IS_ERR(migf->filp)) { 277 ret = PTR_ERR(migf->filp); 278 kfree(migf); 279 return ERR_PTR(ret); 280 } 281 282 mutex_init(&migf->lock); 283 migf->xe_vdev = xe_vdev; 284 xe_vdev->migf = migf; 285 286 stream_open(migf->filp->f_inode, migf->filp); 287 288 return migf; 289 } 290 291 static struct file * 292 xe_vfio_set_state(struct xe_vfio_pci_core_device *xe_vdev, u32 new) 293 { 294 u32 cur = xe_vdev->mig_state; 295 int ret; 296 297 dev_dbg(xe_vdev_to_dev(xe_vdev), 298 "state: %s->%s\n", vfio_dev_state_str(cur), vfio_dev_state_str(new)); 299 300 /* 301 * "STOP" handling is reused for "RUNNING_P2P", as the device doesn't 302 * have the capability to selectively block outgoing p2p DMA transfers. 303 * While the device is allowing BAR accesses when the VF is stopped, it 304 * is not processing any new workload requests, effectively stopping 305 * any outgoing DMA transfers (not just p2p). 306 * Any VRAM / MMIO accesses occurring during "RUNNING_P2P" are kept and 307 * will be migrated to target VF during stop-copy. 308 */ 309 if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { 310 ret = xe_sriov_vfio_suspend_device(xe_vdev->xe, xe_vdev->vfid); 311 if (ret) 312 goto err; 313 314 return NULL; 315 } 316 317 if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) || 318 (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P)) 319 return NULL; 320 321 if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { 322 ret = xe_sriov_vfio_resume_device(xe_vdev->xe, xe_vdev->vfid); 323 if (ret) 324 goto err; 325 326 return NULL; 327 } 328 329 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { 330 struct xe_vfio_pci_migration_file *migf; 331 332 migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_SAVE); 333 if (IS_ERR(migf)) { 334 ret = PTR_ERR(migf); 335 goto err; 336 } 337 get_file(migf->filp); 338 339 ret = xe_sriov_vfio_stop_copy_enter(xe_vdev->xe, xe_vdev->vfid); 340 if (ret) { 341 fput(migf->filp); 342 goto err; 343 } 344 345 return migf->filp; 346 } 347 348 if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) { 349 if (xe_vdev->migf) 350 xe_vfio_pci_put_file(xe_vdev); 351 352 ret = xe_sriov_vfio_stop_copy_exit(xe_vdev->xe, xe_vdev->vfid); 353 if (ret) 354 goto err; 355 356 return NULL; 357 } 358 359 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { 360 struct xe_vfio_pci_migration_file *migf; 361 362 migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_RESUME); 363 if (IS_ERR(migf)) { 364 ret = PTR_ERR(migf); 365 goto err; 366 } 367 get_file(migf->filp); 368 369 ret = xe_sriov_vfio_resume_data_enter(xe_vdev->xe, xe_vdev->vfid); 370 if (ret) { 371 fput(migf->filp); 372 goto err; 373 } 374 375 return migf->filp; 376 } 377 378 if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { 379 if (xe_vdev->migf) 380 xe_vfio_pci_put_file(xe_vdev); 381 382 ret = xe_sriov_vfio_resume_data_exit(xe_vdev->xe, xe_vdev->vfid); 383 if (ret) 384 goto err; 385 386 return NULL; 387 } 388 389 WARN(true, "Unknown state transition %d->%d", cur, new); 390 return ERR_PTR(-EINVAL); 391 392 err: 393 dev_dbg(xe_vdev_to_dev(xe_vdev), 394 "Failed to transition state: %s->%s err=%d\n", 395 vfio_dev_state_str(cur), vfio_dev_state_str(new), ret); 396 return ERR_PTR(ret); 397 } 398 399 static struct file * 400 xe_vfio_pci_set_device_state(struct vfio_device *core_vdev, 401 enum vfio_device_mig_state new_state) 402 { 403 struct xe_vfio_pci_core_device *xe_vdev = 404 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 405 enum vfio_device_mig_state next_state; 406 struct file *f = NULL; 407 int ret; 408 409 xe_vfio_pci_state_mutex_lock(xe_vdev); 410 while (new_state != xe_vdev->mig_state) { 411 ret = vfio_mig_get_next_state(core_vdev, xe_vdev->mig_state, 412 new_state, &next_state); 413 if (ret) { 414 xe_sriov_vfio_error(xe_vdev->xe, xe_vdev->vfid); 415 f = ERR_PTR(ret); 416 break; 417 } 418 f = xe_vfio_set_state(xe_vdev, next_state); 419 if (IS_ERR(f)) 420 break; 421 422 xe_vdev->mig_state = next_state; 423 424 /* Multiple state transitions with non-NULL file in the middle */ 425 if (f && new_state != xe_vdev->mig_state) { 426 fput(f); 427 f = ERR_PTR(-EINVAL); 428 break; 429 } 430 } 431 xe_vfio_pci_state_mutex_unlock(xe_vdev); 432 433 return f; 434 } 435 436 static int xe_vfio_pci_get_device_state(struct vfio_device *core_vdev, 437 enum vfio_device_mig_state *curr_state) 438 { 439 struct xe_vfio_pci_core_device *xe_vdev = 440 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 441 442 xe_vfio_pci_state_mutex_lock(xe_vdev); 443 *curr_state = xe_vdev->mig_state; 444 xe_vfio_pci_state_mutex_unlock(xe_vdev); 445 446 return 0; 447 } 448 449 static int xe_vfio_pci_get_data_size(struct vfio_device *vdev, 450 unsigned long *stop_copy_length) 451 { 452 struct xe_vfio_pci_core_device *xe_vdev = 453 container_of(vdev, struct xe_vfio_pci_core_device, core_device.vdev); 454 455 xe_vfio_pci_state_mutex_lock(xe_vdev); 456 *stop_copy_length = xe_sriov_vfio_stop_copy_size(xe_vdev->xe, xe_vdev->vfid); 457 xe_vfio_pci_state_mutex_unlock(xe_vdev); 458 459 return 0; 460 } 461 462 static const struct vfio_migration_ops xe_vfio_pci_migration_ops = { 463 .migration_set_state = xe_vfio_pci_set_device_state, 464 .migration_get_state = xe_vfio_pci_get_device_state, 465 .migration_get_data_size = xe_vfio_pci_get_data_size, 466 }; 467 468 static void xe_vfio_pci_migration_init(struct xe_vfio_pci_core_device *xe_vdev) 469 { 470 struct vfio_device *core_vdev = &xe_vdev->core_device.vdev; 471 struct pci_dev *pdev = to_pci_dev(core_vdev->dev); 472 struct xe_device *xe = xe_sriov_vfio_get_pf(pdev); 473 474 if (!xe) 475 return; 476 if (!xe_sriov_vfio_migration_supported(xe)) 477 return; 478 479 mutex_init(&xe_vdev->state_mutex); 480 spin_lock_init(&xe_vdev->reset_lock); 481 482 /* PF internal control uses vfid index starting from 1 */ 483 xe_vdev->vfid = pci_iov_vf_id(pdev) + 1; 484 xe_vdev->xe = xe; 485 486 core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; 487 core_vdev->mig_ops = &xe_vfio_pci_migration_ops; 488 } 489 490 static void xe_vfio_pci_migration_fini(struct xe_vfio_pci_core_device *xe_vdev) 491 { 492 if (!xe_vdev->vfid) 493 return; 494 495 mutex_destroy(&xe_vdev->state_mutex); 496 } 497 498 static int xe_vfio_pci_init_dev(struct vfio_device *core_vdev) 499 { 500 struct xe_vfio_pci_core_device *xe_vdev = 501 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 502 503 xe_vfio_pci_migration_init(xe_vdev); 504 505 return vfio_pci_core_init_dev(core_vdev); 506 } 507 508 static void xe_vfio_pci_release_dev(struct vfio_device *core_vdev) 509 { 510 struct xe_vfio_pci_core_device *xe_vdev = 511 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 512 513 xe_vfio_pci_migration_fini(xe_vdev); 514 } 515 516 static const struct vfio_device_ops xe_vfio_pci_ops = { 517 .name = "xe-vfio-pci", 518 .init = xe_vfio_pci_init_dev, 519 .release = xe_vfio_pci_release_dev, 520 .open_device = xe_vfio_pci_open_device, 521 .close_device = xe_vfio_pci_close_device, 522 .ioctl = vfio_pci_core_ioctl, 523 .get_region_info_caps = vfio_pci_ioctl_get_region_info, 524 .device_feature = vfio_pci_core_ioctl_feature, 525 .read = vfio_pci_core_read, 526 .write = vfio_pci_core_write, 527 .mmap = vfio_pci_core_mmap, 528 .request = vfio_pci_core_request, 529 .match = vfio_pci_core_match, 530 .match_token_uuid = vfio_pci_core_match_token_uuid, 531 .bind_iommufd = vfio_iommufd_physical_bind, 532 .unbind_iommufd = vfio_iommufd_physical_unbind, 533 .attach_ioas = vfio_iommufd_physical_attach_ioas, 534 .detach_ioas = vfio_iommufd_physical_detach_ioas, 535 }; 536 537 static int xe_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) 538 { 539 struct xe_vfio_pci_core_device *xe_vdev; 540 int ret; 541 542 xe_vdev = vfio_alloc_device(xe_vfio_pci_core_device, core_device.vdev, &pdev->dev, 543 &xe_vfio_pci_ops); 544 if (IS_ERR(xe_vdev)) 545 return PTR_ERR(xe_vdev); 546 547 dev_set_drvdata(&pdev->dev, &xe_vdev->core_device); 548 549 ret = vfio_pci_core_register_device(&xe_vdev->core_device); 550 if (ret) { 551 vfio_put_device(&xe_vdev->core_device.vdev); 552 return ret; 553 } 554 555 return 0; 556 } 557 558 static void xe_vfio_pci_remove(struct pci_dev *pdev) 559 { 560 struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); 561 562 vfio_pci_core_unregister_device(&xe_vdev->core_device); 563 vfio_put_device(&xe_vdev->core_device.vdev); 564 } 565 566 #define INTEL_PCI_VFIO_DEVICE(_id) { \ 567 PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, (_id)) \ 568 } 569 570 static const struct pci_device_id xe_vfio_pci_table[] = { 571 INTEL_PTL_IDS(INTEL_PCI_VFIO_DEVICE), 572 INTEL_WCL_IDS(INTEL_PCI_VFIO_DEVICE), 573 INTEL_BMG_IDS(INTEL_PCI_VFIO_DEVICE), 574 {} 575 }; 576 MODULE_DEVICE_TABLE(pci, xe_vfio_pci_table); 577 578 static struct pci_driver xe_vfio_pci_driver = { 579 .name = "xe-vfio-pci", 580 .id_table = xe_vfio_pci_table, 581 .probe = xe_vfio_pci_probe, 582 .remove = xe_vfio_pci_remove, 583 .err_handler = &xe_vfio_pci_err_handlers, 584 .driver_managed_dma = true, 585 }; 586 module_pci_driver(xe_vfio_pci_driver); 587 588 MODULE_LICENSE("GPL"); 589 MODULE_AUTHOR("Michał Winiarski <michal.winiarski@intel.com>"); 590 MODULE_DESCRIPTION("VFIO PCI driver with migration support for Intel Graphics"); 591