1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2025 Intel Corporation 4 */ 5 6 #include <linux/anon_inodes.h> 7 #include <linux/delay.h> 8 #include <linux/file.h> 9 #include <linux/module.h> 10 #include <linux/pci.h> 11 #include <linux/sizes.h> 12 #include <linux/types.h> 13 #include <linux/vfio.h> 14 #include <linux/vfio_pci_core.h> 15 16 #include <drm/intel/xe_sriov_vfio.h> 17 #include <drm/intel/pciids.h> 18 19 struct xe_vfio_pci_migration_file { 20 struct file *filp; 21 /* serializes accesses to migration data */ 22 struct mutex lock; 23 struct xe_vfio_pci_core_device *xe_vdev; 24 u8 disabled:1; 25 }; 26 27 struct xe_vfio_pci_core_device { 28 struct vfio_pci_core_device core_device; 29 struct xe_device *xe; 30 /* PF internal control uses vfid index starting from 1 */ 31 unsigned int vfid; 32 u8 deferred_reset:1; 33 /* protects migration state */ 34 struct mutex state_mutex; 35 enum vfio_device_mig_state mig_state; 36 /* protects the reset_done flow */ 37 spinlock_t reset_lock; 38 struct xe_vfio_pci_migration_file *migf; 39 }; 40 41 #define xe_vdev_to_dev(xe_vdev) (&(xe_vdev)->core_device.pdev->dev) 42 43 static void xe_vfio_pci_disable_file(struct xe_vfio_pci_migration_file *migf) 44 { 45 mutex_lock(&migf->lock); 46 migf->disabled = true; 47 mutex_unlock(&migf->lock); 48 } 49 50 static void xe_vfio_pci_put_file(struct xe_vfio_pci_core_device *xe_vdev) 51 { 52 xe_vfio_pci_disable_file(xe_vdev->migf); 53 fput(xe_vdev->migf->filp); 54 xe_vdev->migf = NULL; 55 } 56 57 static void xe_vfio_pci_reset(struct xe_vfio_pci_core_device *xe_vdev) 58 { 59 if (xe_vdev->migf) 60 xe_vfio_pci_put_file(xe_vdev); 61 62 xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 63 } 64 65 static void xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device *xe_vdev) 66 { 67 mutex_lock(&xe_vdev->state_mutex); 68 } 69 70 /* 71 * This function is called in all state_mutex unlock cases to 72 * handle a 'deferred_reset' if exists. 73 */ 74 static void xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device *xe_vdev) 75 { 76 again: 77 spin_lock(&xe_vdev->reset_lock); 78 if (xe_vdev->deferred_reset) { 79 xe_vdev->deferred_reset = false; 80 spin_unlock(&xe_vdev->reset_lock); 81 xe_vfio_pci_reset(xe_vdev); 82 goto again; 83 } 84 mutex_unlock(&xe_vdev->state_mutex); 85 spin_unlock(&xe_vdev->reset_lock); 86 } 87 88 static void xe_vfio_pci_reset_done(struct pci_dev *pdev) 89 { 90 struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); 91 int ret; 92 93 if (!pdev->is_virtfn) 94 return; 95 96 /* 97 * VF FLR requires additional processing done by PF driver. 98 * The processing is done after FLR is already finished from PCIe 99 * perspective. 100 * In order to avoid a scenario where VF is used while PF processing 101 * is still in progress, additional synchronization point is needed. 102 */ 103 ret = xe_sriov_vfio_wait_flr_done(xe_vdev->xe, xe_vdev->vfid); 104 if (ret) 105 dev_err(&pdev->dev, "Failed to wait for FLR: %d\n", ret); 106 107 if (!xe_vdev->vfid) 108 return; 109 110 /* 111 * As the higher VFIO layers are holding locks across reset and using 112 * those same locks with the mm_lock we need to prevent ABBA deadlock 113 * with the state_mutex and mm_lock. 114 * In case the state_mutex was taken already we defer the cleanup work 115 * to the unlock flow of the other running context. 116 */ 117 spin_lock(&xe_vdev->reset_lock); 118 xe_vdev->deferred_reset = true; 119 if (!mutex_trylock(&xe_vdev->state_mutex)) { 120 spin_unlock(&xe_vdev->reset_lock); 121 return; 122 } 123 spin_unlock(&xe_vdev->reset_lock); 124 xe_vfio_pci_state_mutex_unlock(xe_vdev); 125 126 xe_vfio_pci_reset(xe_vdev); 127 } 128 129 static const struct pci_error_handlers xe_vfio_pci_err_handlers = { 130 .reset_done = xe_vfio_pci_reset_done, 131 .error_detected = vfio_pci_core_aer_err_detected, 132 }; 133 134 static int xe_vfio_pci_open_device(struct vfio_device *core_vdev) 135 { 136 struct xe_vfio_pci_core_device *xe_vdev = 137 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 138 struct vfio_pci_core_device *vdev = &xe_vdev->core_device; 139 int ret; 140 141 ret = vfio_pci_core_enable(vdev); 142 if (ret) 143 return ret; 144 145 xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 146 147 vfio_pci_core_finish_enable(vdev); 148 149 return 0; 150 } 151 152 static void xe_vfio_pci_close_device(struct vfio_device *core_vdev) 153 { 154 struct xe_vfio_pci_core_device *xe_vdev = 155 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 156 157 xe_vfio_pci_state_mutex_lock(xe_vdev); 158 xe_vfio_pci_reset(xe_vdev); 159 xe_vfio_pci_state_mutex_unlock(xe_vdev); 160 vfio_pci_core_close_device(core_vdev); 161 } 162 163 static int xe_vfio_pci_release_file(struct inode *inode, struct file *filp) 164 { 165 struct xe_vfio_pci_migration_file *migf = filp->private_data; 166 167 mutex_destroy(&migf->lock); 168 kfree(migf); 169 170 return 0; 171 } 172 173 static ssize_t xe_vfio_pci_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) 174 { 175 struct xe_vfio_pci_migration_file *migf = filp->private_data; 176 ssize_t ret; 177 178 if (pos) 179 return -ESPIPE; 180 181 mutex_lock(&migf->lock); 182 if (migf->disabled) { 183 mutex_unlock(&migf->lock); 184 return -ENODEV; 185 } 186 187 ret = xe_sriov_vfio_data_read(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len); 188 mutex_unlock(&migf->lock); 189 190 return ret; 191 } 192 193 static const struct file_operations xe_vfio_pci_save_fops = { 194 .owner = THIS_MODULE, 195 .read = xe_vfio_pci_save_read, 196 .release = xe_vfio_pci_release_file, 197 .llseek = noop_llseek, 198 }; 199 200 static ssize_t xe_vfio_pci_resume_write(struct file *filp, const char __user *buf, 201 size_t len, loff_t *pos) 202 { 203 struct xe_vfio_pci_migration_file *migf = filp->private_data; 204 ssize_t ret; 205 206 if (pos) 207 return -ESPIPE; 208 209 mutex_lock(&migf->lock); 210 if (migf->disabled) { 211 mutex_unlock(&migf->lock); 212 return -ENODEV; 213 } 214 215 ret = xe_sriov_vfio_data_write(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len); 216 mutex_unlock(&migf->lock); 217 218 return ret; 219 } 220 221 static const struct file_operations xe_vfio_pci_resume_fops = { 222 .owner = THIS_MODULE, 223 .write = xe_vfio_pci_resume_write, 224 .release = xe_vfio_pci_release_file, 225 .llseek = noop_llseek, 226 }; 227 228 static const char *vfio_dev_state_str(u32 state) 229 { 230 switch (state) { 231 case VFIO_DEVICE_STATE_RUNNING: return "running"; 232 case VFIO_DEVICE_STATE_RUNNING_P2P: return "running_p2p"; 233 case VFIO_DEVICE_STATE_STOP_COPY: return "stopcopy"; 234 case VFIO_DEVICE_STATE_STOP: return "stop"; 235 case VFIO_DEVICE_STATE_RESUMING: return "resuming"; 236 case VFIO_DEVICE_STATE_ERROR: return "error"; 237 default: return ""; 238 } 239 } 240 241 enum xe_vfio_pci_file_type { 242 XE_VFIO_FILE_SAVE = 0, 243 XE_VFIO_FILE_RESUME, 244 }; 245 246 static struct xe_vfio_pci_migration_file * 247 xe_vfio_pci_alloc_file(struct xe_vfio_pci_core_device *xe_vdev, 248 enum xe_vfio_pci_file_type type) 249 { 250 struct xe_vfio_pci_migration_file *migf; 251 const struct file_operations *fops; 252 int flags; 253 int ret; 254 255 migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); 256 if (!migf) 257 return ERR_PTR(-ENOMEM); 258 259 fops = type == XE_VFIO_FILE_SAVE ? &xe_vfio_pci_save_fops : &xe_vfio_pci_resume_fops; 260 flags = type == XE_VFIO_FILE_SAVE ? O_RDONLY : O_WRONLY; 261 migf->filp = anon_inode_getfile("xe_vfio_mig", fops, migf, flags); 262 if (IS_ERR(migf->filp)) { 263 ret = PTR_ERR(migf->filp); 264 kfree(migf); 265 return ERR_PTR(ret); 266 } 267 268 mutex_init(&migf->lock); 269 migf->xe_vdev = xe_vdev; 270 xe_vdev->migf = migf; 271 272 stream_open(migf->filp->f_inode, migf->filp); 273 274 return migf; 275 } 276 277 static struct file * 278 xe_vfio_set_state(struct xe_vfio_pci_core_device *xe_vdev, u32 new) 279 { 280 u32 cur = xe_vdev->mig_state; 281 int ret; 282 283 dev_dbg(xe_vdev_to_dev(xe_vdev), 284 "state: %s->%s\n", vfio_dev_state_str(cur), vfio_dev_state_str(new)); 285 286 /* 287 * "STOP" handling is reused for "RUNNING_P2P", as the device doesn't 288 * have the capability to selectively block outgoing p2p DMA transfers. 289 * While the device is allowing BAR accesses when the VF is stopped, it 290 * is not processing any new workload requests, effectively stopping 291 * any outgoing DMA transfers (not just p2p). 292 * Any VRAM / MMIO accesses occurring during "RUNNING_P2P" are kept and 293 * will be migrated to target VF during stop-copy. 294 */ 295 if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { 296 ret = xe_sriov_vfio_suspend_device(xe_vdev->xe, xe_vdev->vfid); 297 if (ret) 298 goto err; 299 300 return NULL; 301 } 302 303 if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) || 304 (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P)) 305 return NULL; 306 307 if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { 308 ret = xe_sriov_vfio_resume_device(xe_vdev->xe, xe_vdev->vfid); 309 if (ret) 310 goto err; 311 312 return NULL; 313 } 314 315 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { 316 struct xe_vfio_pci_migration_file *migf; 317 318 migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_SAVE); 319 if (IS_ERR(migf)) { 320 ret = PTR_ERR(migf); 321 goto err; 322 } 323 get_file(migf->filp); 324 325 ret = xe_sriov_vfio_stop_copy_enter(xe_vdev->xe, xe_vdev->vfid); 326 if (ret) { 327 fput(migf->filp); 328 goto err; 329 } 330 331 return migf->filp; 332 } 333 334 if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) { 335 if (xe_vdev->migf) 336 xe_vfio_pci_put_file(xe_vdev); 337 338 ret = xe_sriov_vfio_stop_copy_exit(xe_vdev->xe, xe_vdev->vfid); 339 if (ret) 340 goto err; 341 342 return NULL; 343 } 344 345 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { 346 struct xe_vfio_pci_migration_file *migf; 347 348 migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_RESUME); 349 if (IS_ERR(migf)) { 350 ret = PTR_ERR(migf); 351 goto err; 352 } 353 get_file(migf->filp); 354 355 ret = xe_sriov_vfio_resume_data_enter(xe_vdev->xe, xe_vdev->vfid); 356 if (ret) { 357 fput(migf->filp); 358 goto err; 359 } 360 361 return migf->filp; 362 } 363 364 if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { 365 if (xe_vdev->migf) 366 xe_vfio_pci_put_file(xe_vdev); 367 368 ret = xe_sriov_vfio_resume_data_exit(xe_vdev->xe, xe_vdev->vfid); 369 if (ret) 370 goto err; 371 372 return NULL; 373 } 374 375 WARN(true, "Unknown state transition %d->%d", cur, new); 376 return ERR_PTR(-EINVAL); 377 378 err: 379 dev_dbg(xe_vdev_to_dev(xe_vdev), 380 "Failed to transition state: %s->%s err=%d\n", 381 vfio_dev_state_str(cur), vfio_dev_state_str(new), ret); 382 return ERR_PTR(ret); 383 } 384 385 static struct file * 386 xe_vfio_pci_set_device_state(struct vfio_device *core_vdev, 387 enum vfio_device_mig_state new_state) 388 { 389 struct xe_vfio_pci_core_device *xe_vdev = 390 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 391 enum vfio_device_mig_state next_state; 392 struct file *f = NULL; 393 int ret; 394 395 xe_vfio_pci_state_mutex_lock(xe_vdev); 396 while (new_state != xe_vdev->mig_state) { 397 ret = vfio_mig_get_next_state(core_vdev, xe_vdev->mig_state, 398 new_state, &next_state); 399 if (ret) { 400 xe_sriov_vfio_error(xe_vdev->xe, xe_vdev->vfid); 401 f = ERR_PTR(ret); 402 break; 403 } 404 f = xe_vfio_set_state(xe_vdev, next_state); 405 if (IS_ERR(f)) 406 break; 407 408 xe_vdev->mig_state = next_state; 409 410 /* Multiple state transitions with non-NULL file in the middle */ 411 if (f && new_state != xe_vdev->mig_state) { 412 fput(f); 413 f = ERR_PTR(-EINVAL); 414 break; 415 } 416 } 417 xe_vfio_pci_state_mutex_unlock(xe_vdev); 418 419 return f; 420 } 421 422 static int xe_vfio_pci_get_device_state(struct vfio_device *core_vdev, 423 enum vfio_device_mig_state *curr_state) 424 { 425 struct xe_vfio_pci_core_device *xe_vdev = 426 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 427 428 xe_vfio_pci_state_mutex_lock(xe_vdev); 429 *curr_state = xe_vdev->mig_state; 430 xe_vfio_pci_state_mutex_unlock(xe_vdev); 431 432 return 0; 433 } 434 435 static int xe_vfio_pci_get_data_size(struct vfio_device *vdev, 436 unsigned long *stop_copy_length) 437 { 438 struct xe_vfio_pci_core_device *xe_vdev = 439 container_of(vdev, struct xe_vfio_pci_core_device, core_device.vdev); 440 441 xe_vfio_pci_state_mutex_lock(xe_vdev); 442 *stop_copy_length = xe_sriov_vfio_stop_copy_size(xe_vdev->xe, xe_vdev->vfid); 443 xe_vfio_pci_state_mutex_unlock(xe_vdev); 444 445 return 0; 446 } 447 448 static const struct vfio_migration_ops xe_vfio_pci_migration_ops = { 449 .migration_set_state = xe_vfio_pci_set_device_state, 450 .migration_get_state = xe_vfio_pci_get_device_state, 451 .migration_get_data_size = xe_vfio_pci_get_data_size, 452 }; 453 454 static void xe_vfio_pci_migration_init(struct xe_vfio_pci_core_device *xe_vdev) 455 { 456 struct vfio_device *core_vdev = &xe_vdev->core_device.vdev; 457 struct pci_dev *pdev = to_pci_dev(core_vdev->dev); 458 struct xe_device *xe = xe_sriov_vfio_get_pf(pdev); 459 460 if (!xe) 461 return; 462 if (!xe_sriov_vfio_migration_supported(xe)) 463 return; 464 465 mutex_init(&xe_vdev->state_mutex); 466 spin_lock_init(&xe_vdev->reset_lock); 467 468 /* PF internal control uses vfid index starting from 1 */ 469 xe_vdev->vfid = pci_iov_vf_id(pdev) + 1; 470 xe_vdev->xe = xe; 471 472 core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; 473 core_vdev->mig_ops = &xe_vfio_pci_migration_ops; 474 } 475 476 static void xe_vfio_pci_migration_fini(struct xe_vfio_pci_core_device *xe_vdev) 477 { 478 if (!xe_vdev->vfid) 479 return; 480 481 mutex_destroy(&xe_vdev->state_mutex); 482 } 483 484 static int xe_vfio_pci_init_dev(struct vfio_device *core_vdev) 485 { 486 struct xe_vfio_pci_core_device *xe_vdev = 487 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 488 489 xe_vfio_pci_migration_init(xe_vdev); 490 491 return vfio_pci_core_init_dev(core_vdev); 492 } 493 494 static void xe_vfio_pci_release_dev(struct vfio_device *core_vdev) 495 { 496 struct xe_vfio_pci_core_device *xe_vdev = 497 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 498 499 xe_vfio_pci_migration_fini(xe_vdev); 500 } 501 502 static const struct vfio_device_ops xe_vfio_pci_ops = { 503 .name = "xe-vfio-pci", 504 .init = xe_vfio_pci_init_dev, 505 .release = xe_vfio_pci_release_dev, 506 .open_device = xe_vfio_pci_open_device, 507 .close_device = xe_vfio_pci_close_device, 508 .ioctl = vfio_pci_core_ioctl, 509 .get_region_info_caps = vfio_pci_ioctl_get_region_info, 510 .device_feature = vfio_pci_core_ioctl_feature, 511 .read = vfio_pci_core_read, 512 .write = vfio_pci_core_write, 513 .mmap = vfio_pci_core_mmap, 514 .request = vfio_pci_core_request, 515 .match = vfio_pci_core_match, 516 .match_token_uuid = vfio_pci_core_match_token_uuid, 517 .bind_iommufd = vfio_iommufd_physical_bind, 518 .unbind_iommufd = vfio_iommufd_physical_unbind, 519 .attach_ioas = vfio_iommufd_physical_attach_ioas, 520 .detach_ioas = vfio_iommufd_physical_detach_ioas, 521 }; 522 523 static int xe_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) 524 { 525 struct xe_vfio_pci_core_device *xe_vdev; 526 int ret; 527 528 xe_vdev = vfio_alloc_device(xe_vfio_pci_core_device, core_device.vdev, &pdev->dev, 529 &xe_vfio_pci_ops); 530 if (IS_ERR(xe_vdev)) 531 return PTR_ERR(xe_vdev); 532 533 dev_set_drvdata(&pdev->dev, &xe_vdev->core_device); 534 535 ret = vfio_pci_core_register_device(&xe_vdev->core_device); 536 if (ret) { 537 vfio_put_device(&xe_vdev->core_device.vdev); 538 return ret; 539 } 540 541 return 0; 542 } 543 544 static void xe_vfio_pci_remove(struct pci_dev *pdev) 545 { 546 struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); 547 548 vfio_pci_core_unregister_device(&xe_vdev->core_device); 549 vfio_put_device(&xe_vdev->core_device.vdev); 550 } 551 552 #define INTEL_PCI_VFIO_DEVICE(_id) { \ 553 PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, (_id)) \ 554 } 555 556 static const struct pci_device_id xe_vfio_pci_table[] = { 557 INTEL_PTL_IDS(INTEL_PCI_VFIO_DEVICE), 558 INTEL_WCL_IDS(INTEL_PCI_VFIO_DEVICE), 559 INTEL_BMG_IDS(INTEL_PCI_VFIO_DEVICE), 560 {} 561 }; 562 MODULE_DEVICE_TABLE(pci, xe_vfio_pci_table); 563 564 static struct pci_driver xe_vfio_pci_driver = { 565 .name = "xe-vfio-pci", 566 .id_table = xe_vfio_pci_table, 567 .probe = xe_vfio_pci_probe, 568 .remove = xe_vfio_pci_remove, 569 .err_handler = &xe_vfio_pci_err_handlers, 570 .driver_managed_dma = true, 571 }; 572 module_pci_driver(xe_vfio_pci_driver); 573 574 MODULE_LICENSE("GPL"); 575 MODULE_AUTHOR("Michał Winiarski <michal.winiarski@intel.com>"); 576 MODULE_DESCRIPTION("VFIO PCI driver with migration support for Intel Graphics"); 577