1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2025 Intel Corporation
4 */
5
6 #include <linux/anon_inodes.h>
7 #include <linux/delay.h>
8 #include <linux/file.h>
9 #include <linux/module.h>
10 #include <linux/pci.h>
11 #include <linux/sizes.h>
12 #include <linux/types.h>
13 #include <linux/vfio.h>
14 #include <linux/vfio_pci_core.h>
15
16 #include <drm/intel/xe_sriov_vfio.h>
17 #include <drm/intel/pciids.h>
18
19 struct xe_vfio_pci_migration_file {
20 struct file *filp;
21 /* serializes accesses to migration data */
22 struct mutex lock;
23 struct xe_vfio_pci_core_device *xe_vdev;
24 u8 disabled:1;
25 };
26
27 struct xe_vfio_pci_core_device {
28 struct vfio_pci_core_device core_device;
29 struct xe_device *xe;
30 /* PF internal control uses vfid index starting from 1 */
31 unsigned int vfid;
32 u8 deferred_reset:1;
33 /* protects migration state */
34 struct mutex state_mutex;
35 enum vfio_device_mig_state mig_state;
36 /* protects the reset_done flow */
37 spinlock_t reset_lock;
38 struct xe_vfio_pci_migration_file *migf;
39 };
40
/* struct device of the PCI function behind @xe_vdev, for dev_*() logging */
#define xe_vdev_to_dev(xe_vdev) (&(xe_vdev)->core_device.pdev->dev)
42
xe_vfio_pci_disable_file(struct xe_vfio_pci_migration_file * migf)43 static void xe_vfio_pci_disable_file(struct xe_vfio_pci_migration_file *migf)
44 {
45 mutex_lock(&migf->lock);
46 migf->disabled = true;
47 mutex_unlock(&migf->lock);
48 }
49
xe_vfio_pci_put_file(struct xe_vfio_pci_core_device * xe_vdev)50 static void xe_vfio_pci_put_file(struct xe_vfio_pci_core_device *xe_vdev)
51 {
52 xe_vfio_pci_disable_file(xe_vdev->migf);
53 fput(xe_vdev->migf->filp);
54 xe_vdev->migf = NULL;
55 }
56
xe_vfio_pci_reset(struct xe_vfio_pci_core_device * xe_vdev)57 static void xe_vfio_pci_reset(struct xe_vfio_pci_core_device *xe_vdev)
58 {
59 if (xe_vdev->migf)
60 xe_vfio_pci_put_file(xe_vdev);
61
62 xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
63 }
64
xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device * xe_vdev)65 static void xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device *xe_vdev)
66 {
67 mutex_lock(&xe_vdev->state_mutex);
68 }
69
70 /*
71 * This function is called in all state_mutex unlock cases to
72 * handle a 'deferred_reset' if exists.
73 */
xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device * xe_vdev)74 static void xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device *xe_vdev)
75 {
76 again:
77 spin_lock(&xe_vdev->reset_lock);
78 if (xe_vdev->deferred_reset) {
79 xe_vdev->deferred_reset = false;
80 spin_unlock(&xe_vdev->reset_lock);
81 xe_vfio_pci_reset(xe_vdev);
82 goto again;
83 }
84 mutex_unlock(&xe_vdev->state_mutex);
85 spin_unlock(&xe_vdev->reset_lock);
86 }
87
xe_vfio_pci_reset_done(struct pci_dev * pdev)88 static void xe_vfio_pci_reset_done(struct pci_dev *pdev)
89 {
90 struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev);
91 int ret;
92
93 if (!pdev->is_virtfn)
94 return;
95
96 /*
97 * VF FLR requires additional processing done by PF driver.
98 * The processing is done after FLR is already finished from PCIe
99 * perspective.
100 * In order to avoid a scenario where VF is used while PF processing
101 * is still in progress, additional synchronization point is needed.
102 */
103 ret = xe_sriov_vfio_wait_flr_done(xe_vdev->xe, xe_vdev->vfid);
104 if (ret)
105 dev_err(&pdev->dev, "Failed to wait for FLR: %d\n", ret);
106
107 if (!xe_vdev->vfid)
108 return;
109
110 /*
111 * As the higher VFIO layers are holding locks across reset and using
112 * those same locks with the mm_lock we need to prevent ABBA deadlock
113 * with the state_mutex and mm_lock.
114 * In case the state_mutex was taken already we defer the cleanup work
115 * to the unlock flow of the other running context.
116 */
117 spin_lock(&xe_vdev->reset_lock);
118 xe_vdev->deferred_reset = true;
119 if (!mutex_trylock(&xe_vdev->state_mutex)) {
120 spin_unlock(&xe_vdev->reset_lock);
121 return;
122 }
123 spin_unlock(&xe_vdev->reset_lock);
124 xe_vfio_pci_state_mutex_unlock(xe_vdev);
125
126 xe_vfio_pci_reset(xe_vdev);
127 }
128
129 static const struct pci_error_handlers xe_vfio_pci_err_handlers = {
130 .reset_done = xe_vfio_pci_reset_done,
131 .error_detected = vfio_pci_core_aer_err_detected,
132 };
133
xe_vfio_pci_open_device(struct vfio_device * core_vdev)134 static int xe_vfio_pci_open_device(struct vfio_device *core_vdev)
135 {
136 struct xe_vfio_pci_core_device *xe_vdev =
137 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
138 struct vfio_pci_core_device *vdev = &xe_vdev->core_device;
139 int ret;
140
141 ret = vfio_pci_core_enable(vdev);
142 if (ret)
143 return ret;
144
145 xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
146
147 vfio_pci_core_finish_enable(vdev);
148
149 return 0;
150 }
151
xe_vfio_pci_close_device(struct vfio_device * core_vdev)152 static void xe_vfio_pci_close_device(struct vfio_device *core_vdev)
153 {
154 struct xe_vfio_pci_core_device *xe_vdev =
155 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
156
157 xe_vfio_pci_state_mutex_lock(xe_vdev);
158 xe_vfio_pci_reset(xe_vdev);
159 xe_vfio_pci_state_mutex_unlock(xe_vdev);
160 vfio_pci_core_close_device(core_vdev);
161 }
162
xe_vfio_pci_release_file(struct inode * inode,struct file * filp)163 static int xe_vfio_pci_release_file(struct inode *inode, struct file *filp)
164 {
165 struct xe_vfio_pci_migration_file *migf = filp->private_data;
166
167 mutex_destroy(&migf->lock);
168 kfree(migf);
169
170 return 0;
171 }
172
xe_vfio_pci_save_read(struct file * filp,char __user * buf,size_t len,loff_t * pos)173 static ssize_t xe_vfio_pci_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos)
174 {
175 struct xe_vfio_pci_migration_file *migf = filp->private_data;
176 ssize_t ret;
177
178 if (pos)
179 return -ESPIPE;
180
181 mutex_lock(&migf->lock);
182 if (migf->disabled) {
183 mutex_unlock(&migf->lock);
184 return -ENODEV;
185 }
186
187 ret = xe_sriov_vfio_data_read(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len);
188 mutex_unlock(&migf->lock);
189
190 return ret;
191 }
192
193 static const struct file_operations xe_vfio_pci_save_fops = {
194 .owner = THIS_MODULE,
195 .read = xe_vfio_pci_save_read,
196 .release = xe_vfio_pci_release_file,
197 .llseek = noop_llseek,
198 };
199
xe_vfio_pci_resume_write(struct file * filp,const char __user * buf,size_t len,loff_t * pos)200 static ssize_t xe_vfio_pci_resume_write(struct file *filp, const char __user *buf,
201 size_t len, loff_t *pos)
202 {
203 struct xe_vfio_pci_migration_file *migf = filp->private_data;
204 ssize_t ret;
205
206 if (pos)
207 return -ESPIPE;
208
209 mutex_lock(&migf->lock);
210 if (migf->disabled) {
211 mutex_unlock(&migf->lock);
212 return -ENODEV;
213 }
214
215 ret = xe_sriov_vfio_data_write(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len);
216 mutex_unlock(&migf->lock);
217
218 return ret;
219 }
220
221 static const struct file_operations xe_vfio_pci_resume_fops = {
222 .owner = THIS_MODULE,
223 .write = xe_vfio_pci_resume_write,
224 .release = xe_vfio_pci_release_file,
225 .llseek = noop_llseek,
226 };
227
/*
 * Human-readable name of a VFIO migration state, for debug messages.
 * States without a name here yield an empty string.
 */
static const char *vfio_dev_state_str(u32 state)
{
	const char *name = "";

	switch (state) {
	case VFIO_DEVICE_STATE_RUNNING:
		name = "running";
		break;
	case VFIO_DEVICE_STATE_RUNNING_P2P:
		name = "running_p2p";
		break;
	case VFIO_DEVICE_STATE_STOP_COPY:
		name = "stopcopy";
		break;
	case VFIO_DEVICE_STATE_STOP:
		name = "stop";
		break;
	case VFIO_DEVICE_STATE_RESUMING:
		name = "resuming";
		break;
	case VFIO_DEVICE_STATE_ERROR:
		name = "error";
		break;
	default:
		break;
	}

	return name;
}
240
/* Direction of a migration data file created by xe_vfio_pci_alloc_file() */
enum xe_vfio_pci_file_type {
	/* read-only file, backed by xe_vfio_pci_save_fops */
	XE_VFIO_FILE_SAVE = 0,
	/* write-only file, backed by xe_vfio_pci_resume_fops */
	XE_VFIO_FILE_RESUME,
};
245
246 static struct xe_vfio_pci_migration_file *
xe_vfio_pci_alloc_file(struct xe_vfio_pci_core_device * xe_vdev,enum xe_vfio_pci_file_type type)247 xe_vfio_pci_alloc_file(struct xe_vfio_pci_core_device *xe_vdev,
248 enum xe_vfio_pci_file_type type)
249 {
250 struct xe_vfio_pci_migration_file *migf;
251 const struct file_operations *fops;
252 int flags;
253 int ret;
254
255 migf = kzalloc_obj(*migf, GFP_KERNEL_ACCOUNT);
256 if (!migf)
257 return ERR_PTR(-ENOMEM);
258
259 fops = type == XE_VFIO_FILE_SAVE ? &xe_vfio_pci_save_fops : &xe_vfio_pci_resume_fops;
260 flags = type == XE_VFIO_FILE_SAVE ? O_RDONLY : O_WRONLY;
261 migf->filp = anon_inode_getfile("xe_vfio_mig", fops, migf, flags);
262 if (IS_ERR(migf->filp)) {
263 ret = PTR_ERR(migf->filp);
264 kfree(migf);
265 return ERR_PTR(ret);
266 }
267
268 mutex_init(&migf->lock);
269 migf->xe_vdev = xe_vdev;
270 xe_vdev->migf = migf;
271
272 stream_open(migf->filp->f_inode, migf->filp);
273
274 return migf;
275 }
276
277 static struct file *
xe_vfio_set_state(struct xe_vfio_pci_core_device * xe_vdev,u32 new)278 xe_vfio_set_state(struct xe_vfio_pci_core_device *xe_vdev, u32 new)
279 {
280 u32 cur = xe_vdev->mig_state;
281 int ret;
282
283 dev_dbg(xe_vdev_to_dev(xe_vdev),
284 "state: %s->%s\n", vfio_dev_state_str(cur), vfio_dev_state_str(new));
285
286 /*
287 * "STOP" handling is reused for "RUNNING_P2P", as the device doesn't
288 * have the capability to selectively block outgoing p2p DMA transfers.
289 * While the device is allowing BAR accesses when the VF is stopped, it
290 * is not processing any new workload requests, effectively stopping
291 * any outgoing DMA transfers (not just p2p).
292 * Any VRAM / MMIO accesses occurring during "RUNNING_P2P" are kept and
293 * will be migrated to target VF during stop-copy.
294 */
295 if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
296 ret = xe_sriov_vfio_suspend_device(xe_vdev->xe, xe_vdev->vfid);
297 if (ret)
298 goto err;
299
300 return NULL;
301 }
302
303 if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) ||
304 (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P))
305 return NULL;
306
307 if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
308 ret = xe_sriov_vfio_resume_device(xe_vdev->xe, xe_vdev->vfid);
309 if (ret)
310 goto err;
311
312 return NULL;
313 }
314
315 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
316 struct xe_vfio_pci_migration_file *migf;
317
318 migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_SAVE);
319 if (IS_ERR(migf)) {
320 ret = PTR_ERR(migf);
321 goto err;
322 }
323 get_file(migf->filp);
324
325 ret = xe_sriov_vfio_stop_copy_enter(xe_vdev->xe, xe_vdev->vfid);
326 if (ret) {
327 fput(migf->filp);
328 goto err;
329 }
330
331 return migf->filp;
332 }
333
334 if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
335 if (xe_vdev->migf)
336 xe_vfio_pci_put_file(xe_vdev);
337
338 ret = xe_sriov_vfio_stop_copy_exit(xe_vdev->xe, xe_vdev->vfid);
339 if (ret)
340 goto err;
341
342 return NULL;
343 }
344
345 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
346 struct xe_vfio_pci_migration_file *migf;
347
348 migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_RESUME);
349 if (IS_ERR(migf)) {
350 ret = PTR_ERR(migf);
351 goto err;
352 }
353 get_file(migf->filp);
354
355 ret = xe_sriov_vfio_resume_data_enter(xe_vdev->xe, xe_vdev->vfid);
356 if (ret) {
357 fput(migf->filp);
358 goto err;
359 }
360
361 return migf->filp;
362 }
363
364 if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
365 if (xe_vdev->migf)
366 xe_vfio_pci_put_file(xe_vdev);
367
368 ret = xe_sriov_vfio_resume_data_exit(xe_vdev->xe, xe_vdev->vfid);
369 if (ret)
370 goto err;
371
372 return NULL;
373 }
374
375 WARN(true, "Unknown state transition %d->%d", cur, new);
376 return ERR_PTR(-EINVAL);
377
378 err:
379 dev_dbg(xe_vdev_to_dev(xe_vdev),
380 "Failed to transition state: %s->%s err=%d\n",
381 vfio_dev_state_str(cur), vfio_dev_state_str(new), ret);
382 return ERR_PTR(ret);
383 }
384
385 static struct file *
xe_vfio_pci_set_device_state(struct vfio_device * core_vdev,enum vfio_device_mig_state new_state)386 xe_vfio_pci_set_device_state(struct vfio_device *core_vdev,
387 enum vfio_device_mig_state new_state)
388 {
389 struct xe_vfio_pci_core_device *xe_vdev =
390 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
391 enum vfio_device_mig_state next_state;
392 struct file *f = NULL;
393 int ret;
394
395 xe_vfio_pci_state_mutex_lock(xe_vdev);
396 while (new_state != xe_vdev->mig_state) {
397 ret = vfio_mig_get_next_state(core_vdev, xe_vdev->mig_state,
398 new_state, &next_state);
399 if (ret) {
400 xe_sriov_vfio_error(xe_vdev->xe, xe_vdev->vfid);
401 f = ERR_PTR(ret);
402 break;
403 }
404 f = xe_vfio_set_state(xe_vdev, next_state);
405 if (IS_ERR(f))
406 break;
407
408 xe_vdev->mig_state = next_state;
409
410 /* Multiple state transitions with non-NULL file in the middle */
411 if (f && new_state != xe_vdev->mig_state) {
412 fput(f);
413 f = ERR_PTR(-EINVAL);
414 break;
415 }
416 }
417 xe_vfio_pci_state_mutex_unlock(xe_vdev);
418
419 return f;
420 }
421
xe_vfio_pci_get_device_state(struct vfio_device * core_vdev,enum vfio_device_mig_state * curr_state)422 static int xe_vfio_pci_get_device_state(struct vfio_device *core_vdev,
423 enum vfio_device_mig_state *curr_state)
424 {
425 struct xe_vfio_pci_core_device *xe_vdev =
426 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
427
428 xe_vfio_pci_state_mutex_lock(xe_vdev);
429 *curr_state = xe_vdev->mig_state;
430 xe_vfio_pci_state_mutex_unlock(xe_vdev);
431
432 return 0;
433 }
434
xe_vfio_pci_get_data_size(struct vfio_device * vdev,unsigned long * stop_copy_length)435 static int xe_vfio_pci_get_data_size(struct vfio_device *vdev,
436 unsigned long *stop_copy_length)
437 {
438 struct xe_vfio_pci_core_device *xe_vdev =
439 container_of(vdev, struct xe_vfio_pci_core_device, core_device.vdev);
440
441 xe_vfio_pci_state_mutex_lock(xe_vdev);
442 *stop_copy_length = xe_sriov_vfio_stop_copy_size(xe_vdev->xe, xe_vdev->vfid);
443 xe_vfio_pci_state_mutex_unlock(xe_vdev);
444
445 return 0;
446 }
447
448 static const struct vfio_migration_ops xe_vfio_pci_migration_ops = {
449 .migration_set_state = xe_vfio_pci_set_device_state,
450 .migration_get_state = xe_vfio_pci_get_device_state,
451 .migration_get_data_size = xe_vfio_pci_get_data_size,
452 };
453
xe_vfio_pci_migration_init(struct xe_vfio_pci_core_device * xe_vdev)454 static void xe_vfio_pci_migration_init(struct xe_vfio_pci_core_device *xe_vdev)
455 {
456 struct vfio_device *core_vdev = &xe_vdev->core_device.vdev;
457 struct pci_dev *pdev = to_pci_dev(core_vdev->dev);
458 struct xe_device *xe = xe_sriov_vfio_get_pf(pdev);
459
460 if (!xe)
461 return;
462 if (!xe_sriov_vfio_migration_supported(xe))
463 return;
464
465 mutex_init(&xe_vdev->state_mutex);
466 spin_lock_init(&xe_vdev->reset_lock);
467
468 /* PF internal control uses vfid index starting from 1 */
469 xe_vdev->vfid = pci_iov_vf_id(pdev) + 1;
470 xe_vdev->xe = xe;
471
472 core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
473 core_vdev->mig_ops = &xe_vfio_pci_migration_ops;
474 }
475
xe_vfio_pci_migration_fini(struct xe_vfio_pci_core_device * xe_vdev)476 static void xe_vfio_pci_migration_fini(struct xe_vfio_pci_core_device *xe_vdev)
477 {
478 if (!xe_vdev->vfid)
479 return;
480
481 mutex_destroy(&xe_vdev->state_mutex);
482 }
483
xe_vfio_pci_init_dev(struct vfio_device * core_vdev)484 static int xe_vfio_pci_init_dev(struct vfio_device *core_vdev)
485 {
486 struct xe_vfio_pci_core_device *xe_vdev =
487 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
488
489 xe_vfio_pci_migration_init(xe_vdev);
490
491 return vfio_pci_core_init_dev(core_vdev);
492 }
493
xe_vfio_pci_release_dev(struct vfio_device * core_vdev)494 static void xe_vfio_pci_release_dev(struct vfio_device *core_vdev)
495 {
496 struct xe_vfio_pci_core_device *xe_vdev =
497 container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
498
499 xe_vfio_pci_migration_fini(xe_vdev);
500 }
501
502 static const struct vfio_device_ops xe_vfio_pci_ops = {
503 .name = "xe-vfio-pci",
504 .init = xe_vfio_pci_init_dev,
505 .release = xe_vfio_pci_release_dev,
506 .open_device = xe_vfio_pci_open_device,
507 .close_device = xe_vfio_pci_close_device,
508 .ioctl = vfio_pci_core_ioctl,
509 .get_region_info_caps = vfio_pci_ioctl_get_region_info,
510 .device_feature = vfio_pci_core_ioctl_feature,
511 .read = vfio_pci_core_read,
512 .write = vfio_pci_core_write,
513 .mmap = vfio_pci_core_mmap,
514 .request = vfio_pci_core_request,
515 .match = vfio_pci_core_match,
516 .match_token_uuid = vfio_pci_core_match_token_uuid,
517 .bind_iommufd = vfio_iommufd_physical_bind,
518 .unbind_iommufd = vfio_iommufd_physical_unbind,
519 .attach_ioas = vfio_iommufd_physical_attach_ioas,
520 .detach_ioas = vfio_iommufd_physical_detach_ioas,
521 };
522
xe_vfio_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)523 static int xe_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
524 {
525 struct xe_vfio_pci_core_device *xe_vdev;
526 int ret;
527
528 xe_vdev = vfio_alloc_device(xe_vfio_pci_core_device, core_device.vdev, &pdev->dev,
529 &xe_vfio_pci_ops);
530 if (IS_ERR(xe_vdev))
531 return PTR_ERR(xe_vdev);
532
533 dev_set_drvdata(&pdev->dev, &xe_vdev->core_device);
534
535 ret = vfio_pci_core_register_device(&xe_vdev->core_device);
536 if (ret) {
537 vfio_put_device(&xe_vdev->core_device.vdev);
538 return ret;
539 }
540
541 return 0;
542 }
543
xe_vfio_pci_remove(struct pci_dev * pdev)544 static void xe_vfio_pci_remove(struct pci_dev *pdev)
545 {
546 struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev);
547
548 vfio_pci_core_unregister_device(&xe_vdev->core_device);
549 vfio_put_device(&xe_vdev->core_device.vdev);
550 }
551
/* Table entry generator handed to the INTEL_*_IDS() lists: one VFIO override match per Intel device ID */
#define INTEL_PCI_VFIO_DEVICE(_id) { \
	PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, (_id)) \
}
555
556 static const struct pci_device_id xe_vfio_pci_table[] = {
557 INTEL_PTL_IDS(INTEL_PCI_VFIO_DEVICE),
558 INTEL_WCL_IDS(INTEL_PCI_VFIO_DEVICE),
559 INTEL_BMG_IDS(INTEL_PCI_VFIO_DEVICE),
560 {}
561 };
562 MODULE_DEVICE_TABLE(pci, xe_vfio_pci_table);
563
564 static struct pci_driver xe_vfio_pci_driver = {
565 .name = "xe-vfio-pci",
566 .id_table = xe_vfio_pci_table,
567 .probe = xe_vfio_pci_probe,
568 .remove = xe_vfio_pci_remove,
569 .err_handler = &xe_vfio_pci_err_handlers,
570 .driver_managed_dma = true,
571 };
572 module_pci_driver(xe_vfio_pci_driver);
573
574 MODULE_LICENSE("GPL");
575 MODULE_AUTHOR("Michał Winiarski <michal.winiarski@intel.com>");
576 MODULE_DESCRIPTION("VFIO PCI driver with migration support for Intel Graphics");
577