xref: /linux/drivers/vfio/pci/xe/main.c (revision ff7e082ea40d70b7613e8db2cb11e3555ebcc546)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2025 Intel Corporation
4  */
5 
6 #include <linux/anon_inodes.h>
7 #include <linux/delay.h>
8 #include <linux/file.h>
9 #include <linux/module.h>
10 #include <linux/pci.h>
11 #include <linux/sizes.h>
12 #include <linux/types.h>
13 #include <linux/vfio.h>
14 #include <linux/vfio_pci_core.h>
15 
16 #include <drm/intel/xe_sriov_vfio.h>
17 #include <drm/intel/pciids.h>
18 
/*
 * Per-transfer migration data file handed to userspace.
 *
 * One instance is allocated per anonymous-inode file and is freed from that
 * file's ->release() callback.
 */
struct xe_vfio_pci_migration_file {
	/* anonymous-inode file backing this migration stream */
	struct file *filp;
	/* serializes accesses to migration data */
	struct mutex lock;
	/* owning device; used to reach the PF handle and VF id for data I/O */
	struct xe_vfio_pci_core_device *xe_vdev;
	/* set (under @lock) once the file must no longer be used; read/write then fail */
	u8 disabled:1;
};
26 
/*
 * Driver-private state wrapped around the generic VFIO PCI core device.
 *
 * NOTE: probe stores &core_device as drvdata and the reset_done handler reads
 * drvdata back as a pointer to this structure, so core_device must remain the
 * first member.
 */
struct xe_vfio_pci_core_device {
	struct vfio_pci_core_device core_device;
	/* PF device handle; only assigned when migration support was set up */
	struct xe_device *xe;
	/*
	 * PF internal control uses vfid index starting from 1.
	 * The driver treats vfid == 0 as "migration was not set up".
	 */
	unsigned int vfid;
	/* a reset arrived while state_mutex was held; handled on unlock */
	u8 deferred_reset:1;
	/* protects migration state */
	struct mutex state_mutex;
	enum vfio_device_mig_state mig_state;
	/* protects the reset_done flow */
	spinlock_t reset_lock;
	/* currently active migration file, or NULL */
	struct xe_vfio_pci_migration_file *migf;
};
40 
/* Resolve the struct device of the PCI function behind @xe_vdev (used for logging). */
#define xe_vdev_to_dev(xe_vdev) (&(xe_vdev)->core_device.pdev->dev)
42 
43 static void xe_vfio_pci_disable_file(struct xe_vfio_pci_migration_file *migf)
44 {
45 	mutex_lock(&migf->lock);
46 	migf->disabled = true;
47 	mutex_unlock(&migf->lock);
48 }
49 
50 static void xe_vfio_pci_put_file(struct xe_vfio_pci_core_device *xe_vdev)
51 {
52 	xe_vfio_pci_disable_file(xe_vdev->migf);
53 	fput(xe_vdev->migf->filp);
54 	xe_vdev->migf = NULL;
55 }
56 
57 static void xe_vfio_pci_reset(struct xe_vfio_pci_core_device *xe_vdev)
58 {
59 	if (xe_vdev->migf)
60 		xe_vfio_pci_put_file(xe_vdev);
61 
62 	xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
63 }
64 
65 static void xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device *xe_vdev)
66 {
67 	mutex_lock(&xe_vdev->state_mutex);
68 }
69 
/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset' if exists.
 *
 * The deferred_reset flag is examined under reset_lock. If it is set, the
 * flag is cleared, reset_lock is dropped and the reset is executed while
 * state_mutex is still held; the check is then repeated in case another
 * reset was flagged meanwhile. Only when no reset is pending is state_mutex
 * released, and that happens before reset_lock is dropped so that the
 * reset_done handler (which sets the flag and tries the mutex under
 * reset_lock) cannot slip in between the check and the unlock.
 */
static void xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device *xe_vdev)
{
again:
	spin_lock(&xe_vdev->reset_lock);
	if (xe_vdev->deferred_reset) {
		/* consume the pending request, then reset with state_mutex still held */
		xe_vdev->deferred_reset = false;
		spin_unlock(&xe_vdev->reset_lock);
		xe_vfio_pci_reset(xe_vdev);
		goto again;
	}
	/* nothing pending: release state_mutex while still holding reset_lock */
	mutex_unlock(&xe_vdev->state_mutex);
	spin_unlock(&xe_vdev->reset_lock);
}
87 
88 static void xe_vfio_pci_reset_done(struct pci_dev *pdev)
89 {
90 	struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev);
91 	int ret;
92 
93 	if (!pdev->is_virtfn)
94 		return;
95 
96 	/*
97 	 * VF FLR requires additional processing done by PF driver.
98 	 * The processing is done after FLR is already finished from PCIe
99 	 * perspective.
100 	 * In order to avoid a scenario where VF is used while PF processing
101 	 * is still in progress, additional synchronization point is needed.
102 	 */
103 	ret = xe_sriov_vfio_wait_flr_done(xe_vdev->xe, xe_vdev->vfid);
104 	if (ret)
105 		dev_err(&pdev->dev, "Failed to wait for FLR: %d\n", ret);
106 
107 	if (!xe_vdev->vfid)
108 		return;
109 
110 	/*
111 	 * As the higher VFIO layers are holding locks across reset and using
112 	 * those same locks with the mm_lock we need to prevent ABBA deadlock
113 	 * with the state_mutex and mm_lock.
114 	 * In case the state_mutex was taken already we defer the cleanup work
115 	 * to the unlock flow of the other running context.
116 	 */
117 	spin_lock(&xe_vdev->reset_lock);
118 	xe_vdev->deferred_reset = true;
119 	if (!mutex_trylock(&xe_vdev->state_mutex)) {
120 		spin_unlock(&xe_vdev->reset_lock);
121 		return;
122 	}
123 	spin_unlock(&xe_vdev->reset_lock);
124 	xe_vfio_pci_state_mutex_unlock(xe_vdev);
125 
126 	xe_vfio_pci_reset(xe_vdev);
127 }
128 
/*
 * PCI error handlers: driver-specific post-reset processing plus the VFIO
 * PCI core's AER error-detected handler.
 */
static const struct pci_error_handlers xe_vfio_pci_err_handlers = {
	.reset_done = xe_vfio_pci_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};
133 
134 static int xe_vfio_pci_open_device(struct vfio_device *core_vdev)
135 {
136 	struct xe_vfio_pci_core_device *xe_vdev =
137 		container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
138 	struct vfio_pci_core_device *vdev = &xe_vdev->core_device;
139 	int ret;
140 
141 	ret = vfio_pci_core_enable(vdev);
142 	if (ret)
143 		return ret;
144 
145 	xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
146 
147 	vfio_pci_core_finish_enable(vdev);
148 
149 	return 0;
150 }
151 
152 static void xe_vfio_pci_close_device(struct vfio_device *core_vdev)
153 {
154 	struct xe_vfio_pci_core_device *xe_vdev =
155 		container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
156 
157 	xe_vfio_pci_state_mutex_lock(xe_vdev);
158 	xe_vfio_pci_reset(xe_vdev);
159 	xe_vfio_pci_state_mutex_unlock(xe_vdev);
160 	vfio_pci_core_close_device(core_vdev);
161 }
162 
163 static int xe_vfio_pci_release_file(struct inode *inode, struct file *filp)
164 {
165 	struct xe_vfio_pci_migration_file *migf = filp->private_data;
166 
167 	mutex_destroy(&migf->lock);
168 	kfree(migf);
169 
170 	return 0;
171 }
172 
173 static ssize_t xe_vfio_pci_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos)
174 {
175 	struct xe_vfio_pci_migration_file *migf = filp->private_data;
176 	ssize_t ret;
177 
178 	if (pos)
179 		return -ESPIPE;
180 
181 	mutex_lock(&migf->lock);
182 	if (migf->disabled) {
183 		mutex_unlock(&migf->lock);
184 		return -ENODEV;
185 	}
186 
187 	ret = xe_sriov_vfio_data_read(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len);
188 	mutex_unlock(&migf->lock);
189 
190 	return ret;
191 }
192 
/* File operations of the read-only file used to save device migration data. */
static const struct file_operations xe_vfio_pci_save_fops = {
	.owner = THIS_MODULE,
	.read = xe_vfio_pci_save_read,
	.release = xe_vfio_pci_release_file,
	.llseek = noop_llseek,
};
199 
200 static ssize_t xe_vfio_pci_resume_write(struct file *filp, const char __user *buf,
201 					size_t len, loff_t *pos)
202 {
203 	struct xe_vfio_pci_migration_file *migf = filp->private_data;
204 	ssize_t ret;
205 
206 	if (pos)
207 		return -ESPIPE;
208 
209 	mutex_lock(&migf->lock);
210 	if (migf->disabled) {
211 		mutex_unlock(&migf->lock);
212 		return -ENODEV;
213 	}
214 
215 	ret = xe_sriov_vfio_data_write(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len);
216 	mutex_unlock(&migf->lock);
217 
218 	return ret;
219 }
220 
/* File operations of the write-only file used to load device migration data. */
static const struct file_operations xe_vfio_pci_resume_fops = {
	.owner = THIS_MODULE,
	.write = xe_vfio_pci_resume_write,
	.release = xe_vfio_pci_release_file,
	.llseek = noop_llseek,
};
227 
228 static const char *vfio_dev_state_str(u32 state)
229 {
230 	switch (state) {
231 	case VFIO_DEVICE_STATE_RUNNING: return "running";
232 	case VFIO_DEVICE_STATE_RUNNING_P2P: return "running_p2p";
233 	case VFIO_DEVICE_STATE_STOP_COPY: return "stopcopy";
234 	case VFIO_DEVICE_STATE_STOP: return "stop";
235 	case VFIO_DEVICE_STATE_RESUMING: return "resuming";
236 	case VFIO_DEVICE_STATE_ERROR: return "error";
237 	default: return "";
238 	}
239 }
240 
/* Direction of a migration file; selects its file operations and open flags. */
enum xe_vfio_pci_file_type {
	/* read-only file from which saved device state is read */
	XE_VFIO_FILE_SAVE = 0,
	/* write-only file into which device state is written */
	XE_VFIO_FILE_RESUME,
};
245 
246 static struct xe_vfio_pci_migration_file *
247 xe_vfio_pci_alloc_file(struct xe_vfio_pci_core_device *xe_vdev,
248 		       enum xe_vfio_pci_file_type type)
249 {
250 	struct xe_vfio_pci_migration_file *migf;
251 	const struct file_operations *fops;
252 	int flags;
253 
254 	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
255 	if (!migf)
256 		return ERR_PTR(-ENOMEM);
257 
258 	fops = type == XE_VFIO_FILE_SAVE ? &xe_vfio_pci_save_fops : &xe_vfio_pci_resume_fops;
259 	flags = type == XE_VFIO_FILE_SAVE ? O_RDONLY : O_WRONLY;
260 	migf->filp = anon_inode_getfile("xe_vfio_mig", fops, migf, flags);
261 	if (IS_ERR(migf->filp)) {
262 		kfree(migf);
263 		return ERR_CAST(migf->filp);
264 	}
265 
266 	mutex_init(&migf->lock);
267 	migf->xe_vdev = xe_vdev;
268 	xe_vdev->migf = migf;
269 
270 	stream_open(migf->filp->f_inode, migf->filp);
271 
272 	return migf;
273 }
274 
275 static struct file *
276 xe_vfio_set_state(struct xe_vfio_pci_core_device *xe_vdev, u32 new)
277 {
278 	u32 cur = xe_vdev->mig_state;
279 	int ret;
280 
281 	dev_dbg(xe_vdev_to_dev(xe_vdev),
282 		"state: %s->%s\n", vfio_dev_state_str(cur), vfio_dev_state_str(new));
283 
284 	/*
285 	 * "STOP" handling is reused for "RUNNING_P2P", as the device doesn't
286 	 * have the capability to selectively block outgoing p2p DMA transfers.
287 	 * While the device is allowing BAR accesses when the VF is stopped, it
288 	 * is not processing any new workload requests, effectively stopping
289 	 * any outgoing DMA transfers (not just p2p).
290 	 * Any VRAM / MMIO accesses occurring during "RUNNING_P2P" are kept and
291 	 * will be migrated to target VF during stop-copy.
292 	 */
293 	if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
294 		ret = xe_sriov_vfio_suspend_device(xe_vdev->xe, xe_vdev->vfid);
295 		if (ret)
296 			goto err;
297 
298 		return NULL;
299 	}
300 
301 	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) ||
302 	    (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P))
303 		return NULL;
304 
305 	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
306 		ret = xe_sriov_vfio_resume_device(xe_vdev->xe, xe_vdev->vfid);
307 		if (ret)
308 			goto err;
309 
310 		return NULL;
311 	}
312 
313 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
314 		struct xe_vfio_pci_migration_file *migf;
315 
316 		migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_SAVE);
317 		if (IS_ERR(migf)) {
318 			ret = PTR_ERR(migf);
319 			goto err;
320 		}
321 		get_file(migf->filp);
322 
323 		ret = xe_sriov_vfio_stop_copy_enter(xe_vdev->xe, xe_vdev->vfid);
324 		if (ret) {
325 			fput(migf->filp);
326 			goto err;
327 		}
328 
329 		return migf->filp;
330 	}
331 
332 	if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
333 		if (xe_vdev->migf)
334 			xe_vfio_pci_put_file(xe_vdev);
335 
336 		ret = xe_sriov_vfio_stop_copy_exit(xe_vdev->xe, xe_vdev->vfid);
337 		if (ret)
338 			goto err;
339 
340 		return NULL;
341 	}
342 
343 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
344 		struct xe_vfio_pci_migration_file *migf;
345 
346 		migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_RESUME);
347 		if (IS_ERR(migf)) {
348 			ret = PTR_ERR(migf);
349 			goto err;
350 		}
351 		get_file(migf->filp);
352 
353 		ret = xe_sriov_vfio_resume_data_enter(xe_vdev->xe, xe_vdev->vfid);
354 		if (ret) {
355 			fput(migf->filp);
356 			goto err;
357 		}
358 
359 		return migf->filp;
360 	}
361 
362 	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
363 		if (xe_vdev->migf)
364 			xe_vfio_pci_put_file(xe_vdev);
365 
366 		ret = xe_sriov_vfio_resume_data_exit(xe_vdev->xe, xe_vdev->vfid);
367 		if (ret)
368 			goto err;
369 
370 		return NULL;
371 	}
372 
373 	WARN(true, "Unknown state transition %d->%d", cur, new);
374 	return ERR_PTR(-EINVAL);
375 
376 err:
377 	dev_dbg(xe_vdev_to_dev(xe_vdev),
378 		"Failed to transition state: %s->%s err=%d\n",
379 		vfio_dev_state_str(cur), vfio_dev_state_str(new), ret);
380 	return ERR_PTR(ret);
381 }
382 
383 static struct file *
384 xe_vfio_pci_set_device_state(struct vfio_device *core_vdev,
385 			     enum vfio_device_mig_state new_state)
386 {
387 	struct xe_vfio_pci_core_device *xe_vdev =
388 		container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
389 	enum vfio_device_mig_state next_state;
390 	struct file *f = NULL;
391 	int ret;
392 
393 	xe_vfio_pci_state_mutex_lock(xe_vdev);
394 	while (new_state != xe_vdev->mig_state) {
395 		ret = vfio_mig_get_next_state(core_vdev, xe_vdev->mig_state,
396 					      new_state, &next_state);
397 		if (ret) {
398 			xe_sriov_vfio_error(xe_vdev->xe, xe_vdev->vfid);
399 			f = ERR_PTR(ret);
400 			break;
401 		}
402 		f = xe_vfio_set_state(xe_vdev, next_state);
403 		if (IS_ERR(f))
404 			break;
405 
406 		xe_vdev->mig_state = next_state;
407 
408 		/* Multiple state transitions with non-NULL file in the middle */
409 		if (f && new_state != xe_vdev->mig_state) {
410 			fput(f);
411 			f = ERR_PTR(-EINVAL);
412 			break;
413 		}
414 	}
415 	xe_vfio_pci_state_mutex_unlock(xe_vdev);
416 
417 	return f;
418 }
419 
420 static int xe_vfio_pci_get_device_state(struct vfio_device *core_vdev,
421 					enum vfio_device_mig_state *curr_state)
422 {
423 	struct xe_vfio_pci_core_device *xe_vdev =
424 		container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
425 
426 	xe_vfio_pci_state_mutex_lock(xe_vdev);
427 	*curr_state = xe_vdev->mig_state;
428 	xe_vfio_pci_state_mutex_unlock(xe_vdev);
429 
430 	return 0;
431 }
432 
433 static int xe_vfio_pci_get_data_size(struct vfio_device *vdev,
434 				     unsigned long *stop_copy_length)
435 {
436 	struct xe_vfio_pci_core_device *xe_vdev =
437 		container_of(vdev, struct xe_vfio_pci_core_device, core_device.vdev);
438 
439 	xe_vfio_pci_state_mutex_lock(xe_vdev);
440 	*stop_copy_length = xe_sriov_vfio_stop_copy_size(xe_vdev->xe, xe_vdev->vfid);
441 	xe_vfio_pci_state_mutex_unlock(xe_vdev);
442 
443 	return 0;
444 }
445 
/* Migration callbacks installed on the VFIO device when migration is supported. */
static const struct vfio_migration_ops xe_vfio_pci_migration_ops = {
	.migration_set_state = xe_vfio_pci_set_device_state,
	.migration_get_state = xe_vfio_pci_get_device_state,
	.migration_get_data_size = xe_vfio_pci_get_data_size,
};
451 
452 static void xe_vfio_pci_migration_init(struct xe_vfio_pci_core_device *xe_vdev)
453 {
454 	struct vfio_device *core_vdev = &xe_vdev->core_device.vdev;
455 	struct pci_dev *pdev = to_pci_dev(core_vdev->dev);
456 	struct xe_device *xe = xe_sriov_vfio_get_pf(pdev);
457 
458 	if (!xe)
459 		return;
460 	if (!xe_sriov_vfio_migration_supported(xe))
461 		return;
462 
463 	mutex_init(&xe_vdev->state_mutex);
464 	spin_lock_init(&xe_vdev->reset_lock);
465 
466 	/* PF internal control uses vfid index starting from 1 */
467 	xe_vdev->vfid = pci_iov_vf_id(pdev) + 1;
468 	xe_vdev->xe = xe;
469 
470 	core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
471 	core_vdev->mig_ops = &xe_vfio_pci_migration_ops;
472 }
473 
474 static void xe_vfio_pci_migration_fini(struct xe_vfio_pci_core_device *xe_vdev)
475 {
476 	if (!xe_vdev->vfid)
477 		return;
478 
479 	mutex_destroy(&xe_vdev->state_mutex);
480 }
481 
482 static int xe_vfio_pci_init_dev(struct vfio_device *core_vdev)
483 {
484 	struct xe_vfio_pci_core_device *xe_vdev =
485 		container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
486 
487 	xe_vfio_pci_migration_init(xe_vdev);
488 
489 	return vfio_pci_core_init_dev(core_vdev);
490 }
491 
492 static void xe_vfio_pci_release_dev(struct vfio_device *core_vdev)
493 {
494 	struct xe_vfio_pci_core_device *xe_vdev =
495 		container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
496 
497 	xe_vfio_pci_migration_fini(xe_vdev);
498 }
499 
/*
 * VFIO device operations: init/release/open/close are driver-specific
 * wrappers, everything else is delegated to the VFIO PCI core and the
 * generic iommufd helpers.
 */
static const struct vfio_device_ops xe_vfio_pci_ops = {
	.name = "xe-vfio-pci",
	.init = xe_vfio_pci_init_dev,
	.release = xe_vfio_pci_release_dev,
	.open_device = xe_vfio_pci_open_device,
	.close_device = xe_vfio_pci_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.match_token_uuid = vfio_pci_core_match_token_uuid,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
519 
520 static int xe_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
521 {
522 	struct xe_vfio_pci_core_device *xe_vdev;
523 	int ret;
524 
525 	xe_vdev = vfio_alloc_device(xe_vfio_pci_core_device, core_device.vdev, &pdev->dev,
526 				    &xe_vfio_pci_ops);
527 	if (IS_ERR(xe_vdev))
528 		return PTR_ERR(xe_vdev);
529 
530 	dev_set_drvdata(&pdev->dev, &xe_vdev->core_device);
531 
532 	ret = vfio_pci_core_register_device(&xe_vdev->core_device);
533 	if (ret) {
534 		vfio_put_device(&xe_vdev->core_device.vdev);
535 		return ret;
536 	}
537 
538 	return 0;
539 }
540 
541 static void xe_vfio_pci_remove(struct pci_dev *pdev)
542 {
543 	struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev);
544 
545 	vfio_pci_core_unregister_device(&xe_vdev->core_device);
546 	vfio_put_device(&xe_vdev->core_device.vdev);
547 }
548 
/* Build a driver-override VFIO match entry for the Intel device id @_id. */
#define INTEL_PCI_VFIO_DEVICE(_id) { \
	PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, (_id)) \
}
552 
/* PCI ids handled by this driver, expanded from the shared Intel id lists. */
static const struct pci_device_id xe_vfio_pci_table[] = {
	INTEL_PTL_IDS(INTEL_PCI_VFIO_DEVICE),
	INTEL_WCL_IDS(INTEL_PCI_VFIO_DEVICE),
	INTEL_BMG_IDS(INTEL_PCI_VFIO_DEVICE),
	{}
};
MODULE_DEVICE_TABLE(pci, xe_vfio_pci_table);
560 
/* PCI driver glue; module init/exit are generated by module_pci_driver(). */
static struct pci_driver xe_vfio_pci_driver = {
	.name = "xe-vfio-pci",
	.id_table = xe_vfio_pci_table,
	.probe = xe_vfio_pci_probe,
	.remove = xe_vfio_pci_remove,
	.err_handler = &xe_vfio_pci_err_handlers,
	.driver_managed_dma = true,
};
module_pci_driver(xe_vfio_pci_driver);
570 
571 MODULE_LICENSE("GPL");
572 MODULE_AUTHOR("Michał Winiarski <michal.winiarski@intel.com>");
573 MODULE_DESCRIPTION("VFIO PCI driver with migration support for Intel Graphics");
574