// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2025 Ant Group
 * Author: Tiwei Bie <tiwei.btw@antgroup.com>
 */

#define pr_fmt(fmt) "vfio-uml: " fmt

#include <linux/module.h>
#include <linux/logic_iomem.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/string.h>
#include <linux/unaligned.h>
#include <irq_kern.h>
#include <init.h>
#include <os.h>

#include "virt-pci.h"
#include "vfio_user.h"

#define to_vdev(_pdev) container_of(_pdev, struct uml_vfio_device, pdev)

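/*
 * Each passed-through device is represented by a uml_vfio_device, which
 * embeds the um_pci_device exposed to UML's virtual PCI subsystem and
 * the uml_vfio_user_device holding the host-side VFIO state. The
 * msix_* fields cache the location of the MSI-X capability and table,
 * msix_data shadows the data word of each table entry, and intr_ctx
 * holds one uml_vfio_intr_ctx per vector for eventfd handling.
 */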
struct uml_vfio_intr_ctx {
	struct uml_vfio_device *dev;
	int irq;
};

struct uml_vfio_device {
	const char *name;
	int group;

	struct um_pci_device pdev;
	struct uml_vfio_user_device udev;
	struct uml_vfio_intr_ctx *intr_ctx;

	int msix_cap;
	int msix_bar;
	int msix_offset;
	int msix_size;
	u32 *msix_data;

	struct list_head list;
};

struct uml_vfio_group {
	int id;
	int fd;
	int users;
	struct list_head list;
};

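/*
 * A single VFIO container is shared by all passed-through devices; its
 * fd is opened lazily on the first "device=" parameter. Group fds are
 * reference-counted in uml_vfio_groups, so devices in the same IOMMU
 * group reuse one group fd, and the IOMMU is only set up when the first
 * group is attached to the container.
 */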
static struct {
	int fd;
	int users;
} uml_vfio_container = { .fd = -1 };
static DEFINE_MUTEX(uml_vfio_container_mtx);

static LIST_HEAD(uml_vfio_groups);
static DEFINE_MUTEX(uml_vfio_groups_mtx);

static LIST_HEAD(uml_vfio_devices);

static int uml_vfio_set_container(int group_fd)
{
	int err;

	guard(mutex)(&uml_vfio_container_mtx);

	err = uml_vfio_user_set_container(uml_vfio_container.fd, group_fd);
	if (err)
		return err;

	uml_vfio_container.users++;
	if (uml_vfio_container.users > 1)
		return 0;

	err = uml_vfio_user_setup_iommu(uml_vfio_container.fd);
	if (err) {
		uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
		uml_vfio_container.users--;
	}
	return err;
}

static void uml_vfio_unset_container(int group_fd)
{
	guard(mutex)(&uml_vfio_container_mtx);

	uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
	uml_vfio_container.users--;
}

static int uml_vfio_open_group(int group_id)
{
	struct uml_vfio_group *group;
	int err;

	guard(mutex)(&uml_vfio_groups_mtx);

	list_for_each_entry(group, &uml_vfio_groups, list) {
		if (group->id == group_id) {
			group->users++;
			return group->fd;
		}
	}

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return -ENOMEM;

	group->fd = uml_vfio_user_open_group(group_id);
	if (group->fd < 0) {
		err = group->fd;
		goto free_group;
	}

	err = uml_vfio_set_container(group->fd);
	if (err)
		goto close_group;

	group->id = group_id;
	group->users = 1;

	list_add(&group->list, &uml_vfio_groups);

	return group->fd;

close_group:
	os_close_file(group->fd);
free_group:
	kfree(group);
	return err;
}

static int uml_vfio_release_group(int group_fd)
{
	struct uml_vfio_group *group;

	guard(mutex)(&uml_vfio_groups_mtx);

	list_for_each_entry(group, &uml_vfio_groups, list) {
		if (group->fd == group_fd) {
			group->users--;
			if (group->users == 0) {
				uml_vfio_unset_container(group_fd);
				os_close_file(group_fd);
				list_del(&group->list);
				kfree(group);
			}
			return 0;
		}
	}

	return -ENOENT;
}

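/*
 * Interrupt delivery: for each MSI-X vector, VFIO signals an eventfd
 * (udev.irqfd[index]) on the host side. The eventfd is wired into UML's
 * SIGIO machinery via um_request_irq()/add_sigio_fd(), and this handler
 * drains it and injects the guest IRQ that the driver programmed into
 * the corresponding MSI-X entry's data word (shadowed in msix_data).
 */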
static irqreturn_t uml_vfio_interrupt(int unused, void *opaque)
{
	struct uml_vfio_intr_ctx *ctx = opaque;
	struct uml_vfio_device *dev = ctx->dev;
	int index = ctx - dev->intr_ctx;
	int irqfd = dev->udev.irqfd[index];
	int irq = dev->msix_data[index];
	uint64_t v;
	int r;

	do {
		r = os_read_file(irqfd, &v, sizeof(v));
		if (r == sizeof(v))
			generic_handle_irq(irq);
	} while (r == sizeof(v) || r == -EINTR);
	WARN(r != -EAGAIN, "read returned %d\n", r);

	return IRQ_HANDLED;
}

static int uml_vfio_activate_irq(struct uml_vfio_device *dev, int index)
{
	struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];
	int err, irqfd;

	if (ctx->irq >= 0)
		return 0;

	irqfd = uml_vfio_user_activate_irq(&dev->udev, index);
	if (irqfd < 0)
		return irqfd;

	ctx->irq = um_request_irq(UM_IRQ_ALLOC, irqfd, IRQ_READ,
				  uml_vfio_interrupt, 0,
				  "vfio-uml", ctx);
	if (ctx->irq < 0) {
		err = ctx->irq;
		goto deactivate;
	}

	err = add_sigio_fd(irqfd);
	if (err)
		goto free_irq;

	return 0;

free_irq:
	um_free_irq(ctx->irq, ctx);
	ctx->irq = -1;
deactivate:
	uml_vfio_user_deactivate_irq(&dev->udev, index);
	return err;
}

static int uml_vfio_deactivate_irq(struct uml_vfio_device *dev, int index)
{
	struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];

	if (ctx->irq >= 0) {
		ignore_sigio_fd(dev->udev.irqfd[index]);
		um_free_irq(ctx->irq, ctx);
		uml_vfio_user_deactivate_irq(&dev->udev, index);
		ctx->irq = -1;
	}
	return 0;
}

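/*
 * MSI-X is handled by snooping guest writes before forwarding them to
 * the device: toggling PCI_MSIX_FLAGS_ENABLE in config space resyncs
 * the host-side irqfds, and writes to a table entry's data word in the
 * MSI-X BAR (de)activate the per-vector interrupt.
 */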
static int uml_vfio_update_msix_cap(struct uml_vfio_device *dev,
				    unsigned int offset, int size,
				    unsigned long val)
{
	/*
	 * Here, we handle only the operations we care about,
	 * ignoring the rest.
	 */
	if (size == 2 && offset == dev->msix_cap + PCI_MSIX_FLAGS) {
		switch (val & ~PCI_MSIX_FLAGS_QSIZE) {
		case PCI_MSIX_FLAGS_ENABLE:
		case 0:
			return uml_vfio_user_update_irqs(&dev->udev);
		}
	}
	return 0;
}

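/*
 * The guest's MSI-X table writes carry the UML virq in the entry's data
 * word (this is the value uml_vfio_interrupt() later feeds to
 * generic_handle_irq()): a 4-byte write of a non-zero value activates
 * the vector, zero deactivates it.
 */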
static int uml_vfio_update_msix_table(struct uml_vfio_device *dev,
				      unsigned int offset, int size,
				      unsigned long val)
{
	int index;

	/*
	 * Here, we handle only the operations we care about,
	 * ignoring the rest.
	 */
	offset -= dev->msix_offset + PCI_MSIX_ENTRY_DATA;

	if (size != 4 || offset % PCI_MSIX_ENTRY_SIZE != 0)
		return 0;

	index = offset / PCI_MSIX_ENTRY_SIZE;
	if (index >= dev->udev.irq_count)
		return -EINVAL;

	dev->msix_data[index] = val;

	return val ? uml_vfio_activate_irq(dev, index) :
		uml_vfio_deactivate_irq(dev, index);
}

static unsigned long __uml_vfio_cfgspace_read(struct uml_vfio_device *dev,
					      unsigned int offset, int size)
{
	u8 data[8];

	memset(data, 0xff, sizeof(data));

	if (uml_vfio_user_cfgspace_read(&dev->udev, offset, data, size))
		return ULONG_MAX;

	switch (size) {
	case 1:
		return data[0];
	case 2:
		return le16_to_cpup((void *)data);
	case 4:
		return le32_to_cpup((void *)data);
#ifdef CONFIG_64BIT
	case 8:
		return le64_to_cpup((void *)data);
#endif
	default:
		return ULONG_MAX;
	}
}

static unsigned long uml_vfio_cfgspace_read(struct um_pci_device *pdev,
					    unsigned int offset, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	return __uml_vfio_cfgspace_read(dev, offset, size);
}

static void __uml_vfio_cfgspace_write(struct uml_vfio_device *dev,
				      unsigned int offset, int size,
				      unsigned long val)
{
	u8 data[8];

	switch (size) {
	case 1:
		data[0] = (u8)val;
		break;
	case 2:
		put_unaligned_le16(val, (void *)data);
		break;
	case 4:
		put_unaligned_le32(val, (void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		put_unaligned_le64(val, (void *)data);
		break;
#endif
	}

	WARN_ON(uml_vfio_user_cfgspace_write(&dev->udev, offset, data, size));
}

static void uml_vfio_cfgspace_write(struct um_pci_device *pdev,
				    unsigned int offset, int size,
				    unsigned long val)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	if (offset < dev->msix_cap + PCI_CAP_MSIX_SIZEOF &&
	    offset + size > dev->msix_cap)
		WARN_ON(uml_vfio_update_msix_cap(dev, offset, size, val));

	__uml_vfio_cfgspace_write(dev, offset, size, val);
}

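/*
 * BAR accesses are forwarded to the VFIO region read/write interface.
 * Reads default to all-ones if the host access fails; writes that fall
 * inside the MSI-X table are additionally snooped by
 * uml_vfio_update_msix_table() before being forwarded.
 */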
static void uml_vfio_bar_copy_from(struct um_pci_device *pdev, int bar,
				   void *buffer, unsigned int offset, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	memset(buffer, 0xff, size);
	uml_vfio_user_bar_read(&dev->udev, bar, offset, buffer, size);
}

static unsigned long uml_vfio_bar_read(struct um_pci_device *pdev, int bar,
				       unsigned int offset, int size)
{
	u8 data[8];

	uml_vfio_bar_copy_from(pdev, bar, data, offset, size);

	switch (size) {
	case 1:
		return data[0];
	case 2:
		return le16_to_cpup((void *)data);
	case 4:
		return le32_to_cpup((void *)data);
#ifdef CONFIG_64BIT
	case 8:
		return le64_to_cpup((void *)data);
#endif
	default:
		return ULONG_MAX;
	}
}

static void uml_vfio_bar_copy_to(struct um_pci_device *pdev, int bar,
				 unsigned int offset, const void *buffer,
				 int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	uml_vfio_user_bar_write(&dev->udev, bar, offset, buffer, size);
}

static void uml_vfio_bar_write(struct um_pci_device *pdev, int bar,
			       unsigned int offset, int size,
			       unsigned long val)
{
	struct uml_vfio_device *dev = to_vdev(pdev);
	u8 data[8];

	if (bar == dev->msix_bar && offset + size > dev->msix_offset &&
	    offset < dev->msix_offset + dev->msix_size)
		WARN_ON(uml_vfio_update_msix_table(dev, offset, size, val));

	switch (size) {
	case 1:
		data[0] = (u8)val;
		break;
	case 2:
		put_unaligned_le16(val, (void *)data);
		break;
	case 4:
		put_unaligned_le32(val, (void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		put_unaligned_le64(val, (void *)data);
		break;
#endif
	}

	uml_vfio_bar_copy_to(pdev, bar, offset, data, size);
}

static void uml_vfio_bar_set(struct um_pci_device *pdev, int bar,
			     unsigned int offset, u8 value, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);
	int i;

	for (i = 0; i < size; i++)
		uml_vfio_user_bar_write(&dev->udev, bar, offset + i, &value, 1);
}

static const struct um_pci_ops uml_vfio_um_pci_ops = {
	.cfgspace_read = uml_vfio_cfgspace_read,
	.cfgspace_write = uml_vfio_cfgspace_write,
	.bar_read = uml_vfio_bar_read,
	.bar_write = uml_vfio_bar_write,
	.bar_copy_from = uml_vfio_bar_copy_from,
	.bar_copy_to = uml_vfio_bar_copy_to,
	.bar_set = uml_vfio_bar_set,
};

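/*
 * Minimal open-coded version of pci_find_capability(): walk the
 * standard capability list in config space, bounded by a TTL so a
 * malformed list cannot loop forever. We cannot use the PCI core
 * helper here since no struct pci_dev exists yet at this point.
 */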
static u8 uml_vfio_find_capability(struct uml_vfio_device *dev, u8 cap)
{
	u8 id, pos;
	u16 ent;
	int ttl = 48; /* PCI_FIND_CAP_TTL */

	pos = __uml_vfio_cfgspace_read(dev, PCI_CAPABILITY_LIST, sizeof(pos));

	while (pos && ttl--) {
		ent = __uml_vfio_cfgspace_read(dev, pos, sizeof(ent));

		id = ent & 0xff;
		if (id == 0xff)
			break;
		if (id == cap)
			return pos;

		pos = ent >> 8;
	}

	return 0;
}

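/*
 * Locate the MSI-X capability and cache the table's BAR, offset and
 * size, then allocate the shadow for the table's data words. Devices
 * without MSI-X are rejected, matching the restriction documented in
 * the "vfio_uml.device=" help text below.
 */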
static int uml_vfio_read_msix_table(struct uml_vfio_device *dev)
{
	unsigned int off;
	u16 flags;
	u32 tbl;

	off = uml_vfio_find_capability(dev, PCI_CAP_ID_MSIX);
	if (!off)
		return -ENOTSUPP;

	dev->msix_cap = off;

	tbl = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_TABLE, sizeof(tbl));
	flags = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_FLAGS, sizeof(flags));

	dev->msix_bar = tbl & PCI_MSIX_TABLE_BIR;
	dev->msix_offset = tbl & PCI_MSIX_TABLE_OFFSET;
	dev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * PCI_MSIX_ENTRY_SIZE;

	dev->msix_data = kzalloc(dev->msix_size, GFP_KERNEL);
	if (!dev->msix_data)
		return -ENOMEM;

	return 0;
}

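/*
 * Bring-up sequence for one device: resolve its IOMMU group, attach the
 * group to the shared container, set up the VFIO device, parse its
 * MSI-X table, and finally register the virtual PCI device with UML.
 * On any failure the device is unlinked from uml_vfio_devices and
 * freed, which is why callers iterate with the _safe list variant.
 */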
static void uml_vfio_open_device(struct uml_vfio_device *dev)
{
	struct uml_vfio_intr_ctx *ctx;
	int err, group_id, i;

	group_id = uml_vfio_user_get_group_id(dev->name);
	if (group_id < 0) {
		pr_err("Failed to get group id (%s), error %d\n",
		       dev->name, group_id);
		goto free_dev;
	}

	dev->group = uml_vfio_open_group(group_id);
	if (dev->group < 0) {
		pr_err("Failed to open group %d (%s), error %d\n",
		       group_id, dev->name, dev->group);
		goto free_dev;
	}

	err = uml_vfio_user_setup_device(&dev->udev, dev->group, dev->name);
	if (err) {
		pr_err("Failed to setup device (%s), error %d\n",
		       dev->name, err);
		goto release_group;
	}

	err = uml_vfio_read_msix_table(dev);
	if (err) {
		pr_err("Failed to read MSI-X table (%s), error %d\n",
		       dev->name, err);
		goto teardown_udev;
	}

	dev->intr_ctx = kmalloc_array(dev->udev.irq_count,
				      sizeof(struct uml_vfio_intr_ctx),
				      GFP_KERNEL);
	if (!dev->intr_ctx) {
		pr_err("Failed to allocate interrupt context (%s)\n",
		       dev->name);
		goto free_msix;
	}

	for (i = 0; i < dev->udev.irq_count; i++) {
		ctx = &dev->intr_ctx[i];
		ctx->dev = dev;
		ctx->irq = -1;
	}

	dev->pdev.ops = &uml_vfio_um_pci_ops;

	err = um_pci_device_register(&dev->pdev);
	if (err) {
		pr_err("Failed to register UM PCI device (%s), error %d\n",
		       dev->name, err);
		goto free_intr_ctx;
	}

	return;

free_intr_ctx:
	kfree(dev->intr_ctx);
free_msix:
	kfree(dev->msix_data);
teardown_udev:
	uml_vfio_user_teardown_device(&dev->udev);
release_group:
	uml_vfio_release_group(dev->group);
free_dev:
	list_del(&dev->list);
	kfree(dev->name);
	kfree(dev);
}

static void uml_vfio_release_device(struct uml_vfio_device *dev)
{
	int i;

	for (i = 0; i < dev->udev.irq_count; i++)
		uml_vfio_deactivate_irq(dev, i);
	uml_vfio_user_update_irqs(&dev->udev);

	um_pci_device_unregister(&dev->pdev);
	kfree(dev->intr_ctx);
	kfree(dev->msix_data);
	uml_vfio_user_teardown_device(&dev->udev);
	uml_vfio_release_group(dev->group);
	list_del(&dev->list);
	kfree(dev->name);
	kfree(dev);
}

static struct uml_vfio_device *uml_vfio_find_device(const char *device)
{
	struct uml_vfio_device *dev;

	list_for_each_entry(dev, &uml_vfio_devices, list) {
		if (!strcmp(dev->name, device))
			return dev;
	}
	return NULL;
}

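/*
 * "vfio_uml.device=" handler. Kernel parameters are parsed before the
 * initcalls run, so devices are only queued on uml_vfio_devices here
 * (opening the container fd on first use); the actual VFIO setup
 * happens later from uml_vfio_init().
 */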
static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp)
{
	struct uml_vfio_device *dev;
	int fd;

	if (uml_vfio_container.fd < 0) {
		fd = uml_vfio_user_open_container();
		if (fd < 0)
			return fd;
		uml_vfio_container.fd = fd;
	}

	if (uml_vfio_find_device(device))
		return -EEXIST;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;

	dev->name = kstrdup(device, GFP_KERNEL);
	if (!dev->name) {
		kfree(dev);
		return -ENOMEM;
	}

	list_add_tail(&dev->list, &uml_vfio_devices);
	return 0;
}

static int uml_vfio_cmdline_get(char *buffer, const struct kernel_param *kp)
{
	return 0;
}

static const struct kernel_param_ops uml_vfio_cmdline_param_ops = {
	.set = uml_vfio_cmdline_set,
	.get = uml_vfio_cmdline_get,
};

device_param_cb(device, &uml_vfio_cmdline_param_ops, NULL, 0400);
__uml_help(uml_vfio_cmdline_param_ops,
"vfio_uml.device=<domain:bus:slot.function>\n"
"    Pass through a PCI device to UML via VFIO. Currently, only MSI-X\n"
"    capable devices are supported, and it is assumed that drivers will\n"
"    use MSI-X. This parameter can be specified multiple times to pass\n"
"    through multiple PCI devices to UML.\n\n"
);

static int __init uml_vfio_init(void)
{
	struct uml_vfio_device *dev, *n;

	sigio_broken();

	/* If the open fails, the device is unlinked from the list and freed. */
	list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
		uml_vfio_open_device(dev);

	return 0;
}
late_initcall(uml_vfio_init);

static void __exit uml_vfio_exit(void)
{
	struct uml_vfio_device *dev, *n;

	list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
		uml_vfio_release_device(dev);

	if (uml_vfio_container.fd >= 0)
		os_close_file(uml_vfio_container.fd);
}
module_exit(uml_vfio_exit);