// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2025 Ant Group
 * Author: Tiwei Bie <tiwei.btw@antgroup.com>
 */

#define pr_fmt(fmt) "vfio-uml: " fmt

#include <linux/module.h>
#include <linux/logic_iomem.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/string.h>
#include <linux/unaligned.h>
#include <irq_kern.h>
#include <init.h>
#include <os.h>

#include "mconsole_kern.h"
#include "virt-pci.h"
#include "vfio_user.h"

#define to_vdev(_pdev) container_of(_pdev, struct uml_vfio_device, pdev)

struct uml_vfio_intr_ctx {
	struct uml_vfio_device *dev;
	int irq;
};

struct uml_vfio_device {
	const char *name;
	int group;

	struct um_pci_device pdev;
	struct uml_vfio_user_device udev;
	struct uml_vfio_intr_ctx *intr_ctx;

	int msix_cap;
	int msix_bar;
	int msix_offset;
	int msix_size;
	u32 *msix_data;

	struct list_head list;
};

struct uml_vfio_group {
	int id;
	int fd;
	int users;
	struct list_head list;
};

static struct {
	int fd;
	int users;
} uml_vfio_container = { .fd = -1 };
static DEFINE_MUTEX(uml_vfio_container_mtx);

static LIST_HEAD(uml_vfio_groups);
static DEFINE_MUTEX(uml_vfio_groups_mtx);

static LIST_HEAD(uml_vfio_devices);
static DEFINE_MUTEX(uml_vfio_devices_mtx);

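/*
 * Attach a VFIO group to the shared container. The first attached group
 * also triggers IOMMU setup on the container; on failure the attach is
 * rolled back so the user count stays consistent.
 */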
static int uml_vfio_set_container(int group_fd)
{
	int err;

	guard(mutex)(&uml_vfio_container_mtx);

	err = uml_vfio_user_set_container(uml_vfio_container.fd, group_fd);
	if (err)
		return err;

	uml_vfio_container.users++;
	if (uml_vfio_container.users > 1)
		return 0;

	err = uml_vfio_user_setup_iommu(uml_vfio_container.fd);
	if (err) {
		uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
		uml_vfio_container.users--;
	}
	return err;
}

static void uml_vfio_unset_container(int group_fd)
{
	guard(mutex)(&uml_vfio_container_mtx);

	uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
	uml_vfio_container.users--;
}

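/*
 * Open the VFIO group with @group_id, or take another reference if it
 * is already open. A newly opened group is attached to the shared
 * container before it becomes visible on the group list.
 */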
static int uml_vfio_open_group(int group_id)
{
	struct uml_vfio_group *group;
	int err;

	guard(mutex)(&uml_vfio_groups_mtx);

	list_for_each_entry(group, &uml_vfio_groups, list) {
		if (group->id == group_id) {
			group->users++;
			return group->fd;
		}
	}

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return -ENOMEM;

	group->fd = uml_vfio_user_open_group(group_id);
	if (group->fd < 0) {
		err = group->fd;
		goto free_group;
	}

	err = uml_vfio_set_container(group->fd);
	if (err)
		goto close_group;

	group->id = group_id;
	group->users = 1;

	list_add(&group->list, &uml_vfio_groups);

	return group->fd;

close_group:
	os_close_file(group->fd);
free_group:
	kfree(group);
	return err;
}

static int uml_vfio_release_group(int group_fd)
{
	struct uml_vfio_group *group;

	guard(mutex)(&uml_vfio_groups_mtx);

	list_for_each_entry(group, &uml_vfio_groups, list) {
		if (group->fd == group_fd) {
			group->users--;
			if (group->users == 0) {
				uml_vfio_unset_container(group_fd);
				os_close_file(group_fd);
				list_del(&group->list);
				kfree(group);
			}
			return 0;
		}
	}

	return -ENOENT;
}

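/*
 * Interrupt handler for a VFIO irqfd. The eventfd is drained until the
 * read would block, dispatching the guest IRQ recorded in the shadow
 * MSI-X table for each event; anything other than a final -EAGAIN is
 * unexpected.
 */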
static irqreturn_t uml_vfio_interrupt(int unused, void *opaque)
{
	struct uml_vfio_intr_ctx *ctx = opaque;
	struct uml_vfio_device *dev = ctx->dev;
	int index = ctx - dev->intr_ctx;
	int irqfd = dev->udev.irqfd[index];
	int irq = dev->msix_data[index];
	uint64_t v;
	int r;

	do {
		r = os_read_file(irqfd, &v, sizeof(v));
		if (r == sizeof(v))
			generic_handle_irq(irq);
	} while (r == sizeof(v) || r == -EINTR);
	WARN(r != -EAGAIN, "read returned %d\n", r);

	return IRQ_HANDLED;
}

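/*
 * Wire up MSI-X vector @index: obtain an irqfd from VFIO, hook it up to
 * a UML IRQ and add it to the SIGIO watch set. Deactivation undoes the
 * same steps in reverse.
 */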
static int uml_vfio_activate_irq(struct uml_vfio_device *dev, int index)
{
	struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];
	int err, irqfd;

	if (ctx->irq >= 0)
		return 0;

	irqfd = uml_vfio_user_activate_irq(&dev->udev, index);
	if (irqfd < 0)
		return irqfd;

	ctx->irq = um_request_irq(UM_IRQ_ALLOC, irqfd, IRQ_READ,
				  uml_vfio_interrupt, 0,
				  "vfio-uml", ctx);
	if (ctx->irq < 0) {
		err = ctx->irq;
		goto deactivate;
	}

	err = add_sigio_fd(irqfd);
	if (err)
		goto free_irq;

	return 0;

free_irq:
	um_free_irq(ctx->irq, ctx);
	ctx->irq = -1;
deactivate:
	uml_vfio_user_deactivate_irq(&dev->udev, index);
	return err;
}

static int uml_vfio_deactivate_irq(struct uml_vfio_device *dev, int index)
{
	struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];

	if (ctx->irq >= 0) {
		ignore_sigio_fd(dev->udev.irqfd[index]);
		um_free_irq(ctx->irq, ctx);
		uml_vfio_user_deactivate_irq(&dev->udev, index);
		ctx->irq = -1;
	}
	return 0;
}

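/*
 * Trap writes to the MSI-X Message Control word. When the guest toggles
 * the MSI-X Enable bit (with the Function Mask bit clear), re-sync the
 * irqfds with the host.
 */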
static int uml_vfio_update_msix_cap(struct uml_vfio_device *dev,
				    unsigned int offset, int size,
				    unsigned long val)
{
	/*
	 * Here, we handle only the operations we care about,
	 * ignoring the rest.
	 */
	if (size == 2 && offset == dev->msix_cap + PCI_MSIX_FLAGS) {
		switch (val & ~PCI_MSIX_FLAGS_QSIZE) {
		case PCI_MSIX_FLAGS_ENABLE:
		case 0:
			return uml_vfio_user_update_irqs(&dev->udev);
		}
	}
	return 0;
}

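/*
 * Trap 4-byte writes to the Message Data field of an MSI-X table entry.
 * The value written is the guest IRQ number for that vector; remember
 * it in the shadow table and activate or deactivate the vector
 * accordingly.
 */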
static int uml_vfio_update_msix_table(struct uml_vfio_device *dev,
				      unsigned int offset, int size,
				      unsigned long val)
{
	int index;

	/*
	 * Here, we handle only the operations we care about,
	 * ignoring the rest.
	 */
	offset -= dev->msix_offset + PCI_MSIX_ENTRY_DATA;

	if (size != 4 || offset % PCI_MSIX_ENTRY_SIZE != 0)
		return 0;

	index = offset / PCI_MSIX_ENTRY_SIZE;
	if (index >= dev->udev.irq_count)
		return -EINVAL;

	dev->msix_data[index] = val;

	return val ? uml_vfio_activate_irq(dev, index) :
		uml_vfio_deactivate_irq(dev, index);
}

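/*
 * Config space accessors. PCI config space is little-endian, so values
 * are byte-swapped as needed on big-endian hosts; a failed read returns
 * all-ones, like a read from an absent device.
 */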
static unsigned long __uml_vfio_cfgspace_read(struct uml_vfio_device *dev,
					      unsigned int offset, int size)
{
	u8 data[8];

	memset(data, 0xff, sizeof(data));

	if (uml_vfio_user_cfgspace_read(&dev->udev, offset, data, size))
		return ULONG_MAX;

	switch (size) {
	case 1:
		return data[0];
	case 2:
		return le16_to_cpup((void *)data);
	case 4:
		return le32_to_cpup((void *)data);
#ifdef CONFIG_64BIT
	case 8:
		return le64_to_cpup((void *)data);
#endif
	default:
		return ULONG_MAX;
	}
}

static unsigned long uml_vfio_cfgspace_read(struct um_pci_device *pdev,
					    unsigned int offset, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	return __uml_vfio_cfgspace_read(dev, offset, size);
}

static void __uml_vfio_cfgspace_write(struct uml_vfio_device *dev,
				      unsigned int offset, int size,
				      unsigned long val)
{
	u8 data[8];

	switch (size) {
	case 1:
		data[0] = (u8)val;
		break;
	case 2:
		put_unaligned_le16(val, (void *)data);
		break;
	case 4:
		put_unaligned_le32(val, (void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		put_unaligned_le64(val, (void *)data);
		break;
#endif
	}

	WARN_ON(uml_vfio_user_cfgspace_write(&dev->udev, offset, data, size));
}

static void uml_vfio_cfgspace_write(struct um_pci_device *pdev,
				    unsigned int offset, int size,
				    unsigned long val)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	if (offset < dev->msix_cap + PCI_CAP_MSIX_SIZEOF &&
	    offset + size > dev->msix_cap)
		WARN_ON(uml_vfio_update_msix_cap(dev, offset, size, val));

	__uml_vfio_cfgspace_write(dev, offset, size, val);
}

static void uml_vfio_bar_copy_from(struct um_pci_device *pdev, int bar,
				   void *buffer, unsigned int offset, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	memset(buffer, 0xff, size);
	uml_vfio_user_bar_read(&dev->udev, bar, offset, buffer, size);
}

static unsigned long uml_vfio_bar_read(struct um_pci_device *pdev, int bar,
				       unsigned int offset, int size)
{
	u8 data[8];

	uml_vfio_bar_copy_from(pdev, bar, data, offset, size);

	switch (size) {
	case 1:
		return data[0];
	case 2:
		return le16_to_cpup((void *)data);
	case 4:
		return le32_to_cpup((void *)data);
#ifdef CONFIG_64BIT
	case 8:
		return le64_to_cpup((void *)data);
#endif
	default:
		return ULONG_MAX;
	}
}

static void uml_vfio_bar_copy_to(struct um_pci_device *pdev, int bar,
				 unsigned int offset, const void *buffer,
				 int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	uml_vfio_user_bar_write(&dev->udev, bar, offset, buffer, size);
}

static void uml_vfio_bar_write(struct um_pci_device *pdev, int bar,
			       unsigned int offset, int size,
			       unsigned long val)
{
	struct uml_vfio_device *dev = to_vdev(pdev);
	u8 data[8];

	if (bar == dev->msix_bar && offset + size > dev->msix_offset &&
	    offset < dev->msix_offset + dev->msix_size)
		WARN_ON(uml_vfio_update_msix_table(dev, offset, size, val));

	switch (size) {
	case 1:
		data[0] = (u8)val;
		break;
	case 2:
		put_unaligned_le16(val, (void *)data);
		break;
	case 4:
		put_unaligned_le32(val, (void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		put_unaligned_le64(val, (void *)data);
		break;
#endif
	}

	uml_vfio_bar_copy_to(pdev, bar, offset, data, size);
}

static void uml_vfio_bar_set(struct um_pci_device *pdev, int bar,
			     unsigned int offset, u8 value, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);
	int i;

	for (i = 0; i < size; i++)
		uml_vfio_user_bar_write(&dev->udev, bar, offset + i, &value, 1);
}

static const struct um_pci_ops uml_vfio_um_pci_ops = {
	.cfgspace_read = uml_vfio_cfgspace_read,
	.cfgspace_write = uml_vfio_cfgspace_write,
	.bar_read = uml_vfio_bar_read,
	.bar_write = uml_vfio_bar_write,
	.bar_copy_from = uml_vfio_bar_copy_from,
	.bar_copy_to = uml_vfio_bar_copy_to,
	.bar_set = uml_vfio_bar_set,
};

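/*
 * Minimal open-coded equivalent of pci_find_capability(): walk the
 * capability list in config space looking for @cap. The TTL bound
 * guards against malformed (looping) capability chains.
 */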
static u8 uml_vfio_find_capability(struct uml_vfio_device *dev, u8 cap)
{
	u8 id, pos;
	u16 ent;
	int ttl = 48; /* PCI_FIND_CAP_TTL */

	pos = __uml_vfio_cfgspace_read(dev, PCI_CAPABILITY_LIST, sizeof(pos));

	while (pos && ttl--) {
		ent = __uml_vfio_cfgspace_read(dev, pos, sizeof(ent));

		id = ent & 0xff;
		if (id == 0xff)
			break;
		if (id == cap)
			return pos;

		pos = ent >> 8;
	}

	return 0;
}

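/*
 * Locate the MSI-X capability and decode where the MSI-X table lives
 * (BAR, offset and size), then allocate a shadow copy of the Message
 * Data words so that table writes can be mapped back to guest IRQ
 * numbers.
 */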
static int uml_vfio_read_msix_table(struct uml_vfio_device *dev)
{
	unsigned int off;
	u16 flags;
	u32 tbl;

	off = uml_vfio_find_capability(dev, PCI_CAP_ID_MSIX);
	if (!off)
		return -ENOTSUPP;

	dev->msix_cap = off;

	tbl = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_TABLE, sizeof(tbl));
	flags = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_FLAGS, sizeof(flags));

	dev->msix_bar = tbl & PCI_MSIX_TABLE_BIR;
	dev->msix_offset = tbl & PCI_MSIX_TABLE_OFFSET;
	dev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * PCI_MSIX_ENTRY_SIZE;

	dev->msix_data = kzalloc(dev->msix_size, GFP_KERNEL);
	if (!dev->msix_data)
		return -ENOMEM;

	return 0;
}

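/*
 * Bring up a device: resolve its VFIO group, set up the user-space
 * device state, parse the MSI-X layout and register it as a UM PCI
 * device. On any failure the device is unlinked and freed, so callers
 * must not touch @dev afterwards.
 */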
static void uml_vfio_open_device(struct uml_vfio_device *dev)
{
	struct uml_vfio_intr_ctx *ctx;
	int err, group_id, i;

	group_id = uml_vfio_user_get_group_id(dev->name);
	if (group_id < 0) {
		pr_err("Failed to get group id (%s), error %d\n",
		       dev->name, group_id);
		goto free_dev;
	}

	dev->group = uml_vfio_open_group(group_id);
	if (dev->group < 0) {
		pr_err("Failed to open group %d (%s), error %d\n",
		       group_id, dev->name, dev->group);
		goto free_dev;
	}

	err = uml_vfio_user_setup_device(&dev->udev, dev->group, dev->name);
	if (err) {
		pr_err("Failed to setup device (%s), error %d\n",
		       dev->name, err);
		goto release_group;
	}

	err = uml_vfio_read_msix_table(dev);
	if (err) {
		pr_err("Failed to read MSI-X table (%s), error %d\n",
		       dev->name, err);
		goto teardown_udev;
	}

	dev->intr_ctx = kmalloc_array(dev->udev.irq_count,
				      sizeof(struct uml_vfio_intr_ctx),
				      GFP_KERNEL);
	if (!dev->intr_ctx) {
		pr_err("Failed to allocate interrupt context (%s)\n",
		       dev->name);
		goto free_msix;
	}

	for (i = 0; i < dev->udev.irq_count; i++) {
		ctx = &dev->intr_ctx[i];
		ctx->dev = dev;
		ctx->irq = -1;
	}

	dev->pdev.ops = &uml_vfio_um_pci_ops;

	err = um_pci_device_register(&dev->pdev);
	if (err) {
		pr_err("Failed to register UM PCI device (%s), error %d\n",
		       dev->name, err);
		goto free_intr_ctx;
	}

	return;

free_intr_ctx:
	kfree(dev->intr_ctx);
free_msix:
	kfree(dev->msix_data);
teardown_udev:
	uml_vfio_user_teardown_device(&dev->udev);
release_group:
	uml_vfio_release_group(dev->group);
free_dev:
	list_del(&dev->list);
	kfree(dev->name);
	kfree(dev);
}

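/*
 * Tear down a device in the reverse order of uml_vfio_open_device(),
 * deactivating all vectors first so that no irqfd fires while the
 * state is being freed.
 */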
static void uml_vfio_release_device(struct uml_vfio_device *dev)
{
	int i;

	for (i = 0; i < dev->udev.irq_count; i++)
		uml_vfio_deactivate_irq(dev, i);
	uml_vfio_user_update_irqs(&dev->udev);

	um_pci_device_unregister(&dev->pdev);
	kfree(dev->intr_ctx);
	kfree(dev->msix_data);
	uml_vfio_user_teardown_device(&dev->udev);
	uml_vfio_release_group(dev->group);
	list_del(&dev->list);
	kfree(dev->name);
	kfree(dev);
}

static struct uml_vfio_device *uml_vfio_find_device(const char *device)
{
	struct uml_vfio_device *dev;

	list_for_each_entry(dev, &uml_vfio_devices, list) {
		if (!strcmp(dev->name, device))
			return dev;
	}
	return NULL;
}

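/*
 * Record a device to pass through. The container is opened lazily on
 * the first device, and duplicates are rejected. The device is only
 * queued here; it is actually opened at late_initcall time or from the
 * mconsole handler.
 */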
static struct uml_vfio_device *uml_vfio_add_device(const char *device)
{
	struct uml_vfio_device *dev;
	int fd;

	guard(mutex)(&uml_vfio_devices_mtx);

	if (uml_vfio_container.fd < 0) {
		fd = uml_vfio_user_open_container();
		if (fd < 0)
			return ERR_PTR(fd);
		uml_vfio_container.fd = fd;
	}

	if (uml_vfio_find_device(device))
		return ERR_PTR(-EEXIST);

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	dev->name = kstrdup(device, GFP_KERNEL);
	if (!dev->name) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	list_add_tail(&dev->list, &uml_vfio_devices);
	return dev;
}

static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp)
{
	struct uml_vfio_device *dev;

	dev = uml_vfio_add_device(device);
	if (IS_ERR(dev))
		return PTR_ERR(dev);
	return 0;
}

static int uml_vfio_cmdline_get(char *buffer, const struct kernel_param *kp)
{
	return 0;
}

static const struct kernel_param_ops uml_vfio_cmdline_param_ops = {
	.set = uml_vfio_cmdline_set,
	.get = uml_vfio_cmdline_get,
};

device_param_cb(device, &uml_vfio_cmdline_param_ops, NULL, 0400);
__uml_help(uml_vfio_cmdline_param_ops,
"vfio_uml.device=<domain:bus:slot.function>\n"
"    Pass through a PCI device to UML via VFIO. Currently, only MSI-X\n"
"    capable devices are supported, and it is assumed that drivers will\n"
"    use MSI-X. This parameter can be specified multiple times to pass\n"
"    through multiple PCI devices to UML.\n\n"
);

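/*
 * Hot-add a device via mconsole. @str has the form
 * "=<domain:bus:slot.function>", e.g. (the device address below is only
 * an illustration):
 *
 *   uml_mconsole <umid> config vfio_uml.device=0000:00:10.0
 */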
static int uml_vfio_mc_config(char *str, char **error_out)
{
	struct uml_vfio_device *dev;

	if (*str != '=') {
		*error_out = "Invalid config";
		return -EINVAL;
	}
	str += 1;

	dev = uml_vfio_add_device(str);
	if (IS_ERR(dev))
		return PTR_ERR(dev);
	uml_vfio_open_device(dev);
	return 0;
}

static int uml_vfio_mc_id(char **str, int *start_out, int *end_out)
{
	return -EOPNOTSUPP;
}

static int uml_vfio_mc_remove(int n, char **error_out)
{
	return -EOPNOTSUPP;
}

static struct mc_device uml_vfio_mc = {
	.list = LIST_HEAD_INIT(uml_vfio_mc.list),
	.name = "vfio_uml.device",
	.config = uml_vfio_mc_config,
	.get_config = NULL,
	.id = uml_vfio_mc_id,
	.remove = uml_vfio_mc_remove,
};

static int __init uml_vfio_init(void)
{
	struct uml_vfio_device *dev, *n;

	sigio_broken();

	/* If the opening fails, the device will be released. */
	list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
		uml_vfio_open_device(dev);

	mconsole_register_dev(&uml_vfio_mc);

	return 0;
}
late_initcall(uml_vfio_init);

static void __exit uml_vfio_exit(void)
{
	struct uml_vfio_device *dev, *n;

	list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
		uml_vfio_release_device(dev);

	if (uml_vfio_container.fd >= 0)
		os_close_file(uml_vfio_container.fd);
}
module_exit(uml_vfio_exit);