1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2025 Ant Group
4 * Author: Tiwei Bie <tiwei.btw@antgroup.com>
5 */
6
7 #define pr_fmt(fmt) "vfio-uml: " fmt
8
9 #include <linux/module.h>
10 #include <linux/logic_iomem.h>
11 #include <linux/mutex.h>
12 #include <linux/list.h>
13 #include <linux/string.h>
14 #include <linux/unaligned.h>
15 #include <irq_kern.h>
16 #include <init.h>
17 #include <os.h>
18
19 #include "mconsole_kern.h"
20 #include "virt-pci.h"
21 #include "vfio_user.h"
22
/* Resolve the uml_vfio_device that embeds a given um_pci_device. */
#define to_vdev(_pdev) container_of(_pdev, struct uml_vfio_device, pdev)

/* Per-MSI-X-vector interrupt state; one array entry per vector. */
struct uml_vfio_intr_ctx {
	struct uml_vfio_device *dev;	/* owning device (back-pointer) */
	int irq;			/* UM IRQ number, or -1 while the vector is inactive */
};
29
/* One PCI device passed through to UML via VFIO, plus its runtime state. */
struct uml_vfio_device {
	const char *name;	/* device address string, kstrdup'ed in uml_vfio_add_device() */
	int group;		/* fd of the VFIO group this device belongs to */

	struct um_pci_device pdev;		/* virtual PCI presence in UML */
	struct uml_vfio_user_device udev;	/* user-side VFIO device handle */
	struct uml_vfio_intr_ctx *intr_ctx;	/* one entry per MSI-X vector */

	int msix_cap;		/* config-space offset of the MSI-X capability */
	int msix_bar;		/* BAR index holding the MSI-X table */
	int msix_offset;	/* MSI-X table offset within that BAR */
	int msix_size;		/* MSI-X table size in bytes */
	u32 *msix_data;		/* shadow copy of each entry's Message Data field */

	struct list_head list;	/* link in uml_vfio_devices */
};
46
/* Reference-counted handle on an open VFIO group. */
struct uml_vfio_group {
	int id;			/* VFIO group number */
	int fd;			/* fd returned by uml_vfio_user_open_group() */
	int users;		/* number of devices currently using this group */
	struct list_head list;	/* link in uml_vfio_groups */
};
53
/* Single VFIO container shared by all groups; fd stays -1 until first use. */
static struct {
	int fd;
	int users;
} uml_vfio_container = { .fd = -1 };
static DEFINE_MUTEX(uml_vfio_container_mtx);

/* All open groups, protected by uml_vfio_groups_mtx. */
static LIST_HEAD(uml_vfio_groups);
static DEFINE_MUTEX(uml_vfio_groups_mtx);

/* All registered devices, protected by uml_vfio_devices_mtx. */
static LIST_HEAD(uml_vfio_devices);
static DEFINE_MUTEX(uml_vfio_devices_mtx);
65
uml_vfio_set_container(int group_fd)66 static int uml_vfio_set_container(int group_fd)
67 {
68 int err;
69
70 guard(mutex)(¨_vfio_container_mtx);
71
72 err = uml_vfio_user_set_container(uml_vfio_container.fd, group_fd);
73 if (err)
74 return err;
75
76 uml_vfio_container.users++;
77 if (uml_vfio_container.users > 1)
78 return 0;
79
80 err = uml_vfio_user_setup_iommu(uml_vfio_container.fd);
81 if (err) {
82 uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
83 uml_vfio_container.users--;
84 }
85 return err;
86 }
87
uml_vfio_unset_container(int group_fd)88 static void uml_vfio_unset_container(int group_fd)
89 {
90 guard(mutex)(¨_vfio_container_mtx);
91
92 uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
93 uml_vfio_container.users--;
94 }
95
uml_vfio_open_group(int group_id)96 static int uml_vfio_open_group(int group_id)
97 {
98 struct uml_vfio_group *group;
99 int err;
100
101 guard(mutex)(¨_vfio_groups_mtx);
102
103 list_for_each_entry(group, ¨_vfio_groups, list) {
104 if (group->id == group_id) {
105 group->users++;
106 return group->fd;
107 }
108 }
109
110 group = kzalloc_obj(*group);
111 if (!group)
112 return -ENOMEM;
113
114 group->fd = uml_vfio_user_open_group(group_id);
115 if (group->fd < 0) {
116 err = group->fd;
117 goto free_group;
118 }
119
120 err = uml_vfio_set_container(group->fd);
121 if (err)
122 goto close_group;
123
124 group->id = group_id;
125 group->users = 1;
126
127 list_add(&group->list, ¨_vfio_groups);
128
129 return group->fd;
130
131 close_group:
132 os_close_file(group->fd);
133 free_group:
134 kfree(group);
135 return err;
136 }
137
uml_vfio_release_group(int group_fd)138 static int uml_vfio_release_group(int group_fd)
139 {
140 struct uml_vfio_group *group;
141
142 guard(mutex)(¨_vfio_groups_mtx);
143
144 list_for_each_entry(group, ¨_vfio_groups, list) {
145 if (group->fd == group_fd) {
146 group->users--;
147 if (group->users == 0) {
148 uml_vfio_unset_container(group_fd);
149 os_close_file(group_fd);
150 list_del(&group->list);
151 kfree(group);
152 }
153 return 0;
154 }
155 }
156
157 return -ENOENT;
158 }
159
/*
 * IRQ handler for one MSI-X vector. Drains the eventfd backing the
 * vector and forwards each signalled event to the IRQ number that was
 * written into the shadow MSI-X table (msix_data) for this vector.
 */
static irqreturn_t uml_vfio_interrupt(int unused, void *opaque)
{
	struct uml_vfio_intr_ctx *ctx = opaque;
	struct uml_vfio_device *dev = ctx->dev;
	int index = ctx - dev->intr_ctx;	/* vector number from array position */
	int irqfd = dev->udev.irqfd[index];
	int irq = dev->msix_data[index];
	uint64_t v;
	int r;

	/* Drain the eventfd completely, retrying on EINTR. */
	do {
		r = os_read_file(irqfd, &v, sizeof(v));
		if (r == sizeof(v))
			generic_handle_irq(irq);
	} while (r == sizeof(v) || r == -EINTR);
	/* The fd is expected to be non-blocking, so EAGAIN ends the drain. */
	WARN(r != -EAGAIN, "read returned %d\n", r);

	return IRQ_HANDLED;
}
179
/*
 * Arm one MSI-X vector: obtain its eventfd from the VFIO layer, hook
 * the fd up to a UM IRQ, and register it for SIGIO notification.
 * Idempotent - returns 0 immediately if the vector is already armed.
 */
static int uml_vfio_activate_irq(struct uml_vfio_device *dev, int index)
{
	struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];
	int err, irqfd;

	if (ctx->irq >= 0)	/* already active */
		return 0;

	irqfd = uml_vfio_user_activate_irq(&dev->udev, index);
	if (irqfd < 0)
		return irqfd;

	ctx->irq = um_request_irq(UM_IRQ_ALLOC, irqfd, IRQ_READ,
				  uml_vfio_interrupt, 0,
				  "vfio-uml", ctx);
	if (ctx->irq < 0) {
		err = ctx->irq;
		goto deactivate;
	}

	err = add_sigio_fd(irqfd);
	if (err)
		goto free_irq;

	return 0;

	/* Unwind in reverse order of setup. */
free_irq:
	um_free_irq(ctx->irq, ctx);
	ctx->irq = -1;
deactivate:
	uml_vfio_user_deactivate_irq(&dev->udev, index);
	return err;
}
213
uml_vfio_deactivate_irq(struct uml_vfio_device * dev,int index)214 static int uml_vfio_deactivate_irq(struct uml_vfio_device *dev, int index)
215 {
216 struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];
217
218 if (ctx->irq >= 0) {
219 ignore_sigio_fd(dev->udev.irqfd[index]);
220 um_free_irq(ctx->irq, ctx);
221 uml_vfio_user_deactivate_irq(&dev->udev, index);
222 ctx->irq = -1;
223 }
224 return 0;
225 }
226
uml_vfio_update_msix_cap(struct uml_vfio_device * dev,unsigned int offset,int size,unsigned long val)227 static int uml_vfio_update_msix_cap(struct uml_vfio_device *dev,
228 unsigned int offset, int size,
229 unsigned long val)
230 {
231 /*
232 * Here, we handle only the operations we care about,
233 * ignoring the rest.
234 */
235 if (size == 2 && offset == dev->msix_cap + PCI_MSIX_FLAGS) {
236 switch (val & ~PCI_MSIX_FLAGS_QSIZE) {
237 case PCI_MSIX_FLAGS_ENABLE:
238 case 0:
239 return uml_vfio_user_update_irqs(&dev->udev);
240 }
241 }
242 return 0;
243 }
244
/*
 * Intercept guest writes to the MSI-X table, mirroring each entry's
 * Message Data field into the shadow table and arming/disarming the
 * corresponding vector. Only aligned 4-byte writes that land exactly
 * on a Message Data field are handled; everything else is ignored.
 */
static int uml_vfio_update_msix_table(struct uml_vfio_device *dev,
				      unsigned int offset, int size,
				      unsigned long val)
{
	int index;

	/*
	 * Here, we handle only the operations we care about,
	 * ignoring the rest.
	 */
	/*
	 * Rebase onto the first entry's Message Data field. offset is
	 * unsigned, so writes below that address wrap around to huge
	 * values and are filtered by the checks below - NOTE(review):
	 * a wrapped offset that happens to be entry-aligned would hit
	 * the -EINVAL path; confirm callers never pass such offsets.
	 */
	offset -= dev->msix_offset + PCI_MSIX_ENTRY_DATA;

	if (size != 4 || offset % PCI_MSIX_ENTRY_SIZE != 0)
		return 0;

	index = offset / PCI_MSIX_ENTRY_SIZE;
	if (index >= dev->udev.irq_count)
		return -EINVAL;

	/* Remember the IRQ number the guest programmed for this vector. */
	dev->msix_data[index] = val;

	/* Non-zero Message Data arms the vector; zero disarms it. */
	return val ? uml_vfio_activate_irq(dev, index) :
		uml_vfio_deactivate_irq(dev, index);
}
269
__uml_vfio_cfgspace_read(struct uml_vfio_device * dev,unsigned int offset,int size)270 static unsigned long __uml_vfio_cfgspace_read(struct uml_vfio_device *dev,
271 unsigned int offset, int size)
272 {
273 u8 data[8];
274
275 memset(data, 0xff, sizeof(data));
276
277 if (uml_vfio_user_cfgspace_read(&dev->udev, offset, data, size))
278 return ULONG_MAX;
279
280 switch (size) {
281 case 1:
282 return data[0];
283 case 2:
284 return le16_to_cpup((void *)data);
285 case 4:
286 return le32_to_cpup((void *)data);
287 #ifdef CONFIG_64BIT
288 case 8:
289 return le64_to_cpup((void *)data);
290 #endif
291 default:
292 return ULONG_MAX;
293 }
294 }
295
/* um_pci_ops config-space read callback: thin wrapper over the helper. */
static unsigned long uml_vfio_cfgspace_read(struct um_pci_device *pdev,
					    unsigned int offset, int size)
{
	return __uml_vfio_cfgspace_read(to_vdev(pdev), offset, size);
}
303
__uml_vfio_cfgspace_write(struct uml_vfio_device * dev,unsigned int offset,int size,unsigned long val)304 static void __uml_vfio_cfgspace_write(struct uml_vfio_device *dev,
305 unsigned int offset, int size,
306 unsigned long val)
307 {
308 u8 data[8];
309
310 switch (size) {
311 case 1:
312 data[0] = (u8)val;
313 break;
314 case 2:
315 put_unaligned_le16(val, (void *)data);
316 break;
317 case 4:
318 put_unaligned_le32(val, (void *)data);
319 break;
320 #ifdef CONFIG_64BIT
321 case 8:
322 put_unaligned_le64(val, (void *)data);
323 break;
324 #endif
325 }
326
327 WARN_ON(uml_vfio_user_cfgspace_write(&dev->udev, offset, data, size));
328 }
329
/*
 * um_pci_ops config-space write callback. Writes overlapping the
 * MSI-X capability are first shown to uml_vfio_update_msix_cap() so
 * interrupt state can follow the enable bit; the write itself is then
 * forwarded to the device unchanged.
 */
static void uml_vfio_cfgspace_write(struct um_pci_device *pdev,
				    unsigned int offset, int size,
				    unsigned long val)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	/* Overlap test: [offset, offset + size) vs the MSI-X capability. */
	if (offset < dev->msix_cap + PCI_CAP_MSIX_SIZEOF &&
	    offset + size > dev->msix_cap)
		WARN_ON(uml_vfio_update_msix_cap(dev, offset, size, val));

	__uml_vfio_cfgspace_write(dev, offset, size, val);
}
342
uml_vfio_bar_copy_from(struct um_pci_device * pdev,int bar,void * buffer,unsigned int offset,int size)343 static void uml_vfio_bar_copy_from(struct um_pci_device *pdev, int bar,
344 void *buffer, unsigned int offset, int size)
345 {
346 struct uml_vfio_device *dev = to_vdev(pdev);
347
348 memset(buffer, 0xff, size);
349 uml_vfio_user_bar_read(&dev->udev, bar, offset, buffer, size);
350 }
351
uml_vfio_bar_read(struct um_pci_device * pdev,int bar,unsigned int offset,int size)352 static unsigned long uml_vfio_bar_read(struct um_pci_device *pdev, int bar,
353 unsigned int offset, int size)
354 {
355 u8 data[8];
356
357 uml_vfio_bar_copy_from(pdev, bar, data, offset, size);
358
359 switch (size) {
360 case 1:
361 return data[0];
362 case 2:
363 return le16_to_cpup((void *)data);
364 case 4:
365 return le32_to_cpup((void *)data);
366 #ifdef CONFIG_64BIT
367 case 8:
368 return le64_to_cpup((void *)data);
369 #endif
370 default:
371 return ULONG_MAX;
372 }
373 }
374
uml_vfio_bar_copy_to(struct um_pci_device * pdev,int bar,unsigned int offset,const void * buffer,int size)375 static void uml_vfio_bar_copy_to(struct um_pci_device *pdev, int bar,
376 unsigned int offset, const void *buffer,
377 int size)
378 {
379 struct uml_vfio_device *dev = to_vdev(pdev);
380
381 uml_vfio_user_bar_write(&dev->udev, bar, offset, buffer, size);
382 }
383
/*
 * um_pci_ops BAR write callback: serialize val little-endian and
 * forward it. Writes that overlap the MSI-X table in the MSI-X BAR
 * are first fed to uml_vfio_update_msix_table() so the shadow table
 * and the eventfd wiring stay in sync with the guest's programming.
 */
static void uml_vfio_bar_write(struct um_pci_device *pdev, int bar,
			       unsigned int offset, int size,
			       unsigned long val)
{
	struct uml_vfio_device *dev = to_vdev(pdev);
	u8 data[8];

	/* Overlap test: [offset, offset + size) vs the MSI-X table. */
	if (bar == dev->msix_bar && offset + size > dev->msix_offset &&
	    offset < dev->msix_offset + dev->msix_size)
		WARN_ON(uml_vfio_update_msix_table(dev, offset, size, val));

	switch (size) {
	case 1:
		data[0] = (u8)val;
		break;
	case 2:
		put_unaligned_le16(val, (void *)data);
		break;
	case 4:
		put_unaligned_le32(val, (void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		put_unaligned_le64(val, (void *)data);
		break;
#endif
	}

	uml_vfio_bar_copy_to(pdev, bar, offset, data, size);
}
414
/*
 * Fill `size` bytes of a BAR region with a single byte value. The VFIO
 * user API has no fill primitive, so the value is written one byte at
 * a time.
 */
static void uml_vfio_bar_set(struct um_pci_device *pdev, int bar,
			     unsigned int offset, u8 value, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);
	int done;

	for (done = 0; done < size; done++)
		uml_vfio_user_bar_write(&dev->udev, bar, offset + done,
					&value, 1);
}
424
/* Virtual PCI backend callbacks wired into the UML PCI core. */
static const struct um_pci_ops uml_vfio_um_pci_ops = {
	.cfgspace_read = uml_vfio_cfgspace_read,
	.cfgspace_write = uml_vfio_cfgspace_write,
	.bar_read = uml_vfio_bar_read,
	.bar_write = uml_vfio_bar_write,
	.bar_copy_from = uml_vfio_bar_copy_from,
	.bar_copy_to = uml_vfio_bar_copy_to,
	.bar_set = uml_vfio_bar_set,
};
434
/*
 * Walk the PCI capability list looking for capability id `cap`.
 * Returns the config-space offset of the capability, or 0 if it is
 * absent. The walk is bounded to guard against malformed/looping
 * capability chains.
 */
static u8 uml_vfio_find_capability(struct uml_vfio_device *dev, u8 cap)
{
	int ttl;
	u8 pos;

	pos = __uml_vfio_cfgspace_read(dev, PCI_CAPABILITY_LIST, sizeof(pos));

	for (ttl = 48 /* PCI_FIND_CAP_TTL */; pos && ttl; ttl--) {
		u16 ent = __uml_vfio_cfgspace_read(dev, pos, sizeof(ent));
		u8 id = ent & 0xff;

		if (id == 0xff)	/* read failure / end of valid space */
			break;
		if (id == cap)
			return pos;

		pos = ent >> 8;	/* next-pointer byte */
	}

	return 0;
}
457
uml_vfio_read_msix_table(struct uml_vfio_device * dev)458 static int uml_vfio_read_msix_table(struct uml_vfio_device *dev)
459 {
460 unsigned int off;
461 u16 flags;
462 u32 tbl;
463
464 off = uml_vfio_find_capability(dev, PCI_CAP_ID_MSIX);
465 if (!off)
466 return -ENOTSUPP;
467
468 dev->msix_cap = off;
469
470 tbl = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_TABLE, sizeof(tbl));
471 flags = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_FLAGS, sizeof(flags));
472
473 dev->msix_bar = tbl & PCI_MSIX_TABLE_BIR;
474 dev->msix_offset = tbl & PCI_MSIX_TABLE_OFFSET;
475 dev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * PCI_MSIX_ENTRY_SIZE;
476
477 dev->msix_data = kzalloc(dev->msix_size, GFP_KERNEL);
478 if (!dev->msix_data)
479 return -ENOMEM;
480
481 return 0;
482 }
483
uml_vfio_open_device(struct uml_vfio_device * dev)484 static void uml_vfio_open_device(struct uml_vfio_device *dev)
485 {
486 struct uml_vfio_intr_ctx *ctx;
487 int err, group_id, i;
488
489 group_id = uml_vfio_user_get_group_id(dev->name);
490 if (group_id < 0) {
491 pr_err("Failed to get group id (%s), error %d\n",
492 dev->name, group_id);
493 goto free_dev;
494 }
495
496 dev->group = uml_vfio_open_group(group_id);
497 if (dev->group < 0) {
498 pr_err("Failed to open group %d (%s), error %d\n",
499 group_id, dev->name, dev->group);
500 goto free_dev;
501 }
502
503 err = uml_vfio_user_setup_device(&dev->udev, dev->group, dev->name);
504 if (err) {
505 pr_err("Failed to setup device (%s), error %d\n",
506 dev->name, err);
507 goto release_group;
508 }
509
510 err = uml_vfio_read_msix_table(dev);
511 if (err) {
512 pr_err("Failed to read MSI-X table (%s), error %d\n",
513 dev->name, err);
514 goto teardown_udev;
515 }
516
517 dev->intr_ctx = kmalloc_objs(struct uml_vfio_intr_ctx,
518 dev->udev.irq_count);
519 if (!dev->intr_ctx) {
520 pr_err("Failed to allocate interrupt context (%s)\n",
521 dev->name);
522 goto free_msix;
523 }
524
525 for (i = 0; i < dev->udev.irq_count; i++) {
526 ctx = &dev->intr_ctx[i];
527 ctx->dev = dev;
528 ctx->irq = -1;
529 }
530
531 dev->pdev.ops = ¨_vfio_um_pci_ops;
532
533 err = um_pci_device_register(&dev->pdev);
534 if (err) {
535 pr_err("Failed to register UM PCI device (%s), error %d\n",
536 dev->name, err);
537 goto free_intr_ctx;
538 }
539
540 return;
541
542 free_intr_ctx:
543 kfree(dev->intr_ctx);
544 free_msix:
545 kfree(dev->msix_data);
546 teardown_udev:
547 uml_vfio_user_teardown_device(&dev->udev);
548 release_group:
549 uml_vfio_release_group(dev->group);
550 free_dev:
551 list_del(&dev->list);
552 kfree(dev->name);
553 kfree(dev);
554 }
555
/*
 * Tear a device down in the reverse order of uml_vfio_open_device():
 * disarm all interrupts, unregister the virtual PCI device, free
 * per-device state and drop the group reference. Frees dev; callers
 * must not touch it afterwards.
 */
static void uml_vfio_release_device(struct uml_vfio_device *dev)
{
	int i;

	/* Disarm every vector, then push the empty IRQ set to the VFIO layer. */
	for (i = 0; i < dev->udev.irq_count; i++)
		uml_vfio_deactivate_irq(dev, i);
	uml_vfio_user_update_irqs(&dev->udev);

	um_pci_device_unregister(&dev->pdev);
	kfree(dev->intr_ctx);
	kfree(dev->msix_data);
	uml_vfio_user_teardown_device(&dev->udev);
	uml_vfio_release_group(dev->group);
	list_del(&dev->list);
	kfree(dev->name);
	kfree(dev);
}
573
uml_vfio_find_device(const char * device)574 static struct uml_vfio_device *uml_vfio_find_device(const char *device)
575 {
576 struct uml_vfio_device *dev;
577
578 list_for_each_entry(dev, ¨_vfio_devices, list) {
579 if (!strcmp(dev->name, device))
580 return dev;
581 }
582 return NULL;
583 }
584
uml_vfio_add_device(const char * device)585 static struct uml_vfio_device *uml_vfio_add_device(const char *device)
586 {
587 struct uml_vfio_device *dev;
588 int fd;
589
590 guard(mutex)(¨_vfio_devices_mtx);
591
592 if (uml_vfio_container.fd < 0) {
593 fd = uml_vfio_user_open_container();
594 if (fd < 0)
595 return ERR_PTR(fd);
596 uml_vfio_container.fd = fd;
597 }
598
599 if (uml_vfio_find_device(device))
600 return ERR_PTR(-EEXIST);
601
602 dev = kzalloc_obj(*dev);
603 if (!dev)
604 return ERR_PTR(-ENOMEM);
605
606 dev->name = kstrdup(device, GFP_KERNEL);
607 if (!dev->name) {
608 kfree(dev);
609 return ERR_PTR(-ENOMEM);
610 }
611
612 list_add_tail(&dev->list, ¨_vfio_devices);
613 return dev;
614 }
615
/*
 * Kernel-parameter setter: register the named device. Opening is
 * deferred to the late initcall (uml_vfio_init()).
 */
static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp)
{
	return PTR_ERR_OR_ZERO(uml_vfio_add_device(device));
}
625
/* Kernel-parameter getter: the parameter is write-only, report nothing. */
static int uml_vfio_cmdline_get(char *buffer, const struct kernel_param *kp)
{
	return 0;
}
630
631 static const struct kernel_param_ops uml_vfio_cmdline_param_ops = {
632 .set = uml_vfio_cmdline_set,
633 .get = uml_vfio_cmdline_get,
634 };
635
636 device_param_cb(device, ¨_vfio_cmdline_param_ops, NULL, 0400);
637 __uml_help(uml_vfio_cmdline_param_ops,
638 "vfio_uml.device=<domain:bus:slot.function>\n"
639 " Pass through a PCI device to UML via VFIO. Currently, only MSI-X\n"
640 " capable devices are supported, and it is assumed that drivers will\n"
641 " use MSI-X. This parameter can be specified multiple times to pass\n"
642 " through multiple PCI devices to UML.\n\n"
643 );
644
uml_vfio_mc_config(char * str,char ** error_out)645 static int uml_vfio_mc_config(char *str, char **error_out)
646 {
647 struct uml_vfio_device *dev;
648
649 if (*str != '=') {
650 *error_out = "Invalid config";
651 return -EINVAL;
652 }
653 str += 1;
654
655 dev = uml_vfio_add_device(str);
656 if (IS_ERR(dev))
657 return PTR_ERR(dev);
658 uml_vfio_open_device(dev);
659 return 0;
660 }
661
/* mconsole id lookup is not supported for VFIO devices. */
static int uml_vfio_mc_id(char **str, int *start_out, int *end_out)
{
	return -EOPNOTSUPP;
}
666
/* mconsole hot-removal is not supported for VFIO devices. */
static int uml_vfio_mc_remove(int n, char **error_out)
{
	return -EOPNOTSUPP;
}
671
/* mconsole device: enables "config vfio_uml.device=<addr>" at runtime. */
static struct mc_device uml_vfio_mc = {
	.list = LIST_HEAD_INIT(uml_vfio_mc.list),
	.name = "vfio_uml.device",
	.config = uml_vfio_mc_config,
	.get_config = NULL,
	.id = uml_vfio_mc_id,
	.remove = uml_vfio_mc_remove,
};
680
uml_vfio_init(void)681 static int __init uml_vfio_init(void)
682 {
683 struct uml_vfio_device *dev, *n;
684
685 sigio_broken();
686
687 /* If the opening fails, the device will be released. */
688 list_for_each_entry_safe(dev, n, ¨_vfio_devices, list)
689 uml_vfio_open_device(dev);
690
691 mconsole_register_dev(¨_vfio_mc);
692
693 return 0;
694 }
695 late_initcall(uml_vfio_init);
696
uml_vfio_exit(void)697 static void __exit uml_vfio_exit(void)
698 {
699 struct uml_vfio_device *dev, *n;
700
701 list_for_each_entry_safe(dev, n, ¨_vfio_devices, list)
702 uml_vfio_release_device(dev);
703
704 if (uml_vfio_container.fd >= 0)
705 os_close_file(uml_vfio_container.fd);
706 }
707 module_exit(uml_vfio_exit);
708