// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2025 Ant Group
 * Author: Tiwei Bie <tiwei.btw@antgroup.com>
 *
 * VFIO-based PCI passthrough for User Mode Linux.  Each assigned host
 * PCI device is wrapped in a um_pci (virt-pci) device and exposed to
 * the UML guest.  Config-space and BAR accesses are forwarded to the
 * host VFIO device; guest interrupts are delivered through per-vector
 * eventfds.  Only MSI-X capable devices are supported (see the
 * __uml_help text below).
 */

#define pr_fmt(fmt) "vfio-uml: " fmt

#include <linux/module.h>
#include <linux/logic_iomem.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/string.h>
#include <linux/unaligned.h>
#include <irq_kern.h>
#include <init.h>
#include <os.h>

#include "mconsole_kern.h"
#include "virt-pci.h"
#include "vfio_user.h"

#define to_vdev(_pdev) container_of(_pdev, struct uml_vfio_device, pdev)

/*
 * Per-MSI-X-vector interrupt context.  The vector index is recovered by
 * pointer arithmetic against the containing dev->intr_ctx array (see
 * uml_vfio_interrupt()), so this must stay an element of that array.
 */
struct uml_vfio_intr_ctx {
	struct uml_vfio_device *dev;
	int irq;	/* UML IRQ number, or -1 while the vector is inactive */
};

/* One passed-through host PCI device. */
struct uml_vfio_device {
	const char *name;	/* host device address, e.g. 0000:00:1f.2 */
	int group;		/* VFIO group fd this device belongs to */

	struct um_pci_device pdev;	/* virt-pci device presented to the guest */
	struct uml_vfio_user_device udev;	/* userspace-side VFIO handles */
	struct uml_vfio_intr_ctx *intr_ctx;	/* one entry per MSI-X vector */

	/* Location of the MSI-X capability and table, filled in by
	 * uml_vfio_read_msix_table(). */
	int msix_cap;		/* config-space offset of the MSI-X capability */
	int msix_bar;		/* BAR index holding the MSI-X table */
	int msix_offset;	/* table offset within that BAR */
	int msix_size;		/* table size in bytes */
	u32 *msix_data;		/* shadow of each vector's MSI-X message data */

	struct list_head list;	/* on uml_vfio_devices */
};

/* A VFIO group shared by the devices it contains, refcounted by 'users'. */
struct uml_vfio_group {
	int id;
	int fd;
	int users;
	struct list_head list;	/* on uml_vfio_groups */
};

/* Single VFIO container shared by all groups; fd < 0 means not yet open. */
static struct {
	int fd;
	int users;
} uml_vfio_container = { .fd = -1 };
static DEFINE_MUTEX(uml_vfio_container_mtx);

static LIST_HEAD(uml_vfio_groups);
static DEFINE_MUTEX(uml_vfio_groups_mtx);

static LIST_HEAD(uml_vfio_devices);
static DEFINE_MUTEX(uml_vfio_devices_mtx);

/*
 * Attach a group to the shared container, setting up the IOMMU mapping
 * on first use.  Takes a container reference on success.
 */
static int uml_vfio_set_container(int group_fd)
{
	int err;

	guard(mutex)(&uml_vfio_container_mtx);

	err = uml_vfio_user_set_container(uml_vfio_container.fd, group_fd);
	if (err)
		return err;

	uml_vfio_container.users++;
	if (uml_vfio_container.users > 1)
		return 0;

	/* First user: set up the IOMMU; roll back the attach on failure. */
	err = uml_vfio_user_setup_iommu(uml_vfio_container.fd);
	if (err) {
		uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
		uml_vfio_container.users--;
	}
	return err;
}

/* Detach a group from the container and drop one container reference. */
static void uml_vfio_unset_container(int group_fd)
{
	guard(mutex)(&uml_vfio_container_mtx);

	uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
	uml_vfio_container.users--;
}

/*
 * Return an fd for the given VFIO group, opening it and attaching it to
 * the container on first use; subsequent calls share the cached fd.
 * Returns a negative errno on failure.
 */
static int uml_vfio_open_group(int group_id)
{
	struct uml_vfio_group *group;
	int err;

	guard(mutex)(&uml_vfio_groups_mtx);

	list_for_each_entry(group, &uml_vfio_groups, list) {
		if (group->id == group_id) {
			group->users++;
			return group->fd;
		}
	}

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return -ENOMEM;

	group->fd = uml_vfio_user_open_group(group_id);
	if (group->fd < 0) {
		err = group->fd;
		goto free_group;
	}

	err = uml_vfio_set_container(group->fd);
	if (err)
		goto close_group;

	group->id = group_id;
	group->users = 1;

	list_add(&group->list, &uml_vfio_groups);

	return group->fd;

close_group:
	os_close_file(group->fd);
free_group:
	kfree(group);
	return err;
}

/*
 * Drop one reference on the group identified by its fd, tearing it down
 * (container detach, close, free) when the last user goes away.
 */
static int uml_vfio_release_group(int group_fd)
{
	struct uml_vfio_group *group;

	guard(mutex)(&uml_vfio_groups_mtx);

	list_for_each_entry(group, &uml_vfio_groups, list) {
		if (group->fd == group_fd) {
			group->users--;
			if (group->users == 0) {
				uml_vfio_unset_container(group_fd);
				os_close_file(group_fd);
				list_del(&group->list);
				kfree(group);
			}
			return 0;
		}
	}

	return -ENOENT;
}

/*
 * SIGIO-driven handler for one MSI-X vector: drain the eventfd and
 * inject the guest IRQ recorded in the vector's MSI-X message data once
 * per event.  The vector index is recovered from the ctx's position in
 * dev->intr_ctx.
 */
static irqreturn_t uml_vfio_interrupt(int unused, void *opaque)
{
	struct uml_vfio_intr_ctx *ctx = opaque;
	struct uml_vfio_device *dev = ctx->dev;
	int index = ctx - dev->intr_ctx;
	int irqfd = dev->udev.irqfd[index];
	int irq = dev->msix_data[index];
	uint64_t v;
	int r;

	/* Drain until EAGAIN; the fd is expected to be non-blocking. */
	do {
		r = os_read_file(irqfd, &v, sizeof(v));
		if (r == sizeof(v))
			generic_handle_irq(irq);
	} while (r == sizeof(v) || r == -EINTR);
	WARN(r != -EAGAIN, "read returned %d\n", r);

	return IRQ_HANDLED;
}

/*
 * Enable delivery for one MSI-X vector: obtain its eventfd from VFIO,
 * wire it to a UML IRQ and register it for SIGIO.  No-op if already
 * active.  Unwinds fully on failure.
 */
static int uml_vfio_activate_irq(struct uml_vfio_device *dev, int index)
{
	struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];
	int err, irqfd;

	if (ctx->irq >= 0)
		return 0;

	irqfd = uml_vfio_user_activate_irq(&dev->udev, index);
	if (irqfd < 0)
		return irqfd;

	ctx->irq = um_request_irq(UM_IRQ_ALLOC, irqfd, IRQ_READ,
				  uml_vfio_interrupt, 0,
				  "vfio-uml", ctx);
	if (ctx->irq < 0) {
		err = ctx->irq;
		goto deactivate;
	}

	err = add_sigio_fd(irqfd);
	if (err)
		goto free_irq;

	return 0;

free_irq:
	um_free_irq(ctx->irq, ctx);
	ctx->irq = -1;
deactivate:
	uml_vfio_user_deactivate_irq(&dev->udev, index);
	return err;
}

/* Reverse of uml_vfio_activate_irq(); safe to call on an inactive vector. */
static int uml_vfio_deactivate_irq(struct uml_vfio_device *dev, int index)
{
	struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];

	if (ctx->irq >= 0) {
		ignore_sigio_fd(dev->udev.irqfd[index]);
		um_free_irq(ctx->irq, ctx);
		uml_vfio_user_deactivate_irq(&dev->udev, index);
		ctx->irq = -1;
	}
	return 0;
}

/*
 * Intercept guest writes to the MSI-X capability: on a 16-bit write of
 * the flags word that purely enables or disables MSI-X (no other flag
 * bits set beyond the table-size field), resync the host-side IRQs.
 */
static int uml_vfio_update_msix_cap(struct uml_vfio_device *dev,
				    unsigned int offset, int size,
				    unsigned long val)
{
	/*
	 * Here, we handle only the operations we care about,
	 * ignoring the rest.
	 */
	if (size == 2 && offset == dev->msix_cap + PCI_MSIX_FLAGS) {
		switch (val & ~PCI_MSIX_FLAGS_QSIZE) {
		case PCI_MSIX_FLAGS_ENABLE:
		case 0:
			return uml_vfio_user_update_irqs(&dev->udev);
		}
	}
	return 0;
}

/*
 * Intercept guest writes to the MSI-X table: shadow each vector's
 * message-data dword and (de)activate the vector accordingly.  Writes
 * to the address/vector-control fields fall out of the modulo check
 * below (non-DATA offsets are never a multiple of the entry size after
 * the subtraction) and are ignored.
 */
static int uml_vfio_update_msix_table(struct uml_vfio_device *dev,
				      unsigned int offset, int size,
				      unsigned long val)
{
	int index;

	/*
	 * Here, we handle only the operations we care about,
	 * ignoring the rest.
	 */
	offset -= dev->msix_offset + PCI_MSIX_ENTRY_DATA;

	if (size != 4 || offset % PCI_MSIX_ENTRY_SIZE != 0)
		return 0;

	index = offset / PCI_MSIX_ENTRY_SIZE;
	if (index >= dev->udev.irq_count)
		return -EINVAL;

	dev->msix_data[index] = val;

	return val ? uml_vfio_activate_irq(dev, index) :
		uml_vfio_deactivate_irq(dev, index);
}

/*
 * Read 1/2/4(/8) bytes from the device's config space, converting from
 * PCI little-endian to host order.  Returns all-ones on error, like
 * real PCI does for a failed access.
 */
static unsigned long __uml_vfio_cfgspace_read(struct uml_vfio_device *dev,
					      unsigned int offset, int size)
{
	u8 data[8];

	memset(data, 0xff, sizeof(data));

	if (uml_vfio_user_cfgspace_read(&dev->udev, offset, data, size))
		return ULONG_MAX;

	switch (size) {
	case 1:
		return data[0];
	case 2:
		return le16_to_cpup((void *)data);
	case 4:
		return le32_to_cpup((void *)data);
#ifdef CONFIG_64BIT
	case 8:
		return le64_to_cpup((void *)data);
#endif
	default:
		return ULONG_MAX;
	}
}

/* um_pci_ops.cfgspace_read hook. */
static unsigned long uml_vfio_cfgspace_read(struct um_pci_device *pdev,
					    unsigned int offset, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	return __uml_vfio_cfgspace_read(dev, offset, size);
}

/*
 * Write 1/2/4(/8) bytes to the device's config space in PCI
 * little-endian order.  A failed write only warns; there is no way to
 * report the error back through the PCI access path.
 */
static void __uml_vfio_cfgspace_write(struct uml_vfio_device *dev,
				      unsigned int offset, int size,
				      unsigned long val)
{
	u8 data[8];

	switch (size) {
	case 1:
		data[0] = (u8)val;
		break;
	case 2:
		put_unaligned_le16(val, (void *)data);
		break;
	case 4:
		put_unaligned_le32(val, (void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		put_unaligned_le64(val, (void *)data);
		break;
#endif
	}

	WARN_ON(uml_vfio_user_cfgspace_write(&dev->udev, offset, data, size));
}

/*
 * um_pci_ops.cfgspace_write hook: snoop writes overlapping the MSI-X
 * capability before forwarding them to the device.
 */
static void uml_vfio_cfgspace_write(struct um_pci_device *pdev,
				    unsigned int offset, int size,
				    unsigned long val)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	if (offset < dev->msix_cap + PCI_CAP_MSIX_SIZEOF &&
	    offset + size > dev->msix_cap)
		WARN_ON(uml_vfio_update_msix_cap(dev, offset, size, val));

	__uml_vfio_cfgspace_write(dev, offset, size, val);
}

/*
 * um_pci_ops.bar_copy_from hook: read from a BAR into buffer.  The
 * buffer is pre-filled with 0xff so a failed read looks like a PCI
 * master abort; the helper's return value is intentionally ignored.
 */
static void uml_vfio_bar_copy_from(struct um_pci_device *pdev, int bar,
				   void *buffer, unsigned int offset, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	memset(buffer, 0xff, size);
	uml_vfio_user_bar_read(&dev->udev, bar, offset, buffer, size);
}

/* um_pci_ops.bar_read hook: sized BAR read, little-endian to host order. */
static unsigned long uml_vfio_bar_read(struct um_pci_device *pdev, int bar,
				       unsigned int offset, int size)
{
	u8 data[8];

	uml_vfio_bar_copy_from(pdev, bar, data, offset, size);

	switch (size) {
	case 1:
		return data[0];
	case 2:
		return le16_to_cpup((void *)data);
	case 4:
		return le32_to_cpup((void *)data);
#ifdef CONFIG_64BIT
	case 8:
		return le64_to_cpup((void *)data);
#endif
	default:
		return ULONG_MAX;
	}
}

/* um_pci_ops.bar_copy_to hook: write a buffer into a BAR. */
static void uml_vfio_bar_copy_to(struct um_pci_device *pdev, int bar,
				 unsigned int offset, const void *buffer,
				 int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	uml_vfio_user_bar_write(&dev->udev, bar, offset, buffer, size);
}

/*
 * um_pci_ops.bar_write hook: snoop writes overlapping the MSI-X table
 * in its BAR, then forward the (little-endian) value to the device.
 */
static void uml_vfio_bar_write(struct um_pci_device *pdev, int bar,
			       unsigned int offset, int size,
			       unsigned long val)
{
	struct uml_vfio_device *dev = to_vdev(pdev);
	u8 data[8];

	if (bar == dev->msix_bar && offset + size > dev->msix_offset &&
	    offset < dev->msix_offset + dev->msix_size)
		WARN_ON(uml_vfio_update_msix_table(dev, offset, size, val));

	switch (size) {
	case 1:
		data[0] = (u8)val;
		break;
	case 2:
		put_unaligned_le16(val, (void *)data);
		break;
	case 4:
		put_unaligned_le32(val, (void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		put_unaligned_le64(val, (void *)data);
		break;
#endif
	}

	uml_vfio_bar_copy_to(pdev, bar, offset, data, size);
}

/* um_pci_ops.bar_set hook: memset-like fill, written one byte at a time. */
static void uml_vfio_bar_set(struct um_pci_device *pdev, int bar,
			     unsigned int offset, u8 value, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);
	int i;

	for (i = 0; i < size; i++)
		uml_vfio_user_bar_write(&dev->udev, bar, offset + i, &value, 1);
}

static const struct um_pci_ops uml_vfio_um_pci_ops = {
	.cfgspace_read	= uml_vfio_cfgspace_read,
	.cfgspace_write	= uml_vfio_cfgspace_write,
	.bar_read	= uml_vfio_bar_read,
	.bar_write	= uml_vfio_bar_write,
	.bar_copy_from	= uml_vfio_bar_copy_from,
	.bar_copy_to	= uml_vfio_bar_copy_to,
	.bar_set	= uml_vfio_bar_set,
};

/*
 * Walk the device's capability list (via VFIO config-space reads) and
 * return the offset of capability 'cap', or 0 if not found.  The TTL
 * bounds the walk against malformed/looping lists, mirroring the PCI
 * core's PCI_FIND_CAP_TTL.
 */
static u8 uml_vfio_find_capability(struct uml_vfio_device *dev, u8 cap)
{
	u8 id, pos;
	u16 ent;
	int ttl = 48; /* PCI_FIND_CAP_TTL */

	pos = __uml_vfio_cfgspace_read(dev, PCI_CAPABILITY_LIST, sizeof(pos));

	while (pos && ttl--) {
		ent = __uml_vfio_cfgspace_read(dev, pos, sizeof(ent));

		id = ent & 0xff;
		if (id == 0xff)
			break;
		if (id == cap)
			return pos;

		pos = ent >> 8;
	}

	return 0;
}

/*
 * Locate the MSI-X capability and record the table geometry in 'dev',
 * allocating the per-vector message-data shadow.  Returns -ENOTSUPP if
 * the device has no MSI-X capability (MSI-X is required by this driver).
 */
static int uml_vfio_read_msix_table(struct uml_vfio_device *dev)
{
	unsigned int off;
	u16 flags;
	u32 tbl;

	off = uml_vfio_find_capability(dev, PCI_CAP_ID_MSIX);
	if (!off)
		return -ENOTSUPP;

	dev->msix_cap = off;

	tbl = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_TABLE, sizeof(tbl));
	flags = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_FLAGS, sizeof(flags));

	dev->msix_bar = tbl & PCI_MSIX_TABLE_BIR;
	dev->msix_offset = tbl & PCI_MSIX_TABLE_OFFSET;
	dev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * PCI_MSIX_ENTRY_SIZE;

	/* One u32 of message data per table entry; sized from msix_size. */
	dev->msix_data = kzalloc(dev->msix_size, GFP_KERNEL);
	if (!dev->msix_data)
		return -ENOMEM;

	return 0;
}

/*
 * Bring up a device previously queued by uml_vfio_add_device(): join
 * its group, set up the VFIO userspace handles, read the MSI-X layout
 * and register the um_pci device.  On any failure the device is
 * unlinked and freed (errors are logged, not returned).
 */
static void uml_vfio_open_device(struct uml_vfio_device *dev)
{
	struct uml_vfio_intr_ctx *ctx;
	int err, group_id, i;

	group_id = uml_vfio_user_get_group_id(dev->name);
	if (group_id < 0) {
		pr_err("Failed to get group id (%s), error %d\n",
		       dev->name, group_id);
		goto free_dev;
	}

	dev->group = uml_vfio_open_group(group_id);
	if (dev->group < 0) {
		pr_err("Failed to open group %d (%s), error %d\n",
		       group_id, dev->name, dev->group);
		goto free_dev;
	}

	err = uml_vfio_user_setup_device(&dev->udev, dev->group, dev->name);
	if (err) {
		pr_err("Failed to setup device (%s), error %d\n",
		       dev->name, err);
		goto release_group;
	}

	err = uml_vfio_read_msix_table(dev);
	if (err) {
		pr_err("Failed to read MSI-X table (%s), error %d\n",
		       dev->name, err);
		goto teardown_udev;
	}

	dev->intr_ctx = kmalloc_array(dev->udev.irq_count,
				      sizeof(struct uml_vfio_intr_ctx),
				      GFP_KERNEL);
	if (!dev->intr_ctx) {
		pr_err("Failed to allocate interrupt context (%s)\n",
		       dev->name);
		goto free_msix;
	}

	for (i = 0; i < dev->udev.irq_count; i++) {
		ctx = &dev->intr_ctx[i];
		ctx->dev = dev;
		ctx->irq = -1;
	}

	dev->pdev.ops = &uml_vfio_um_pci_ops;

	err = um_pci_device_register(&dev->pdev);
	if (err) {
		pr_err("Failed to register UM PCI device (%s), error %d\n",
		       dev->name, err);
		goto free_intr_ctx;
	}

	return;

free_intr_ctx:
	kfree(dev->intr_ctx);
free_msix:
	kfree(dev->msix_data);
teardown_udev:
	uml_vfio_user_teardown_device(&dev->udev);
release_group:
	uml_vfio_release_group(dev->group);
free_dev:
	list_del(&dev->list);
	kfree(dev->name);
	kfree(dev);
}

/* Full teardown of an opened device, in reverse order of open. */
static void uml_vfio_release_device(struct uml_vfio_device *dev)
{
	int i;

	for (i = 0; i < dev->udev.irq_count; i++)
		uml_vfio_deactivate_irq(dev, i);
	uml_vfio_user_update_irqs(&dev->udev);

	um_pci_device_unregister(&dev->pdev);
	kfree(dev->intr_ctx);
	kfree(dev->msix_data);
	uml_vfio_user_teardown_device(&dev->udev);
	uml_vfio_release_group(dev->group);
	list_del(&dev->list);
	kfree(dev->name);
	kfree(dev);
}

/*
 * Look up a queued device by name.  Callers hold uml_vfio_devices_mtx
 * or run before/after concurrency is possible (init/exit).
 */
static struct uml_vfio_device *uml_vfio_find_device(const char *device)
{
	struct uml_vfio_device *dev;

	list_for_each_entry(dev, &uml_vfio_devices, list) {
		if (!strcmp(dev->name, device))
			return dev;
	}
	return NULL;
}

/*
 * Queue a device by name for passthrough, lazily opening the shared
 * VFIO container on first use.  The device is only linked here; it is
 * actually opened later (at late_initcall, or immediately for mconsole
 * hotplug).  Returns ERR_PTR on duplicate name or allocation failure.
 */
static struct uml_vfio_device *uml_vfio_add_device(const char *device)
{
	struct uml_vfio_device *dev;
	int fd;

	guard(mutex)(&uml_vfio_devices_mtx);

	if (uml_vfio_container.fd < 0) {
		fd = uml_vfio_user_open_container();
		if (fd < 0)
			return ERR_PTR(fd);
		uml_vfio_container.fd = fd;
	}

	if (uml_vfio_find_device(device))
		return ERR_PTR(-EEXIST);

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	dev->name = kstrdup(device, GFP_KERNEL);
	if (!dev->name) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	list_add_tail(&dev->list, &uml_vfio_devices);
	return dev;
}

/*
 * "vfio_uml.device=" parameter setter: queue the device; it is opened
 * later by uml_vfio_init().
 */
static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp)
{
	struct uml_vfio_device *dev;

	dev = uml_vfio_add_device(device);
	if (IS_ERR(dev))
		return PTR_ERR(dev);
	return 0;
}

/* The parameter is write-only; reading it back shows nothing. */
static int uml_vfio_cmdline_get(char *buffer, const struct kernel_param *kp)
{
	return 0;
}

static const struct kernel_param_ops uml_vfio_cmdline_param_ops = {
	.set = uml_vfio_cmdline_set,
	.get = uml_vfio_cmdline_get,
};

device_param_cb(device, &uml_vfio_cmdline_param_ops, NULL, 0400);
__uml_help(uml_vfio_cmdline_param_ops,
"vfio_uml.device=<domain:bus:slot.function>\n"
" Pass through a PCI device to UML via VFIO. Currently, only MSI-X\n"
" capable devices are supported, and it is assumed that drivers will\n"
" use MSI-X. This parameter can be specified multiple times to pass\n"
" through multiple PCI devices to UML.\n\n"
);

/*
 * mconsole "config vfio_uml.device=..." handler: queue the device and
 * open it immediately (hotplug path).  Note that open failures are
 * logged by uml_vfio_open_device() but still reported as success here.
 */
static int uml_vfio_mc_config(char *str, char **error_out)
{
	struct uml_vfio_device *dev;

	if (*str != '=') {
		*error_out = "Invalid config";
		return -EINVAL;
	}
	str += 1;

	dev = uml_vfio_add_device(str);
	if (IS_ERR(dev))
		return PTR_ERR(dev);
	uml_vfio_open_device(dev);
	return 0;
}

/* mconsole id lookup: not supported for this device class. */
static int uml_vfio_mc_id(char **str, int *start_out, int *end_out)
{
	return -EOPNOTSUPP;
}

/* mconsole hot-unplug: not supported. */
static int uml_vfio_mc_remove(int n, char **error_out)
{
	return -EOPNOTSUPP;
}

static struct mc_device uml_vfio_mc = {
	.list		= LIST_HEAD_INIT(uml_vfio_mc.list),
	.name		= "vfio_uml.device",
	.config		= uml_vfio_mc_config,
	.get_config	= NULL,
	.id		= uml_vfio_mc_id,
	.remove		= uml_vfio_mc_remove,
};

/*
 * Late init: open every device queued from the command line, then
 * register the mconsole hook for runtime additions.  Runs single
 * threaded at late_initcall time, hence no devices-list locking here.
 */
static int __init uml_vfio_init(void)
{
	struct uml_vfio_device *dev, *n;

	sigio_broken();

	/* If the opening fails, the device will be released. */
	list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
		uml_vfio_open_device(dev);

	mconsole_register_dev(&uml_vfio_mc);

	return 0;
}
late_initcall(uml_vfio_init);

/* Module exit: release every device, then close the shared container. */
static void __exit uml_vfio_exit(void)
{
	struct uml_vfio_device *dev, *n;

	list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
		uml_vfio_release_device(dev);

	if (uml_vfio_container.fd >= 0)
		os_close_file(uml_vfio_container.fd);
}
module_exit(uml_vfio_exit);