/*
 * kvm eventfd support - use eventfd objects to signal various KVM events
 *
 * Copyright 2009 Novell. All Rights Reserved.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Author:
 *      Gregory Haskins <ghaskins@novell.com>
 *
 * This file is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>
#include <linux/slab.h>

#include "iodev.h"

/*
 * --------------------------------------------------------------------
 * irqfd: Allows an fd to be used to inject an interrupt to the guest
 *
 * Credit goes to Avi Kivity for the original idea.
 * --------------------------------------------------------------------
 */

struct _irqfd {
        /* Used for MSI fast-path */
        struct kvm *kvm;
        wait_queue_t wait;
        /* Update side is protected by irqfds.lock */
        struct kvm_kernel_irq_routing_entry __rcu *irq_entry;
        /* Used for level IRQ fast-path */
        int gsi;
        struct work_struct inject;
        /* Used for setup/shutdown */
        struct eventfd_ctx *eventfd;
        struct list_head list;
        poll_table pt;
        struct work_struct shutdown;
};

static struct workqueue_struct *irqfd_cleanup_wq;

static void
irqfd_inject(struct work_struct *work)
{
        struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
        struct kvm *kvm = irqfd->kvm;

        kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
        kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
}

/*
 * Race-free decouple logic (ordering is critical)
 */
static void
irqfd_shutdown(struct work_struct *work)
{
        struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
        u64 cnt;

        /*
         * Synchronize with the wait-queue and unhook ourselves to prevent
         * further events.
         */
        eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);

        /*
         * We know no new events will be scheduled at this point, so block
         * until all previously outstanding events have completed
         */
        flush_work_sync(&irqfd->inject);

        /*
         * It is now safe to release the object's resources
         */
        eventfd_ctx_put(irqfd->eventfd);
        kfree(irqfd);
}

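/*
 * Overview of the irqfd lifecycle implemented in this file:
 * kvm_irqfd_assign() hooks the irqfd into the eventfd's wait-queue and links
 * it onto kvm->irqfds.items; irqfd_deactivate() unlinks it under irqfds.lock
 * and queues irqfd_shutdown() on irqfd_cleanup_wq, which unhooks the
 * wait-queue entry, flushes any pending injection work and finally frees
 * the object.
 */
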
/* assumes kvm->irqfds.lock is held */
static bool
irqfd_is_active(struct _irqfd *irqfd)
{
        return list_empty(&irqfd->list) ? false : true;
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 *
 * assumes kvm->irqfds.lock is held
 */
static void
irqfd_deactivate(struct _irqfd *irqfd)
{
        BUG_ON(!irqfd_is_active(irqfd));

        list_del_init(&irqfd->list);

        queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}

/*
 * Called with wqh->lock held and interrupts disabled
 */
static int
irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
        struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
        unsigned long flags = (unsigned long)key;
        struct kvm_kernel_irq_routing_entry *irq;
        struct kvm *kvm = irqfd->kvm;

        if (flags & POLLIN) {
                rcu_read_lock();
                irq = rcu_dereference(irqfd->irq_entry);
                /* An event has been signaled, inject an interrupt */
                if (irq)
                        kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
                else
                        schedule_work(&irqfd->inject);
                rcu_read_unlock();
        }

        if (flags & POLLHUP) {
                /* The eventfd is closing, detach from KVM */
                unsigned long flags;

                spin_lock_irqsave(&kvm->irqfds.lock, flags);

                /*
                 * We must check if someone deactivated the irqfd before
                 * we could acquire the irqfds.lock since the item is
                 * deactivated from the KVM side before it is unhooked from
                 * the wait-queue.  If it is already deactivated, we can
                 * simply return knowing the other side will cleanup for us.
                 * We cannot race against the irqfd going away since the
                 * other side is required to acquire wqh->lock, which we hold
                 */
                if (irqfd_is_active(irqfd))
                        irqfd_deactivate(irqfd);

                spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
        }

        return 0;
}

static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
                        poll_table *pt)
{
        struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
        add_wait_queue(wqh, &irqfd->wait);
}

/* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
                         struct kvm_irq_routing_table *irq_rt)
{
        struct kvm_kernel_irq_routing_entry *e;
        struct hlist_node *n;

        if (irqfd->gsi >= irq_rt->nr_rt_entries) {
                rcu_assign_pointer(irqfd->irq_entry, NULL);
                return;
        }

        hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) {
                /* Only fast-path MSI. */
                if (e->type == KVM_IRQ_ROUTING_MSI)
                        rcu_assign_pointer(irqfd->irq_entry, e);
                else
                        rcu_assign_pointer(irqfd->irq_entry, NULL);
        }
}

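/*
 * For orientation, a minimal sketch of how userspace typically reaches the
 * assign path below, assuming the KVM_IRQFD ioctl and struct kvm_irqfd from
 * the uapi headers (<linux/kvm.h>, <sys/eventfd.h>, <sys/ioctl.h>); the
 * vm_fd and GSI values are hypothetical and for illustration only:
 *
 *      int efd = eventfd(0, EFD_CLOEXEC);
 *      struct kvm_irqfd data = {
 *              .fd  = efd,     // eventfd KVM should poll
 *              .gsi = 5,       // guest interrupt line to raise
 *      };
 *      ioctl(vm_fd, KVM_IRQFD, &data);
 *
 *      uint64_t one = 1;
 *      write(efd, &one, sizeof(one));  // later: injects the interrupt
 *
 * Setting KVM_IRQFD_FLAG_DEASSIGN in .flags instead takes the
 * kvm_irqfd_deassign() path further down in this file.
 */
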
static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
        struct kvm_irq_routing_table *irq_rt;
        struct _irqfd *irqfd, *tmp;
        struct file *file = NULL;
        struct eventfd_ctx *eventfd = NULL;
        int ret;
        unsigned int events;

        irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
        if (!irqfd)
                return -ENOMEM;

        irqfd->kvm = kvm;
        irqfd->gsi = args->gsi;
        INIT_LIST_HEAD(&irqfd->list);
        INIT_WORK(&irqfd->inject, irqfd_inject);
        INIT_WORK(&irqfd->shutdown, irqfd_shutdown);

        file = eventfd_fget(args->fd);
        if (IS_ERR(file)) {
                ret = PTR_ERR(file);
                goto fail;
        }

        eventfd = eventfd_ctx_fileget(file);
        if (IS_ERR(eventfd)) {
                ret = PTR_ERR(eventfd);
                goto fail;
        }

        irqfd->eventfd = eventfd;

        /*
         * Install our own custom wake-up handling so we are notified via
         * a callback whenever someone signals the underlying eventfd
         */
        init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
        init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);

        spin_lock_irq(&kvm->irqfds.lock);

        ret = 0;
        list_for_each_entry(tmp, &kvm->irqfds.items, list) {
                if (irqfd->eventfd != tmp->eventfd)
                        continue;
                /* This fd is used for another irq already. */
                ret = -EBUSY;
                spin_unlock_irq(&kvm->irqfds.lock);
                goto fail;
        }

        irq_rt = rcu_dereference_protected(kvm->irq_routing,
                                           lockdep_is_held(&kvm->irqfds.lock));
        irqfd_update(kvm, irqfd, irq_rt);

        events = file->f_op->poll(file, &irqfd->pt);

        list_add_tail(&irqfd->list, &kvm->irqfds.items);

        /*
         * Check if there was an event already pending on the eventfd
         * before we registered, and trigger it as if we didn't miss it.
         */
        if (events & POLLIN)
                schedule_work(&irqfd->inject);

        spin_unlock_irq(&kvm->irqfds.lock);

        /*
         * do not drop the file until the irqfd is fully initialized, otherwise
         * we might race against the POLLHUP
         */
        fput(file);

        return 0;

fail:
        if (eventfd && !IS_ERR(eventfd))
                eventfd_ctx_put(eventfd);

        if (!IS_ERR(file))
                fput(file);

        kfree(irqfd);
        return ret;
}

void
kvm_eventfd_init(struct kvm *kvm)
{
        spin_lock_init(&kvm->irqfds.lock);
        INIT_LIST_HEAD(&kvm->irqfds.items);
        INIT_LIST_HEAD(&kvm->ioeventfds);
}

/*
 * shut down any irqfds that match fd+gsi
 */
static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
        struct _irqfd *irqfd, *tmp;
        struct eventfd_ctx *eventfd;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
                if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
                        /*
                         * This rcu_assign_pointer is needed for when
                         * another thread calls kvm_irq_routing_update before
                         * we flush the workqueue below (we synchronize with
                         * kvm_irq_routing_update using irqfds.lock).
                         * It is paired with the synchronize_rcu done by the
                         * caller of that function.
                         */
                        rcu_assign_pointer(irqfd->irq_entry, NULL);
                        irqfd_deactivate(irqfd);
                }
        }

        spin_unlock_irq(&kvm->irqfds.lock);
        eventfd_ctx_put(eventfd);

        /*
         * Block until we know all outstanding shutdown jobs have completed
         * so that we guarantee there will not be any more interrupts on this
         * gsi once this deassign function returns.
         */
        flush_workqueue(irqfd_cleanup_wq);

        return 0;
}

int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
        if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN)
                return -EINVAL;

        if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
                return kvm_irqfd_deassign(kvm, args);

        return kvm_irqfd_assign(kvm, args);
}

/*
 * This function is called as the kvm VM fd is being released.  Shut down all
 * irqfds that still remain open.
 */
void
kvm_irqfd_release(struct kvm *kvm)
{
        struct _irqfd *irqfd, *tmp;

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
                irqfd_deactivate(irqfd);

        spin_unlock_irq(&kvm->irqfds.lock);

        /*
         * Block until we know all outstanding shutdown jobs have completed
         * since we do not take a kvm* reference.
         */
        flush_workqueue(irqfd_cleanup_wq);
}

/*
 * Change irq_routing and irqfd.
 * Caller must invoke synchronize_rcu afterwards.
 */
void kvm_irq_routing_update(struct kvm *kvm,
                            struct kvm_irq_routing_table *irq_rt)
{
        struct _irqfd *irqfd;

        spin_lock_irq(&kvm->irqfds.lock);

        rcu_assign_pointer(kvm->irq_routing, irq_rt);

        list_for_each_entry(irqfd, &kvm->irqfds.items, list)
                irqfd_update(kvm, irqfd, irq_rt);

        spin_unlock_irq(&kvm->irqfds.lock);
}

/*
 * create a host-wide workqueue for issuing deferred shutdown requests
 * aggregated from all vm* instances.  We need our own isolated single-thread
 * queue to prevent deadlock against flushing the normal work-queue.
 */
static int __init irqfd_module_init(void)
{
        irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
        if (!irqfd_cleanup_wq)
                return -ENOMEM;

        return 0;
}

static void __exit irqfd_module_exit(void)
{
        destroy_workqueue(irqfd_cleanup_wq);
}

module_init(irqfd_module_init);
module_exit(irqfd_module_exit);

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 *
 * userspace can register a PIO/MMIO address with an eventfd for receiving
 * notification when the memory has been touched.
 * --------------------------------------------------------------------
 */

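/*
 * For orientation, a minimal sketch of how userspace typically registers an
 * ioeventfd, assuming the KVM_IOEVENTFD ioctl and struct kvm_ioeventfd from
 * the uapi headers; the port, length and datamatch values are hypothetical
 * and for illustration only:
 *
 *      int efd = eventfd(0, EFD_CLOEXEC);
 *      struct kvm_ioeventfd data = {
 *              .addr      = 0xc050,    // doorbell address in guest space
 *              .len       = 2,         // must be 1, 2, 4 or 8 bytes
 *              .fd        = efd,
 *              .flags     = KVM_IOEVENTFD_FLAG_PIO |
 *                           KVM_IOEVENTFD_FLAG_DATAMATCH,
 *              .datamatch = 1,         // only writes of the value 1 signal
 *      };
 *      ioctl(vm_fd, KVM_IOEVENTFD, &data);
 *
 * Guest writes that hit the registered range are handled in-kernel by
 * ioeventfd_write() below and surface as reads from efd, without a round
 * trip to the userspace VMM.
 */
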
struct _ioeventfd {
        struct list_head list;
        u64 addr;
        int length;
        struct eventfd_ctx *eventfd;
        u64 datamatch;
        struct kvm_io_device dev;
        bool wildcard;
};

static inline struct _ioeventfd *
to_ioeventfd(struct kvm_io_device *dev)
{
        return container_of(dev, struct _ioeventfd, dev);
}

static void
ioeventfd_release(struct _ioeventfd *p)
{
        eventfd_ctx_put(p->eventfd);
        list_del(&p->list);
        kfree(p);
}

static bool
ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
{
        u64 _val;

        if (!(addr == p->addr && len == p->length))
                /* address-range must be precise for a hit */
                return false;

        if (p->wildcard)
                /* all else equal, wildcard is always a hit */
                return true;

        /* otherwise, we have to actually compare the data */

        BUG_ON(!IS_ALIGNED((unsigned long)val, len));

        switch (len) {
        case 1:
                _val = *(u8 *)val;
                break;
        case 2:
                _val = *(u16 *)val;
                break;
        case 4:
                _val = *(u32 *)val;
                break;
        case 8:
                _val = *(u64 *)val;
                break;
        default:
                return false;
        }

        return _val == p->datamatch ? true : false;
}

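/*
 * Matching example: a wildcard ioeventfd registered at addr 0x1000 with
 * length 2 fires on any 2-byte write to 0x1000, whereas the same
 * registration made with KVM_IOEVENTFD_FLAG_DATAMATCH and datamatch == 3
 * fires only when the guest writes the value 3 (the typical per-queue
 * doorbell pattern).  The address and values are purely illustrative.
 */
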
/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
                const void *val)
{
        struct _ioeventfd *p = to_ioeventfd(this);

        if (!ioeventfd_in_range(p, addr, len, val))
                return -EOPNOTSUPP;

        eventfd_signal(p->eventfd, 1);
        return 0;
}

/*
 * This function is called as KVM is completely shutting down.  We do not
 * need to worry about locking; just nuke anything we have as quickly as
 * possible.
 */
static void
ioeventfd_destructor(struct kvm_io_device *this)
{
        struct _ioeventfd *p = to_ioeventfd(this);

        ioeventfd_release(p);
}

static const struct kvm_io_device_ops ioeventfd_ops = {
        .write      = ioeventfd_write,
        .destructor = ioeventfd_destructor,
};

/* assumes kvm->slots_lock held */
static bool
ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
{
        struct _ioeventfd *_p;

        list_for_each_entry(_p, &kvm->ioeventfds, list)
                if (_p->addr == p->addr && _p->length == p->length &&
                    (_p->wildcard || p->wildcard ||
                     _p->datamatch == p->datamatch))
                        return true;

        return false;
}

static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
        enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
        struct _ioeventfd *p;
        struct eventfd_ctx *eventfd;
        int ret;

        /* must be natural-word sized */
        switch (args->len) {
        case 1:
        case 2:
        case 4:
        case 8:
                break;
        default:
                return -EINVAL;
        }

        /* check for range overflow */
        if (args->addr + args->len < args->addr)
                return -EINVAL;

        /* check for extra flags that we don't understand */
        if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
                return -EINVAL;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        p = kzalloc(sizeof(*p), GFP_KERNEL);
        if (!p) {
                ret = -ENOMEM;
                goto fail;
        }

        INIT_LIST_HEAD(&p->list);
        p->addr = args->addr;
        p->length = args->len;
        p->eventfd = eventfd;

        /* The datamatch feature is optional, otherwise this is a wildcard */
        if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
                p->datamatch = args->datamatch;
        else
                p->wildcard = true;

        mutex_lock(&kvm->slots_lock);

        /* Verify that there isn't a match already */
        if (ioeventfd_check_collision(kvm, p)) {
                ret = -EEXIST;
                goto unlock_fail;
        }

        kvm_iodevice_init(&p->dev, &ioeventfd_ops);

        ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
                                      &p->dev);
        if (ret < 0)
                goto unlock_fail;

        list_add_tail(&p->list, &kvm->ioeventfds);

        mutex_unlock(&kvm->slots_lock);

        return 0;

unlock_fail:
        mutex_unlock(&kvm->slots_lock);

fail:
        kfree(p);
        eventfd_ctx_put(eventfd);

        return ret;
}

static int
kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
        enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
        struct _ioeventfd *p, *tmp;
        struct eventfd_ctx *eventfd;
        int ret = -ENOENT;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        mutex_lock(&kvm->slots_lock);

        list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
                bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);

                if (p->eventfd != eventfd ||
                    p->addr != args->addr ||
                    p->length != args->len ||
                    p->wildcard != wildcard)
                        continue;

                if (!p->wildcard && p->datamatch != args->datamatch)
                        continue;

                kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
                ioeventfd_release(p);
                ret = 0;
                break;
        }

        mutex_unlock(&kvm->slots_lock);

        eventfd_ctx_put(eventfd);

        return ret;
}

int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
                return kvm_deassign_ioeventfd(kvm, args);

        return kvm_assign_ioeventfd(kvm, args);
}