// SPDX-License-Identifier: GPL-2.0-only
/*
 * kvm eventfd support - use eventfd objects to signal various KVM events
 *
 * Copyright 2009 Novell.  All Rights Reserved.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Author:
 *	Gregory Haskins <ghaskins@novell.com>
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/seqlock.h>
#include <linux/irqbypass.h>
#include <trace/events/kvm.h>

#include <kvm/iodev.h>

#ifdef CONFIG_HAVE_KVM_IRQFD

static struct workqueue_struct *irqfd_cleanup_wq;

bool __attribute__((weak))
kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
{
	return true;
}

static void
irqfd_inject(struct work_struct *work)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(work, struct kvm_kernel_irqfd, inject);
	struct kvm *kvm = irqfd->kvm;

	if (!irqfd->resampler) {
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1,
			    false);
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0,
			    false);
	} else
		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
			    irqfd->gsi, 1, false);
}

/*
 * Since resampler irqfds share an IRQ source ID, we de-assert once
 * then notify all of the resampler irqfds using this GSI.  We can't
 * do multiple de-asserts or we risk racing with incoming re-asserts.
 */
static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
	struct kvm_kernel_irqfd_resampler *resampler;
	struct kvm *kvm;
	struct kvm_kernel_irqfd *irqfd;
	int idx;

	resampler = container_of(kian,
				 struct kvm_kernel_irqfd_resampler, notifier);
	kvm = resampler->kvm;

	kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
		    resampler->notifier.gsi, 0, false);

	idx = srcu_read_lock(&kvm->irq_srcu);

	list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
				 srcu_read_lock_held(&kvm->irq_srcu))
		eventfd_signal(irqfd->resamplefd, 1);

	srcu_read_unlock(&kvm->irq_srcu, idx);
}

static void
irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
	struct kvm *kvm = resampler->kvm;

	mutex_lock(&kvm->irqfds.resampler_lock);

	list_del_rcu(&irqfd->resampler_link);
	synchronize_srcu(&kvm->irq_srcu);

	if (list_empty(&resampler->list)) {
		list_del(&resampler->link);
		kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
			    resampler->notifier.gsi, 0, false);
		kfree(resampler);
	}

	mutex_unlock(&kvm->irqfds.resampler_lock);
}

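/*
 * Illustrative userspace-side sketch (not part of this file's build) of how
 * the resampler path above is typically driven: the VMM asserts a
 * level-triggered interrupt by signaling the irqfd, and re-checks its
 * emulated device when KVM signals the resamplefd after the guest EOI.
 * The fd names, the GSI value and the device helper are hypothetical;
 * error handling is omitted.
 *
 *	#include <sys/eventfd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/kvm.h>
 *	#include <stdint.h>
 *	#include <unistd.h>
 *
 *	static void level_irq_loop(int vm_fd, int gsi)
 *	{
 *		int irq_fd = eventfd(0, EFD_CLOEXEC);
 *		int resample_fd = eventfd(0, EFD_CLOEXEC);
 *		struct kvm_irqfd assign = {
 *			.fd         = irq_fd,
 *			.gsi        = gsi,
 *			.flags      = KVM_IRQFD_FLAG_RESAMPLE,
 *			.resamplefd = resample_fd,
 *		};
 *		uint64_t one = 1, seen;
 *
 *		ioctl(vm_fd, KVM_IRQFD, &assign);
 *
 *		// Device model raised its interrupt line: assert the GSI.
 *		write(irq_fd, &one, sizeof(one));
 *
 *		for (;;) {
 *			// KVM de-asserted the line on guest EOI.
 *			read(resample_fd, &seen, sizeof(seen));
 *			if (!device_irq_still_pending())  // hypothetical helper
 *				break;
 *			// Condition persists: re-assert the level.
 *			write(irq_fd, &one, sizeof(one));
 *		}
 *	}
 */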

/*
 * Race-free decouple logic (ordering is critical)
 */
static void
irqfd_shutdown(struct work_struct *work)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(work, struct kvm_kernel_irqfd, shutdown);
	struct kvm *kvm = irqfd->kvm;
	u64 cnt;

	/* Make sure irqfd has been initialized in assign path. */
	synchronize_srcu(&kvm->irq_srcu);

	/*
	 * Synchronize with the wait-queue and unhook ourselves to prevent
	 * further events.
	 */
	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);

	/*
	 * We know no new events will be scheduled at this point, so block
	 * until all previously outstanding events have completed.
	 */
	flush_work(&irqfd->inject);

	if (irqfd->resampler) {
		irqfd_resampler_shutdown(irqfd);
		eventfd_ctx_put(irqfd->resamplefd);
	}

	/*
	 * It is now safe to release the object's resources.
	 */
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
	irq_bypass_unregister_consumer(&irqfd->consumer);
#endif
	eventfd_ctx_put(irqfd->eventfd);
	kfree(irqfd);
}


/* assumes kvm->irqfds.lock is held */
static bool
irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
{
	return list_empty(&irqfd->list) ? false : true;
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 *
 * assumes kvm->irqfds.lock is held
 */
static void
irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
{
	BUG_ON(!irqfd_is_active(irqfd));

	list_del_init(&irqfd->list);

	queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}

int __attribute__((weak)) kvm_arch_set_irq_inatomic(
				struct kvm_kernel_irq_routing_entry *irq,
				struct kvm *kvm, int irq_source_id,
				int level,
				bool line_status)
{
	return -EWOULDBLOCK;
}

/*
 * Called with wqh->lock held and interrupts disabled
 */
static int
irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(wait, struct kvm_kernel_irqfd, wait);
	__poll_t flags = key_to_poll(key);
	struct kvm_kernel_irq_routing_entry irq;
	struct kvm *kvm = irqfd->kvm;
	unsigned seq;
	int idx;
	int ret = 0;

	if (flags & EPOLLIN) {
		u64 cnt;
		eventfd_ctx_do_read(irqfd->eventfd, &cnt);

		idx = srcu_read_lock(&kvm->irq_srcu);
		do {
			seq = read_seqcount_begin(&irqfd->irq_entry_sc);
			irq = irqfd->irq_entry;
		} while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
		/* An event has been signaled, inject an interrupt */
		if (kvm_arch_set_irq_inatomic(&irq, kvm,
					      KVM_USERSPACE_IRQ_SOURCE_ID, 1,
					      false) == -EWOULDBLOCK)
			schedule_work(&irqfd->inject);
		srcu_read_unlock(&kvm->irq_srcu, idx);
		ret = 1;
	}

	if (flags & EPOLLHUP) {
		/* The eventfd is closing, detach from KVM */
		unsigned long iflags;

		spin_lock_irqsave(&kvm->irqfds.lock, iflags);

		/*
		 * We must check if someone deactivated the irqfd before
		 * we could acquire the irqfds.lock since the item is
		 * deactivated from the KVM side before it is unhooked from
		 * the wait-queue.  If it is already deactivated, we can
		 * simply return knowing the other side will clean up for us.
		 * We cannot race against the irqfd going away since the
		 * other side is required to acquire wqh->lock, which we hold.
		 */
		if (irqfd_is_active(irqfd))
			irqfd_deactivate(irqfd);

		spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
	}

	return ret;
}

static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
			poll_table *pt)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(pt, struct kvm_kernel_irqfd, pt);
	add_wait_queue_priority(wqh, &irqfd->wait);
}

/* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_kernel_irq_routing_entry *e;
	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
	int n_entries;

	n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);

	write_seqcount_begin(&irqfd->irq_entry_sc);

	e = entries;
	if (n_entries == 1)
		irqfd->irq_entry = *e;
	else
		irqfd->irq_entry.type = 0;

	write_seqcount_end(&irqfd->irq_entry_sc);
}

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
void __attribute__((weak)) kvm_arch_irq_bypass_stop(
				struct irq_bypass_consumer *cons)
{
}

void __attribute__((weak)) kvm_arch_irq_bypass_start(
				struct irq_bypass_consumer *cons)
{
}

int __attribute__((weak)) kvm_arch_update_irqfd_routing(
				struct kvm *kvm, unsigned int host_irq,
				uint32_t guest_irq, bool set)
{
	return 0;
}

bool __attribute__((weak)) kvm_arch_irqfd_route_changed(
				struct kvm_kernel_irq_routing_entry *old,
				struct kvm_kernel_irq_routing_entry *new)
{
	return true;
}
#endif

static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;
	struct fd f;
	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
	int ret;
	__poll_t events;
	int idx;

	if (!kvm_arch_intc_initialized(kvm))
		return -EAGAIN;

	if (!kvm_arch_irqfd_allowed(kvm, args))
		return -EINVAL;

	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
	if (!irqfd)
		return -ENOMEM;

	irqfd->kvm = kvm;
	irqfd->gsi = args->gsi;
	INIT_LIST_HEAD(&irqfd->list);
	INIT_WORK(&irqfd->inject, irqfd_inject);
	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
	seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);

	f = fdget(args->fd);
	if (!f.file) {
		ret = -EBADF;
		goto out;
	}

	eventfd = eventfd_ctx_fileget(f.file);
	if (IS_ERR(eventfd)) {
		ret = PTR_ERR(eventfd);
		goto fail;
	}

	irqfd->eventfd = eventfd;

	if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
		struct kvm_kernel_irqfd_resampler *resampler;

		resamplefd = eventfd_ctx_fdget(args->resamplefd);
		if (IS_ERR(resamplefd)) {
			ret = PTR_ERR(resamplefd);
			goto fail;
		}

		irqfd->resamplefd = resamplefd;
		INIT_LIST_HEAD(&irqfd->resampler_link);

		mutex_lock(&kvm->irqfds.resampler_lock);

		list_for_each_entry(resampler,
				    &kvm->irqfds.resampler_list, link) {
			if (resampler->notifier.gsi == irqfd->gsi) {
				irqfd->resampler = resampler;
				break;
			}
		}

		if (!irqfd->resampler) {
			resampler = kzalloc(sizeof(*resampler),
					    GFP_KERNEL_ACCOUNT);
			if (!resampler) {
				ret = -ENOMEM;
				mutex_unlock(&kvm->irqfds.resampler_lock);
				goto fail;
			}

			resampler->kvm = kvm;
			INIT_LIST_HEAD(&resampler->list);
			resampler->notifier.gsi = irqfd->gsi;
			resampler->notifier.irq_acked = irqfd_resampler_ack;
			INIT_LIST_HEAD(&resampler->link);

			list_add(&resampler->link, &kvm->irqfds.resampler_list);
			kvm_register_irq_ack_notifier(kvm,
						      &resampler->notifier);
			irqfd->resampler = resampler;
		}

		list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
		synchronize_srcu(&kvm->irq_srcu);

		mutex_unlock(&kvm->irqfds.resampler_lock);
	}

	/*
	 * Install our own custom wake-up handling so we are notified via
	 * a callback whenever someone signals the underlying eventfd
	 */
	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);

	spin_lock_irq(&kvm->irqfds.lock);

	ret = 0;
	list_for_each_entry(tmp, &kvm->irqfds.items, list) {
		if (irqfd->eventfd != tmp->eventfd)
			continue;
		/* This fd is used for another irq already. */
		ret = -EBUSY;
		spin_unlock_irq(&kvm->irqfds.lock);
		goto fail;
	}

	idx = srcu_read_lock(&kvm->irq_srcu);
	irqfd_update(kvm, irqfd);

	list_add_tail(&irqfd->list, &kvm->irqfds.items);

	spin_unlock_irq(&kvm->irqfds.lock);

	/*
	 * Check if there was an event already pending on the eventfd
	 * before we registered, and trigger it as if we didn't miss it.
	 */
	events = vfs_poll(f.file, &irqfd->pt);

	if (events & EPOLLIN)
		schedule_work(&irqfd->inject);

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
	if (kvm_arch_has_irq_bypass()) {
		irqfd->consumer.token = (void *)irqfd->eventfd;
		irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
		irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
		irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
		irqfd->consumer.start = kvm_arch_irq_bypass_start;
		ret = irq_bypass_register_consumer(&irqfd->consumer);
		if (ret)
			pr_info("irq bypass consumer (token %p) registration fails: %d\n",
				irqfd->consumer.token, ret);
	}
#endif

	srcu_read_unlock(&kvm->irq_srcu, idx);

	/*
	 * do not drop the file until the irqfd is fully initialized, otherwise
	 * we might race against the EPOLLHUP
	 */
	fdput(f);
	return 0;

fail:
	if (irqfd->resampler)
		irqfd_resampler_shutdown(irqfd);

	if (resamplefd && !IS_ERR(resamplefd))
		eventfd_ctx_put(resamplefd);

	if (eventfd && !IS_ERR(eventfd))
		eventfd_ctx_put(eventfd);

	fdput(f);

out:
	kfree(irqfd);
	return ret;
}

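/*
 * Illustrative userspace-side sketch (not part of this file's build) of the
 * simple edge-triggered case handled by kvm_irqfd_assign()/kvm_irqfd_deassign():
 * userspace hands KVM an eventfd and a GSI, and every subsequent write to the
 * eventfd injects that interrupt without bouncing through the VMM.  The fd
 * name and the GSI value are hypothetical; error handling is omitted.
 *
 *	#include <sys/eventfd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/kvm.h>
 *	#include <stdint.h>
 *	#include <unistd.h>
 *
 *	static void edge_irqfd_example(int vm_fd)
 *	{
 *		int irq_fd = eventfd(0, EFD_CLOEXEC);
 *		struct kvm_irqfd assign = {
 *			.fd  = irq_fd,
 *			.gsi = 5,			// hypothetical GSI
 *		};
 *		uint64_t one = 1;
 *
 *		ioctl(vm_fd, KVM_IRQFD, &assign);	// reaches kvm_irqfd_assign()
 *
 *		// Each write signals the eventfd; irqfd_wakeup() injects the GSI.
 *		write(irq_fd, &one, sizeof(one));
 *
 *		// Tear the binding down again (reaches kvm_irqfd_deassign()).
 *		assign.flags = KVM_IRQFD_FLAG_DEASSIGN;
 *		ioctl(vm_fd, KVM_IRQFD, &assign);
 *		close(irq_fd);
 *	}
 */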

bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
	struct kvm_irq_ack_notifier *kian;
	int gsi, idx;

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1)
		hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
					  link, srcu_read_lock_held(&kvm->irq_srcu))
			if (kian->gsi == gsi) {
				srcu_read_unlock(&kvm->irq_srcu, idx);
				return true;
			}

	srcu_read_unlock(&kvm->irq_srcu, idx);

	return false;
}
EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);

void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
{
	struct kvm_irq_ack_notifier *kian;

	hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
				  link, srcu_read_lock_held(&kvm->irq_srcu))
		if (kian->gsi == gsi)
			kian->irq_acked(kian);
}

void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
	int gsi, idx;

	trace_kvm_ack_irq(irqchip, pin);

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1)
		kvm_notify_acked_gsi(kvm, gsi);
	srcu_read_unlock(&kvm->irq_srcu, idx);
}

void kvm_register_irq_ack_notifier(struct kvm *kvm,
				   struct kvm_irq_ack_notifier *kian)
{
	mutex_lock(&kvm->irq_lock);
	hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
	mutex_unlock(&kvm->irq_lock);
	kvm_arch_post_irq_ack_notifier_list_update(kvm);
}

void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
				     struct kvm_irq_ack_notifier *kian)
{
	mutex_lock(&kvm->irq_lock);
	hlist_del_init_rcu(&kian->link);
	mutex_unlock(&kvm->irq_lock);
	synchronize_srcu(&kvm->irq_srcu);
	kvm_arch_post_irq_ack_notifier_list_update(kvm);
}
#endif

void
kvm_eventfd_init(struct kvm *kvm)
{
#ifdef CONFIG_HAVE_KVM_IRQFD
	spin_lock_init(&kvm->irqfds.lock);
	INIT_LIST_HEAD(&kvm->irqfds.items);
	INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
	mutex_init(&kvm->irqfds.resampler_lock);
#endif
	INIT_LIST_HEAD(&kvm->ioeventfds);
}

#ifdef CONFIG_HAVE_KVM_IRQFD
/*
 * Shut down any irqfds that match fd+gsi.
 */
static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;
	struct eventfd_ctx *eventfd;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
		if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
			/*
			 * This clearing of irq_entry.type is needed for when
			 * another thread calls kvm_irq_routing_update before
			 * we flush workqueue below (we synchronize with
			 * kvm_irq_routing_update using irqfds.lock).
			 */
			write_seqcount_begin(&irqfd->irq_entry_sc);
			irqfd->irq_entry.type = 0;
			write_seqcount_end(&irqfd->irq_entry_sc);
			irqfd_deactivate(irqfd);
		}
	}

	spin_unlock_irq(&kvm->irqfds.lock);
	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * so that we guarantee there will not be any more interrupts on this
	 * gsi once this deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}

int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
	if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
		return -EINVAL;

	if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
		return kvm_irqfd_deassign(kvm, args);

	return kvm_irqfd_assign(kvm, args);
}

/*
 * This function is called as the kvm VM fd is being released.  Shut down all
 * irqfds that still remain open.
 */
void
kvm_irqfd_release(struct kvm *kvm)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
		irqfd_deactivate(irqfd);

	spin_unlock_irq(&kvm->irqfds.lock);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * since we do not take a kvm* reference.
	 */
	flush_workqueue(irqfd_cleanup_wq);

}

/*
 * Take note of a change in irq routing.
 * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
 */
void kvm_irq_routing_update(struct kvm *kvm)
{
	struct kvm_kernel_irqfd *irqfd;

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
		/* Under irqfds.lock, so can read irq_entry safely */
		struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry;
#endif

		irqfd_update(kvm, irqfd);

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
		if (irqfd->producer &&
		    kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
			int ret = kvm_arch_update_irqfd_routing(
					irqfd->kvm, irqfd->producer->irq,
					irqfd->gsi, 1);
			WARN_ON(ret);
		}
#endif
	}

	spin_unlock_irq(&kvm->irqfds.lock);
}

/*
 * create a host-wide workqueue for issuing deferred shutdown requests
 * aggregated from all vm* instances. We need our own isolated
 * queue to ease flushing work items when a VM exits.
 */
int kvm_irqfd_init(void)
{
	irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0);
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

void kvm_irqfd_exit(void)
{
	destroy_workqueue(irqfd_cleanup_wq);
}
#endif

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 *
 * userspace can register a PIO/MMIO address with an eventfd for receiving
 * notification when the memory has been touched.
 * --------------------------------------------------------------------
 */

struct _ioeventfd {
	struct list_head list;
	u64 addr;
	int length;
	struct eventfd_ctx *eventfd;
	u64 datamatch;
	struct kvm_io_device dev;
	u8 bus_idx;
	bool wildcard;
};

static inline struct _ioeventfd *
to_ioeventfd(struct kvm_io_device *dev)
{
	return container_of(dev, struct _ioeventfd, dev);
}

static void
ioeventfd_release(struct _ioeventfd *p)
{
	eventfd_ctx_put(p->eventfd);
	list_del(&p->list);
	kfree(p);
}

static bool
ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
{
	u64 _val;

	if (addr != p->addr)
		/* address must be precise for a hit */
		return false;

	if (!p->length)
		/* length = 0 means only look at the address, so always a hit */
		return true;

	if (len != p->length)
		/* address-range must be precise for a hit */
		return false;

	if (p->wildcard)
		/* all else equal, wildcard is always a hit */
		return true;

	/* otherwise, we have to actually compare the data */

	BUG_ON(!IS_ALIGNED((unsigned long)val, len));

	switch (len) {
	case 1:
		_val = *(u8 *)val;
		break;
	case 2:
		_val = *(u16 *)val;
		break;
	case 4:
		_val = *(u32 *)val;
		break;
	case 8:
		_val = *(u64 *)val;
		break;
	default:
		return false;
	}

	return _val == p->datamatch;
}

/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
		int len, const void *val)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	if (!ioeventfd_in_range(p, addr, len, val))
		return -EOPNOTSUPP;

	eventfd_signal(p->eventfd, 1);
	return 0;
}

/*
 * This function is called as KVM is completely shutting down.  We do not
We do not 762 * need to worry about locking just nuke anything we have as quickly as possible 763 */ 764 static void 765 ioeventfd_destructor(struct kvm_io_device *this) 766 { 767 struct _ioeventfd *p = to_ioeventfd(this); 768 769 ioeventfd_release(p); 770 } 771 772 static const struct kvm_io_device_ops ioeventfd_ops = { 773 .write = ioeventfd_write, 774 .destructor = ioeventfd_destructor, 775 }; 776 777 /* assumes kvm->slots_lock held */ 778 static bool 779 ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p) 780 { 781 struct _ioeventfd *_p; 782 783 list_for_each_entry(_p, &kvm->ioeventfds, list) 784 if (_p->bus_idx == p->bus_idx && 785 _p->addr == p->addr && 786 (!_p->length || !p->length || 787 (_p->length == p->length && 788 (_p->wildcard || p->wildcard || 789 _p->datamatch == p->datamatch)))) 790 return true; 791 792 return false; 793 } 794 795 static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags) 796 { 797 if (flags & KVM_IOEVENTFD_FLAG_PIO) 798 return KVM_PIO_BUS; 799 if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY) 800 return KVM_VIRTIO_CCW_NOTIFY_BUS; 801 return KVM_MMIO_BUS; 802 } 803 804 static int kvm_assign_ioeventfd_idx(struct kvm *kvm, 805 enum kvm_bus bus_idx, 806 struct kvm_ioeventfd *args) 807 { 808 809 struct eventfd_ctx *eventfd; 810 struct _ioeventfd *p; 811 int ret; 812 813 eventfd = eventfd_ctx_fdget(args->fd); 814 if (IS_ERR(eventfd)) 815 return PTR_ERR(eventfd); 816 817 p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT); 818 if (!p) { 819 ret = -ENOMEM; 820 goto fail; 821 } 822 823 INIT_LIST_HEAD(&p->list); 824 p->addr = args->addr; 825 p->bus_idx = bus_idx; 826 p->length = args->len; 827 p->eventfd = eventfd; 828 829 /* The datamatch feature is optional, otherwise this is a wildcard */ 830 if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH) 831 p->datamatch = args->datamatch; 832 else 833 p->wildcard = true; 834 835 mutex_lock(&kvm->slots_lock); 836 837 /* Verify that there isn't a match already */ 838 if (ioeventfd_check_collision(kvm, p)) { 839 ret = -EEXIST; 840 goto unlock_fail; 841 } 842 843 kvm_iodevice_init(&p->dev, &ioeventfd_ops); 844 845 ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length, 846 &p->dev); 847 if (ret < 0) 848 goto unlock_fail; 849 850 kvm_get_bus(kvm, bus_idx)->ioeventfd_count++; 851 list_add_tail(&p->list, &kvm->ioeventfds); 852 853 mutex_unlock(&kvm->slots_lock); 854 855 return 0; 856 857 unlock_fail: 858 mutex_unlock(&kvm->slots_lock); 859 860 fail: 861 kfree(p); 862 eventfd_ctx_put(eventfd); 863 864 return ret; 865 } 866 867 static int 868 kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, 869 struct kvm_ioeventfd *args) 870 { 871 struct _ioeventfd *p, *tmp; 872 struct eventfd_ctx *eventfd; 873 struct kvm_io_bus *bus; 874 int ret = -ENOENT; 875 bool wildcard; 876 877 eventfd = eventfd_ctx_fdget(args->fd); 878 if (IS_ERR(eventfd)) 879 return PTR_ERR(eventfd); 880 881 wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); 882 883 mutex_lock(&kvm->slots_lock); 884 885 list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { 886 887 if (p->bus_idx != bus_idx || 888 p->eventfd != eventfd || 889 p->addr != args->addr || 890 p->length != args->len || 891 p->wildcard != wildcard) 892 continue; 893 894 if (!p->wildcard && p->datamatch != args->datamatch) 895 continue; 896 897 kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); 898 bus = kvm_get_bus(kvm, bus_idx); 899 if (bus) 900 bus->ioeventfd_count--; 901 ioeventfd_release(p); 902 ret = 0; 903 break; 904 } 905 906 mutex_unlock(&kvm->slots_lock); 907 908 
	eventfd_ctx_put(eventfd);

	return ret;
}

static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	enum kvm_bus bus_idx = ioeventfd_bus_from_flags(args->flags);
	int ret = kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);

	if (!args->len && bus_idx == KVM_MMIO_BUS)
		kvm_deassign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);

	return ret;
}

static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	enum kvm_bus bus_idx;
	int ret;

	bus_idx = ioeventfd_bus_from_flags(args->flags);
	/* must be natural-word sized, or 0 to ignore length */
	switch (args->len) {
	case 0:
	case 1:
	case 2:
	case 4:
	case 8:
		break;
	default:
		return -EINVAL;
	}

	/* check for range overflow */
	if (args->addr + args->len < args->addr)
		return -EINVAL;

	/* check for extra flags that we don't understand */
	if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
		return -EINVAL;

	/* ioeventfd with no length can't be combined with DATAMATCH */
	if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
		return -EINVAL;

	ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
	if (ret)
		goto fail;

	/* When length is ignored, MMIO is also put on a separate bus, for
	 * faster lookups.
	 */
	if (!args->len && bus_idx == KVM_MMIO_BUS) {
		ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
		if (ret < 0)
			goto fast_fail;
	}

	return 0;

fast_fail:
	kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);
fail:
	return ret;
}

int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
		return kvm_deassign_ioeventfd(kvm, args);

	return kvm_assign_ioeventfd(kvm, args);
}

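/*
 * Illustrative userspace-side sketch (not part of this file's build) of the
 * ioeventfd path above: the VMM registers an MMIO doorbell address with a
 * datamatch value, and a worker thread consumes the eventfd instead of the
 * VMM having to decode every guest write itself.  The address, length,
 * datamatch value and function name are hypothetical; error handling is
 * omitted.
 *
 *	#include <sys/eventfd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/kvm.h>
 *	#include <stdint.h>
 *	#include <unistd.h>
 *
 *	static int register_doorbell(int vm_fd)
 *	{
 *		int notify_fd = eventfd(0, EFD_CLOEXEC);
 *		struct kvm_ioeventfd io = {
 *			.addr      = 0xfe003000,	// hypothetical MMIO doorbell
 *			.len       = 4,			// guest writes 4 bytes
 *			.fd        = notify_fd,
 *			.flags     = KVM_IOEVENTFD_FLAG_DATAMATCH,
 *			.datamatch = 0x1,		// only this value signals
 *		};
 *
 *		ioctl(vm_fd, KVM_IOEVENTFD, &io);	// reaches kvm_assign_ioeventfd()
 *		return notify_fd;			// poll/read this in a worker thread
 *	}
 *
 * Passing .len = 0 (without DATAMATCH) matches any write to the address and,
 * for MMIO, additionally registers the device on KVM_FAST_MMIO_BUS as done in
 * kvm_assign_ioeventfd() above.
 */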