// SPDX-License-Identifier: GPL-2.0-only
/*
 * kvm eventfd support - use eventfd objects to signal various KVM events
 *
 * Copyright 2009 Novell. All Rights Reserved.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Author:
 *	Gregory Haskins <ghaskins@novell.com>
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/seqlock.h>
#include <linux/irqbypass.h>
#include <trace/events/kvm.h>

#include <kvm/iodev.h>

#ifdef CONFIG_HAVE_KVM_IRQCHIP

/*
 * Host-wide workqueue for deferred irqfd shutdown, allocated in
 * kvm_irqfd_init().  Isolated so a VM exit can flush only its own
 * outstanding shutdown work (see kvm_irqfd_release()).
 */
static struct workqueue_struct *irqfd_cleanup_wq;

/* Weak default: architectures may override to veto specific irqfd args. */
bool __attribute__((weak))
kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
{
	return true;
}

/*
 * Work handler that injects the interrupt for an irqfd.
 *
 * A non-resampling irqfd emulates an edge: assert then immediately
 * de-assert the GSI.  A resampling irqfd only asserts; the de-assert
 * happens later from irqfd_resampler_ack() when the guest acks.
 */
static void
irqfd_inject(struct work_struct *work)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(work, struct kvm_kernel_irqfd, inject);
	struct kvm *kvm = irqfd->kvm;

	if (!irqfd->resampler) {
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1,
				false);
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0,
				false);
	} else
		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
			    irqfd->gsi, 1, false);
}

/*
 * Signal the resample eventfd of every irqfd attached to @resampler.
 * Runs under the kvm->irq_srcu read side, as asserted by the
 * srcu_read_lock_held() cookie passed to the list walker.
 */
static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler)
{
	struct kvm_kernel_irqfd *irqfd;

	list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
				 srcu_read_lock_held(&resampler->kvm->irq_srcu))
		eventfd_signal(irqfd->resamplefd);
}

/*
 * Since resampler irqfds share an IRQ source ID, we de-assert once
 * then notify all of the resampler irqfds using this GSI. We can't
 * do multiple de-asserts or we risk racing with incoming re-asserts.
 */
static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
	struct kvm_kernel_irqfd_resampler *resampler;
	struct kvm *kvm;
	int idx;

	resampler = container_of(kian,
			struct kvm_kernel_irqfd_resampler, notifier);
	kvm = resampler->kvm;

	/* One shared de-assert for the whole GSI (see comment above)... */
	kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
		    resampler->notifier.gsi, 0, false);

	/* ...then signal every attached resample eventfd. */
	idx = srcu_read_lock(&kvm->irq_srcu);
	irqfd_resampler_notify(resampler);
	srcu_read_unlock(&kvm->irq_srcu, idx);
}

/*
 * Detach @irqfd from its resampler.  The last irqfd leaving a resampler
 * also unregisters the ack notifier, de-asserts the line one final time
 * and frees the resampler itself.
 */
static void
irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
	struct kvm *kvm = resampler->kvm;

	mutex_lock(&kvm->irqfds.resampler_lock);

	list_del_rcu(&irqfd->resampler_link);

	if (list_empty(&resampler->list)) {
		list_del_rcu(&resampler->link);
		kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
		/*
		 * synchronize_srcu_expedited(&kvm->irq_srcu) already called
		 * in kvm_unregister_irq_ack_notifier().
		 */
		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
			    resampler->notifier.gsi, 0, false);
		kfree(resampler);
	} else {
		/* Wait out SRCU readers still walking resampler->list. */
		synchronize_srcu_expedited(&kvm->irq_srcu);
	}

	mutex_unlock(&kvm->irqfds.resampler_lock);
}

/*
 * Race-free decouple logic (ordering is critical)
 */
static void
irqfd_shutdown(struct work_struct *work)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(work, struct kvm_kernel_irqfd, shutdown);
	struct kvm *kvm = irqfd->kvm;
	u64 cnt;

	/* Make sure irqfd has been initialized in assign path. */
	synchronize_srcu_expedited(&kvm->irq_srcu);

	/*
	 * Synchronize with the wait-queue and unhook ourselves to prevent
	 * further events.
	 */
	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);

	/*
	 * We know no new events will be scheduled at this point, so block
	 * until all previously outstanding events have completed
	 */
	flush_work(&irqfd->inject);

	if (irqfd->resampler) {
		irqfd_resampler_shutdown(irqfd);
		eventfd_ctx_put(irqfd->resamplefd);
	}

	/*
	 * It is now safe to release the object's resources
	 */
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
	irq_bypass_unregister_consumer(&irqfd->consumer);
#endif
	eventfd_ctx_put(irqfd->eventfd);
	kfree(irqfd);
}


/* assumes kvm->irqfds.lock is held */
static bool
irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
{
	return list_empty(&irqfd->list) ? false : true;
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 *
 * assumes kvm->irqfds.lock is held
 */
static void
irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
{
	BUG_ON(!irqfd_is_active(irqfd));

	list_del_init(&irqfd->list);

	/* Actual teardown runs later from the dedicated cleanup workqueue. */
	queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}

/* Weak default: no atomic fast path, push injection to process context. */
int __attribute__((weak)) kvm_arch_set_irq_inatomic(
				struct kvm_kernel_irq_routing_entry *irq,
				struct kvm *kvm, int irq_source_id,
				int level,
				bool line_status)
{
	return -EWOULDBLOCK;
}

/*
 * Called with wqh->lock held and interrupts disabled
 */
static int
irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(wait, struct kvm_kernel_irqfd, wait);
	__poll_t flags = key_to_poll(key);
	struct kvm_kernel_irq_routing_entry irq;
	struct kvm *kvm = irqfd->kvm;
	unsigned seq;
	int idx;
	int ret = 0;

	if (flags & EPOLLIN) {
		u64 cnt;
		eventfd_ctx_do_read(irqfd->eventfd, &cnt);

		idx = srcu_read_lock(&kvm->irq_srcu);
		do {
			/* Seqcount-safe snapshot of the cached routing entry. */
			seq = read_seqcount_begin(&irqfd->irq_entry_sc);
			irq
			    = irqfd->irq_entry;
		} while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
		/* An event has been signaled, inject an interrupt */
		if (kvm_arch_set_irq_inatomic(&irq, kvm,
					      KVM_USERSPACE_IRQ_SOURCE_ID, 1,
					      false) == -EWOULDBLOCK)
			schedule_work(&irqfd->inject);
		srcu_read_unlock(&kvm->irq_srcu, idx);
		ret = 1;
	}

	if (flags & EPOLLHUP) {
		/* The eventfd is closing, detach from KVM */
		unsigned long iflags;

		spin_lock_irqsave(&kvm->irqfds.lock, iflags);

		/*
		 * We must check if someone deactivated the irqfd before
		 * we could acquire the irqfds.lock since the item is
		 * deactivated from the KVM side before it is unhooked from
		 * the wait-queue.  If it is already deactivated, we can
		 * simply return knowing the other side will cleanup for us.
		 * We cannot race against the irqfd going away since the
		 * other side is required to acquire wqh->lock, which we hold
		 */
		if (irqfd_is_active(irqfd))
			irqfd_deactivate(irqfd);

		spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
	}

	return ret;
}

/* poll_table callback: hook the irqfd's wait entry onto the eventfd's wqh. */
static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
			poll_table *pt)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(pt, struct kvm_kernel_irqfd, pt);
	add_wait_queue_priority(wqh, &irqfd->wait);
}

/* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_kernel_irq_routing_entry *e;
	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
	int n_entries;

	n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);

	write_seqcount_begin(&irqfd->irq_entry_sc);

	/* Only a single routing entry can be cached; anything else
	 * invalidates the cache (type = 0), forcing the slow path. */
	e = entries;
	if (n_entries == 1)
		irqfd->irq_entry = *e;
	else
		irqfd->irq_entry.type = 0;

	write_seqcount_end(&irqfd->irq_entry_sc);
}

#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
void __attribute__((weak))
kvm_arch_irq_bypass_stop(
				struct irq_bypass_consumer *cons)
{
}

void __attribute__((weak)) kvm_arch_irq_bypass_start(
				struct irq_bypass_consumer *cons)
{
}

int __attribute__((weak)) kvm_arch_update_irqfd_routing(
				struct kvm *kvm, unsigned int host_irq,
				uint32_t guest_irq, bool set)
{
	return 0;
}

/* Weak default: conservatively treat every routing update as a change. */
bool __attribute__((weak)) kvm_arch_irqfd_route_changed(
				struct kvm_kernel_irq_routing_entry *old,
				struct kvm_kernel_irq_routing_entry *new)
{
	return true;
}
#endif

static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;
	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
	int ret;
	__poll_t events;
	int idx;

	if (!kvm_arch_intc_initialized(kvm))
		return -EAGAIN;

	if (!kvm_arch_irqfd_allowed(kvm, args))
		return -EINVAL;

	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
	if (!irqfd)
		return -ENOMEM;

	irqfd->kvm = kvm;
	irqfd->gsi = args->gsi;
	INIT_LIST_HEAD(&irqfd->list);
	INIT_WORK(&irqfd->inject, irqfd_inject);
	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
	seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);

	/* Scope-managed fd reference; dropped automatically on return. */
	CLASS(fd, f)(args->fd);
	if (fd_empty(f)) {
		ret = -EBADF;
		goto out;
	}

	eventfd = eventfd_ctx_fileget(fd_file(f));
	if (IS_ERR(eventfd)) {
		ret = PTR_ERR(eventfd);
		goto out;
	}

	irqfd->eventfd = eventfd;

	if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
		struct kvm_kernel_irqfd_resampler *resampler;

		resamplefd = eventfd_ctx_fdget(args->resamplefd);
		if (IS_ERR(resamplefd)) {
			ret = PTR_ERR(resamplefd);
			goto fail;
		}

		irqfd->resamplefd = resamplefd;
		INIT_LIST_HEAD(&irqfd->resampler_link);

		mutex_lock(&kvm->irqfds.resampler_lock);

		/* Reuse an existing resampler for this GSI if one exists. */
		list_for_each_entry(resampler,
				    &kvm->irqfds.resampler_list, link) {
			if
			   (resampler->notifier.gsi == irqfd->gsi) {
				irqfd->resampler = resampler;
				break;
			}
		}

		if (!irqfd->resampler) {
			resampler = kzalloc(sizeof(*resampler),
					    GFP_KERNEL_ACCOUNT);
			if (!resampler) {
				ret = -ENOMEM;
				mutex_unlock(&kvm->irqfds.resampler_lock);
				goto fail;
			}

			resampler->kvm = kvm;
			INIT_LIST_HEAD(&resampler->list);
			resampler->notifier.gsi = irqfd->gsi;
			resampler->notifier.irq_acked = irqfd_resampler_ack;
			INIT_LIST_HEAD(&resampler->link);

			list_add_rcu(&resampler->link, &kvm->irqfds.resampler_list);
			kvm_register_irq_ack_notifier(kvm,
						      &resampler->notifier);
			irqfd->resampler = resampler;
		}

		list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
		synchronize_srcu_expedited(&kvm->irq_srcu);

		mutex_unlock(&kvm->irqfds.resampler_lock);
	}

	/*
	 * Install our own custom wake-up handling so we are notified via
	 * a callback whenever someone signals the underlying eventfd
	 */
	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);

	spin_lock_irq(&kvm->irqfds.lock);

	ret = 0;
	list_for_each_entry(tmp, &kvm->irqfds.items, list) {
		if (irqfd->eventfd != tmp->eventfd)
			continue;
		/* This fd is used for another irq already. */
		ret = -EBUSY;
		spin_unlock_irq(&kvm->irqfds.lock);
		goto fail;
	}

	idx = srcu_read_lock(&kvm->irq_srcu);
	irqfd_update(kvm, irqfd);

	list_add_tail(&irqfd->list, &kvm->irqfds.items);

	spin_unlock_irq(&kvm->irqfds.lock);

	/*
	 * Check if there was an event already pending on the eventfd
	 * before we registered, and trigger it as if we didn't miss it.
	 */
	events = vfs_poll(fd_file(f), &irqfd->pt);

	if (events & EPOLLIN)
		schedule_work(&irqfd->inject);

#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
	if (kvm_arch_has_irq_bypass()) {
		irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
		irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
		irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
		irqfd->consumer.start = kvm_arch_irq_bypass_start;
		/* Bypass registration failure is non-fatal: log and keep the
		 * irqfd; the function still returns 0 below. */
		ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd);
		if (ret)
			pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n",
				irqfd->eventfd, ret);
	}
#endif

	srcu_read_unlock(&kvm->irq_srcu, idx);
	return 0;

fail:
	if (irqfd->resampler)
		irqfd_resampler_shutdown(irqfd);

	if (resamplefd && !IS_ERR(resamplefd))
		eventfd_ctx_put(resamplefd);

	if (eventfd && !IS_ERR(eventfd))
		eventfd_ctx_put(eventfd);

out:
	kfree(irqfd);
	return ret;
}

/* Returns true if any ack notifier is registered for @irqchip/@pin's GSI. */
bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
	struct kvm_irq_ack_notifier *kian;
	int gsi, idx;

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1)
		hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
					  link, srcu_read_lock_held(&kvm->irq_srcu))
			if (kian->gsi == gsi) {
				srcu_read_unlock(&kvm->irq_srcu, idx);
				return true;
			}

	srcu_read_unlock(&kvm->irq_srcu, idx);

	return false;
}
EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);

/* Invoke every ack notifier registered for @gsi; caller holds irq_srcu. */
void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
{
	struct kvm_irq_ack_notifier *kian;

	hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
				  link, srcu_read_lock_held(&kvm->irq_srcu))
		if (kian->gsi == gsi)
			kian->irq_acked(kian);
}

void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
	int gsi, idx;

	trace_kvm_ack_irq(irqchip, pin);

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1)
		kvm_notify_acked_gsi(kvm, gsi);
	srcu_read_unlock(&kvm->irq_srcu, idx);
}

void kvm_register_irq_ack_notifier(struct kvm *kvm,
				   struct kvm_irq_ack_notifier *kian)
{
	mutex_lock(&kvm->irq_lock);
	hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
	mutex_unlock(&kvm->irq_lock);
	kvm_arch_post_irq_ack_notifier_list_update(kvm);
}

void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
				     struct kvm_irq_ack_notifier *kian)
{
	mutex_lock(&kvm->irq_lock);
	hlist_del_init_rcu(&kian->link);
	mutex_unlock(&kvm->irq_lock);
	/* Wait for in-flight SRCU walkers of the notifier list to drain. */
	synchronize_srcu_expedited(&kvm->irq_srcu);
	kvm_arch_post_irq_ack_notifier_list_update(kvm);
}

/*
 * shutdown any irqfd's that match fd+gsi
 */
static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;
	struct eventfd_ctx *eventfd;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
		if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
			/*
			 * This clearing of irq_entry.type is needed for when
			 * another thread calls kvm_irq_routing_update before
			 * we flush workqueue below (we synchronize with
			 * kvm_irq_routing_update using irqfds.lock).
			 */
			write_seqcount_begin(&irqfd->irq_entry_sc);
			irqfd->irq_entry.type = 0;
			write_seqcount_end(&irqfd->irq_entry_sc);
			irqfd_deactivate(irqfd);
		}
	}

	spin_unlock_irq(&kvm->irqfds.lock);
	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * so that we guarantee there will not be any more interrupts on this
	 * gsi once this deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}

/* KVM_IRQFD ioctl entry point: validate flags, dispatch assign/deassign. */
int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
	if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
		return -EINVAL;

	if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
		return kvm_irqfd_deassign(kvm, args);

	return kvm_irqfd_assign(kvm, args);
}

/*
 * This function is called as the kvm VM fd is being released. Shutdown all
 * irqfds that still remain open
 */
void
kvm_irqfd_release(struct kvm *kvm)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
		irqfd_deactivate(irqfd);

	spin_unlock_irq(&kvm->irqfds.lock);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * since we do not take a kvm* reference.
	 */
	flush_workqueue(irqfd_cleanup_wq);

}

/*
 * Take note of a change in irq routing.
 * Caller must invoke synchronize_srcu_expedited(&kvm->irq_srcu) afterwards.
 */
void kvm_irq_routing_update(struct kvm *kvm)
{
	struct kvm_kernel_irqfd *irqfd;

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
		/* Under irqfds.lock, so can read irq_entry safely */
		struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry;
#endif

		/* Refresh the cached routing entry for this irqfd's GSI. */
		irqfd_update(kvm, irqfd);

#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
		if (irqfd->producer &&
		    kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
			int ret = kvm_arch_update_irqfd_routing(
					irqfd->kvm, irqfd->producer->irq,
					irqfd->gsi, 1);
			WARN_ON(ret);
		}
#endif
	}

	spin_unlock_irq(&kvm->irqfds.lock);
}

/*
 * Signal the resample eventfds of every irqfd bound to @irqchip/@pin's
 * GSI.  Returns true if a matching resampler was found and notified.
 */
bool kvm_notify_irqfd_resampler(struct kvm *kvm,
				unsigned int irqchip,
				unsigned int pin)
{
	struct kvm_kernel_irqfd_resampler *resampler;
	int gsi, idx;

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1) {
		list_for_each_entry_srcu(resampler,
					 &kvm->irqfds.resampler_list, link,
					 srcu_read_lock_held(&kvm->irq_srcu)) {
			if (resampler->notifier.gsi == gsi) {
				irqfd_resampler_notify(resampler);
				srcu_read_unlock(&kvm->irq_srcu, idx);
				return true;
			}
		}
	}
	srcu_read_unlock(&kvm->irq_srcu, idx);

	return false;
}

/*
 * create a host-wide workqueue for issuing deferred shutdown requests
 * aggregated from all vm* instances. We need our own isolated
 * queue to ease flushing work items when a VM exits.
 */
int kvm_irqfd_init(void)
{
	irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0);
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

void kvm_irqfd_exit(void)
{
	destroy_workqueue(irqfd_cleanup_wq);
}
#endif

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 *
 * userspace can register a PIO/MMIO address with an eventfd for receiving
 * notification when the memory has been touched.
 * --------------------------------------------------------------------
 */

struct _ioeventfd {
	struct list_head     list;	/* entry on kvm->ioeventfds */
	u64                  addr;	/* guest address that must match */
	int                  length;	/* access length; 0 = any length */
	struct eventfd_ctx  *eventfd;	/* signalled on a matching write */
	u64                  datamatch;	/* value to match unless wildcard */
	struct kvm_io_device dev;	/* kvm_io_bus device embedding */
	u8                   bus_idx;	/* which kvm_bus this lives on */
	bool                 wildcard;	/* true: ignore the written value */
};

static inline struct _ioeventfd *
to_ioeventfd(struct kvm_io_device *dev)
{
	return container_of(dev, struct _ioeventfd, dev);
}

/* Unlink @p from the VM list, drop its eventfd reference and free it. */
static void
ioeventfd_release(struct _ioeventfd *p)
{
	eventfd_ctx_put(p->eventfd);
	list_del(&p->list);
	kfree(p);
}

/* Return true if a @len-byte write of @val at @addr should trigger @p. */
static bool
ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
{
	u64 _val;

	if (addr != p->addr)
		/* address must be precise for a hit */
		return false;

	if (!p->length)
		/* length = 0 means only look at the address, so always a hit */
		return true;

	if (len != p->length)
		/* address-range must be precise for a hit */
		return false;

	if (p->wildcard)
		/* all else equal, wildcard is always a hit */
		return true;

	/* otherwise, we have to actually compare the data */

	BUG_ON(!IS_ALIGNED((unsigned long)val, len));

	switch (len) {
	case 1:
		_val = *(u8 *)val;
		break;
	case 2:
		_val = *(u16 *)val;
		break;
	case 4:
		_val = *(u32 *)val;
		break;
	case 8:
		_val = *(u64 *)val;
		break;
	default:
		return false;
	}

	return _val == p->datamatch;
}

/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
		int len, const void *val)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	if (!ioeventfd_in_range(p, addr, len, val))
		return -EOPNOTSUPP;

	eventfd_signal(p->eventfd);
	return 0;
}

/*
 * This function is called as KVM is completely shutting down.  We do not
 * need to worry about locking just nuke anything we have as quickly as possible
 */
static void
ioeventfd_destructor(struct kvm_io_device *this)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	ioeventfd_release(p);
}

static const struct kvm_io_device_ops ioeventfd_ops = {
	.write      = ioeventfd_write,
	.destructor = ioeventfd_destructor,
};

/* assumes kvm->slots_lock held */
static bool
ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
{
	struct _ioeventfd *_p;

	/* Two registrations collide when some write could match both:
	 * same bus and address, and either one is length-agnostic or the
	 * lengths match and either is a wildcard or the data values agree. */
	list_for_each_entry(_p, &kvm->ioeventfds, list)
		if (_p->bus_idx == p->bus_idx &&
		    _p->addr == p->addr &&
		    (!_p->length || !p->length ||
		     (_p->length == p->length &&
		      (_p->wildcard || p->wildcard ||
		       _p->datamatch == p->datamatch))))
			return true;

	return false;
}

/* Map userspace flags to the kvm_bus the ioeventfd should be placed on. */
static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags)
{
	if (flags & KVM_IOEVENTFD_FLAG_PIO)
		return KVM_PIO_BUS;
	if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
		return KVM_VIRTIO_CCW_NOTIFY_BUS;
	return KVM_MMIO_BUS;
}

static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
				enum kvm_bus bus_idx,
				struct kvm_ioeventfd *args)
{

	struct eventfd_ctx *eventfd;
	struct _ioeventfd *p;
	int ret;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	p =
	    kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
	if (!p) {
		ret = -ENOMEM;
		goto fail;
	}

	INIT_LIST_HEAD(&p->list);
	p->addr    = args->addr;
	p->bus_idx = bus_idx;
	p->length  = args->len;
	p->eventfd = eventfd;

	/* The datamatch feature is optional, otherwise this is a wildcard */
	if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
		p->datamatch = args->datamatch;
	else
		p->wildcard = true;

	mutex_lock(&kvm->slots_lock);

	/* Verify that there isn't a match already */
	if (ioeventfd_check_collision(kvm, p)) {
		ret = -EEXIST;
		goto unlock_fail;
	}

	kvm_iodevice_init(&p->dev, &ioeventfd_ops);

	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
				      &p->dev);
	if (ret < 0)
		goto unlock_fail;

	kvm_get_bus(kvm, bus_idx)->ioeventfd_count++;
	list_add_tail(&p->list, &kvm->ioeventfds);

	mutex_unlock(&kvm->slots_lock);

	return 0;

unlock_fail:
	mutex_unlock(&kvm->slots_lock);
	kfree(p);

fail:
	eventfd_ctx_put(eventfd);

	return ret;
}

/* Remove the ioeventfd on @bus_idx matching @args; -ENOENT if not found. */
static int
kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
			   struct kvm_ioeventfd *args)
{
	struct _ioeventfd        *p;
	struct eventfd_ctx       *eventfd;
	struct kvm_io_bus	 *bus;
	int                       ret = -ENOENT;
	bool                      wildcard;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);

	mutex_lock(&kvm->slots_lock);

	list_for_each_entry(p, &kvm->ioeventfds, list) {
		if (p->bus_idx != bus_idx ||
		    p->eventfd != eventfd  ||
		    p->addr != args->addr  ||
		    p->length != args->len ||
		    p->wildcard != wildcard)
			continue;

		if (!p->wildcard && p->datamatch != args->datamatch)
			continue;

		kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
		bus = kvm_get_bus(kvm, bus_idx);
		if (bus)
			bus->ioeventfd_count--;
		ret = 0;
		break;
	}

	mutex_unlock(&kvm->slots_lock);

	eventfd_ctx_put(eventfd);

	return ret;
}

static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	enum kvm_bus bus_idx = ioeventfd_bus_from_flags(args->flags);
	int ret = kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);

	/* Zero-length MMIO registrations are mirrored on the fast MMIO bus
	 * at assign time, so tear down that mirror here as well. */
	if (!args->len && bus_idx == KVM_MMIO_BUS)
		kvm_deassign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);

	return ret;
}

static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	enum kvm_bus              bus_idx;
	int ret;

	bus_idx = ioeventfd_bus_from_flags(args->flags);
	/* must be natural-word sized, or 0 to ignore length */
	switch (args->len) {
	case 0:
	case 1:
	case 2:
	case 4:
	case 8:
		break;
	default:
		return -EINVAL;
	}

	/* check for range overflow */
	if (args->addr + args->len < args->addr)
		return -EINVAL;

	/* check for extra flags that we don't understand */
	if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
		return -EINVAL;

	/* ioeventfd with no length can't be combined with DATAMATCH */
	if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
		return -EINVAL;

	ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
	if (ret)
		goto fail;

	/* When length is ignored, MMIO is also put on a separate bus, for
	 * faster lookups.
	 */
	if (!args->len && bus_idx == KVM_MMIO_BUS) {
		ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
		if (ret < 0)
			goto fast_fail;
	}

	return 0;

fast_fail:
	/* Roll back the primary-bus registration on fast-bus failure. */
	kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);
fail:
	return ret;
}

/* KVM_IOEVENTFD ioctl entry point: dispatch to assign or deassign. */
int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
		return kvm_deassign_ioeventfd(kvm, args);

	return kvm_assign_ioeventfd(kvm, args);
}

/* Initialize the eventfd bookkeeping embedded in a freshly created VM. */
void
kvm_eventfd_init(struct kvm *kvm)
{
#ifdef CONFIG_HAVE_KVM_IRQCHIP
	spin_lock_init(&kvm->irqfds.lock);
	INIT_LIST_HEAD(&kvm->irqfds.items);
	INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
	mutex_init(&kvm->irqfds.resampler_lock);
#endif
	INIT_LIST_HEAD(&kvm->ioeventfds);
}