1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * kvm eventfd support - use eventfd objects to signal various KVM events 4 * 5 * Copyright 2009 Novell. All Rights Reserved. 6 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 7 * 8 * Author: 9 * Gregory Haskins <ghaskins@novell.com> 10 */ 11 12 #include <linux/kvm_host.h> 13 #include <linux/kvm.h> 14 #include <linux/kvm_irqfd.h> 15 #include <linux/workqueue.h> 16 #include <linux/syscalls.h> 17 #include <linux/wait.h> 18 #include <linux/poll.h> 19 #include <linux/file.h> 20 #include <linux/list.h> 21 #include <linux/eventfd.h> 22 #include <linux/kernel.h> 23 #include <linux/srcu.h> 24 #include <linux/slab.h> 25 #include <linux/seqlock.h> 26 #include <linux/irqbypass.h> 27 #include <trace/events/kvm.h> 28 29 #include <kvm/iodev.h> 30 31 #ifdef CONFIG_HAVE_KVM_IRQCHIP 32 33 static struct workqueue_struct *irqfd_cleanup_wq; 34 35 bool __attribute__((weak)) 36 kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) 37 { 38 return true; 39 } 40 41 static void 42 irqfd_inject(struct work_struct *work) 43 { 44 struct kvm_kernel_irqfd *irqfd = 45 container_of(work, struct kvm_kernel_irqfd, inject); 46 struct kvm *kvm = irqfd->kvm; 47 48 if (!irqfd->resampler) { 49 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1, 50 false); 51 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0, 52 false); 53 } else 54 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, 55 irqfd->gsi, 1, false); 56 } 57 58 static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler) 59 { 60 struct kvm_kernel_irqfd *irqfd; 61 62 list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link, 63 srcu_read_lock_held(&resampler->kvm->irq_srcu)) 64 eventfd_signal(irqfd->resamplefd); 65 } 66 67 /* 68 * Since resampler irqfds share an IRQ source ID, we de-assert once 69 * then notify all of the resampler irqfds using this GSI. We can't 70 * do multiple de-asserts or we risk racing with incoming re-asserts. 71 */ 72 static void 73 irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) 74 { 75 struct kvm_kernel_irqfd_resampler *resampler; 76 struct kvm *kvm; 77 int idx; 78 79 resampler = container_of(kian, 80 struct kvm_kernel_irqfd_resampler, notifier); 81 kvm = resampler->kvm; 82 83 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, 84 resampler->notifier.gsi, 0, false); 85 86 idx = srcu_read_lock(&kvm->irq_srcu); 87 irqfd_resampler_notify(resampler); 88 srcu_read_unlock(&kvm->irq_srcu, idx); 89 } 90 91 static void 92 irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd) 93 { 94 struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler; 95 struct kvm *kvm = resampler->kvm; 96 97 mutex_lock(&kvm->irqfds.resampler_lock); 98 99 list_del_rcu(&irqfd->resampler_link); 100 101 if (list_empty(&resampler->list)) { 102 list_del_rcu(&resampler->link); 103 kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier); 104 /* 105 * synchronize_srcu_expedited(&kvm->irq_srcu) already called 106 * in kvm_unregister_irq_ack_notifier(). 107 */ 108 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, 109 resampler->notifier.gsi, 0, false); 110 kfree(resampler); 111 } else { 112 synchronize_srcu_expedited(&kvm->irq_srcu); 113 } 114 115 mutex_unlock(&kvm->irqfds.resampler_lock); 116 } 117 118 /* 119 * Race-free decouple logic (ordering is critical) 120 */ 121 static void 122 irqfd_shutdown(struct work_struct *work) 123 { 124 struct kvm_kernel_irqfd *irqfd = 125 container_of(work, struct kvm_kernel_irqfd, shutdown); 126 struct kvm *kvm = irqfd->kvm; 127 u64 cnt; 128 129 /* Make sure irqfd has been initialized in assign path. */ 130 synchronize_srcu_expedited(&kvm->irq_srcu); 131 132 /* 133 * Synchronize with the wait-queue and unhook ourselves to prevent 134 * further events. 135 */ 136 eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt); 137 138 /* 139 * We know no new events will be scheduled at this point, so block 140 * until all previously outstanding events have completed 141 */ 142 flush_work(&irqfd->inject); 143 144 if (irqfd->resampler) { 145 irqfd_resampler_shutdown(irqfd); 146 eventfd_ctx_put(irqfd->resamplefd); 147 } 148 149 /* 150 * It is now safe to release the object's resources 151 */ 152 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) 153 irq_bypass_unregister_consumer(&irqfd->consumer); 154 #endif 155 eventfd_ctx_put(irqfd->eventfd); 156 kfree(irqfd); 157 } 158 159 160 /* assumes kvm->irqfds.lock is held */ 161 static bool 162 irqfd_is_active(struct kvm_kernel_irqfd *irqfd) 163 { 164 return list_empty(&irqfd->list) ? false : true; 165 } 166 167 /* 168 * Mark the irqfd as inactive and schedule it for removal 169 * 170 * assumes kvm->irqfds.lock is held 171 */ 172 static void 173 irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) 174 { 175 BUG_ON(!irqfd_is_active(irqfd)); 176 177 list_del_init(&irqfd->list); 178 179 queue_work(irqfd_cleanup_wq, &irqfd->shutdown); 180 } 181 182 int __attribute__((weak)) kvm_arch_set_irq_inatomic( 183 struct kvm_kernel_irq_routing_entry *irq, 184 struct kvm *kvm, int irq_source_id, 185 int level, 186 bool line_status) 187 { 188 return -EWOULDBLOCK; 189 } 190 191 /* 192 * Called with wqh->lock held and interrupts disabled 193 */ 194 static int 195 irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) 196 { 197 struct kvm_kernel_irqfd *irqfd = 198 container_of(wait, struct kvm_kernel_irqfd, wait); 199 __poll_t flags = key_to_poll(key); 200 struct kvm_kernel_irq_routing_entry irq; 201 struct kvm *kvm = irqfd->kvm; 202 unsigned seq; 203 int idx; 204 int ret = 0; 205 206 if (flags & EPOLLIN) { 207 /* 208 * WARNING: Do NOT take irqfds.lock in any path except EPOLLHUP, 209 * as KVM holds irqfds.lock when registering the irqfd with the 210 * eventfd. 211 */ 212 u64 cnt; 213 eventfd_ctx_do_read(irqfd->eventfd, &cnt); 214 215 idx = srcu_read_lock(&kvm->irq_srcu); 216 do { 217 seq = read_seqcount_begin(&irqfd->irq_entry_sc); 218 irq = irqfd->irq_entry; 219 } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq)); 220 /* An event has been signaled, inject an interrupt */ 221 if (kvm_arch_set_irq_inatomic(&irq, kvm, 222 KVM_USERSPACE_IRQ_SOURCE_ID, 1, 223 false) == -EWOULDBLOCK) 224 schedule_work(&irqfd->inject); 225 srcu_read_unlock(&kvm->irq_srcu, idx); 226 ret = 1; 227 } 228 229 if (flags & EPOLLHUP) { 230 /* The eventfd is closing, detach from KVM */ 231 unsigned long iflags; 232 233 /* 234 * Taking irqfds.lock is safe here, as KVM holds a reference to 235 * the eventfd when registering the irqfd, i.e. this path can't 236 * be reached while kvm_irqfd_add() is running. 237 */ 238 spin_lock_irqsave(&kvm->irqfds.lock, iflags); 239 240 /* 241 * We must check if someone deactivated the irqfd before 242 * we could acquire the irqfds.lock since the item is 243 * deactivated from the KVM side before it is unhooked from 244 * the wait-queue. If it is already deactivated, we can 245 * simply return knowing the other side will cleanup for us. 246 * We cannot race against the irqfd going away since the 247 * other side is required to acquire wqh->lock, which we hold 248 */ 249 if (irqfd_is_active(irqfd)) 250 irqfd_deactivate(irqfd); 251 252 spin_unlock_irqrestore(&kvm->irqfds.lock, iflags); 253 } 254 255 return ret; 256 } 257 258 static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) 259 { 260 struct kvm_kernel_irq_routing_entry *e; 261 struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; 262 int n_entries; 263 264 lockdep_assert_held(&kvm->irqfds.lock); 265 266 n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); 267 268 write_seqcount_begin(&irqfd->irq_entry_sc); 269 270 e = entries; 271 if (n_entries == 1) 272 irqfd->irq_entry = *e; 273 else 274 irqfd->irq_entry.type = 0; 275 276 write_seqcount_end(&irqfd->irq_entry_sc); 277 } 278 279 struct kvm_irqfd_pt { 280 struct kvm_kernel_irqfd *irqfd; 281 struct kvm *kvm; 282 poll_table pt; 283 int ret; 284 }; 285 286 static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, 287 poll_table *pt) 288 { 289 struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt); 290 struct kvm_kernel_irqfd *irqfd = p->irqfd; 291 struct kvm *kvm = p->kvm; 292 293 /* 294 * Note, irqfds.lock protects the irqfd's irq_entry, i.e. its routing, 295 * and irqfds.items. It does NOT protect registering with the eventfd. 296 */ 297 spin_lock_irq(&kvm->irqfds.lock); 298 299 /* 300 * Initialize the routing information prior to adding the irqfd to the 301 * eventfd's waitqueue, as irqfd_wakeup() can be invoked as soon as the 302 * irqfd is registered. 303 */ 304 irqfd_update(kvm, irqfd); 305 306 /* 307 * Add the irqfd as a priority waiter on the eventfd, with a custom 308 * wake-up handler, so that KVM *and only KVM* is notified whenever the 309 * underlying eventfd is signaled. 310 */ 311 init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); 312 313 /* 314 * Temporarily lie to lockdep about holding irqfds.lock to avoid a 315 * false positive regarding potential deadlock with irqfd_wakeup() 316 * (see irqfd_wakeup() for details). 317 * 318 * Adding to the wait queue will fail if there is already a priority 319 * waiter, i.e. if the eventfd is associated with another irqfd (in any 320 * VM). Note, kvm_irqfd_deassign() waits for all in-flight shutdown 321 * jobs to complete, i.e. ensures the irqfd has been removed from the 322 * eventfd's waitqueue before returning to userspace. 323 */ 324 spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_); 325 p->ret = add_wait_queue_priority_exclusive(wqh, &irqfd->wait); 326 spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_); 327 if (p->ret) 328 goto out; 329 330 list_add_tail(&irqfd->list, &kvm->irqfds.items); 331 332 out: 333 spin_unlock_irq(&kvm->irqfds.lock); 334 } 335 336 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) 337 void __attribute__((weak)) kvm_arch_irq_bypass_stop( 338 struct irq_bypass_consumer *cons) 339 { 340 } 341 342 void __attribute__((weak)) kvm_arch_irq_bypass_start( 343 struct irq_bypass_consumer *cons) 344 { 345 } 346 347 void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, 348 struct kvm_kernel_irq_routing_entry *old, 349 struct kvm_kernel_irq_routing_entry *new) 350 { 351 352 } 353 #endif 354 355 static int 356 kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) 357 { 358 struct kvm_kernel_irqfd *irqfd; 359 struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; 360 struct kvm_irqfd_pt irqfd_pt; 361 int ret; 362 __poll_t events; 363 int idx; 364 365 if (!kvm_arch_intc_initialized(kvm)) 366 return -EAGAIN; 367 368 if (!kvm_arch_irqfd_allowed(kvm, args)) 369 return -EINVAL; 370 371 irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT); 372 if (!irqfd) 373 return -ENOMEM; 374 375 irqfd->kvm = kvm; 376 irqfd->gsi = args->gsi; 377 INIT_LIST_HEAD(&irqfd->list); 378 INIT_WORK(&irqfd->inject, irqfd_inject); 379 INIT_WORK(&irqfd->shutdown, irqfd_shutdown); 380 seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock); 381 382 CLASS(fd, f)(args->fd); 383 if (fd_empty(f)) { 384 ret = -EBADF; 385 goto out; 386 } 387 388 eventfd = eventfd_ctx_fileget(fd_file(f)); 389 if (IS_ERR(eventfd)) { 390 ret = PTR_ERR(eventfd); 391 goto out; 392 } 393 394 irqfd->eventfd = eventfd; 395 396 if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) { 397 struct kvm_kernel_irqfd_resampler *resampler; 398 399 resamplefd = eventfd_ctx_fdget(args->resamplefd); 400 if (IS_ERR(resamplefd)) { 401 ret = PTR_ERR(resamplefd); 402 goto fail; 403 } 404 405 irqfd->resamplefd = resamplefd; 406 INIT_LIST_HEAD(&irqfd->resampler_link); 407 408 mutex_lock(&kvm->irqfds.resampler_lock); 409 410 list_for_each_entry(resampler, 411 &kvm->irqfds.resampler_list, link) { 412 if (resampler->notifier.gsi == irqfd->gsi) { 413 irqfd->resampler = resampler; 414 break; 415 } 416 } 417 418 if (!irqfd->resampler) { 419 resampler = kzalloc(sizeof(*resampler), 420 GFP_KERNEL_ACCOUNT); 421 if (!resampler) { 422 ret = -ENOMEM; 423 mutex_unlock(&kvm->irqfds.resampler_lock); 424 goto fail; 425 } 426 427 resampler->kvm = kvm; 428 INIT_LIST_HEAD(&resampler->list); 429 resampler->notifier.gsi = irqfd->gsi; 430 resampler->notifier.irq_acked = irqfd_resampler_ack; 431 INIT_LIST_HEAD(&resampler->link); 432 433 list_add_rcu(&resampler->link, &kvm->irqfds.resampler_list); 434 kvm_register_irq_ack_notifier(kvm, 435 &resampler->notifier); 436 irqfd->resampler = resampler; 437 } 438 439 list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list); 440 synchronize_srcu_expedited(&kvm->irq_srcu); 441 442 mutex_unlock(&kvm->irqfds.resampler_lock); 443 } 444 445 /* 446 * Set the irqfd routing and add it to KVM's list before registering 447 * the irqfd with the eventfd, so that the routing information is valid 448 * and stays valid, e.g. if there are GSI routing changes, prior to 449 * making the irqfd visible, i.e. before it might be signaled. 450 * 451 * Note, holding SRCU ensures a stable read of routing information, and 452 * also prevents irqfd_shutdown() from freeing the irqfd before it's 453 * fully initialized. 454 */ 455 idx = srcu_read_lock(&kvm->irq_srcu); 456 457 /* 458 * Register the irqfd with the eventfd by polling on the eventfd, and 459 * simultaneously and the irqfd to KVM's list. If there was en event 460 * pending on the eventfd prior to registering, manually trigger IRQ 461 * injection. 462 */ 463 irqfd_pt.irqfd = irqfd; 464 irqfd_pt.kvm = kvm; 465 init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register); 466 467 events = vfs_poll(fd_file(f), &irqfd_pt.pt); 468 469 ret = irqfd_pt.ret; 470 if (ret) 471 goto fail_poll; 472 473 if (events & EPOLLIN) 474 schedule_work(&irqfd->inject); 475 476 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) 477 if (kvm_arch_has_irq_bypass()) { 478 irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; 479 irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer; 480 irqfd->consumer.stop = kvm_arch_irq_bypass_stop; 481 irqfd->consumer.start = kvm_arch_irq_bypass_start; 482 ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd); 483 if (ret) 484 pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n", 485 irqfd->eventfd, ret); 486 } 487 #endif 488 489 srcu_read_unlock(&kvm->irq_srcu, idx); 490 return 0; 491 492 fail_poll: 493 srcu_read_unlock(&kvm->irq_srcu, idx); 494 fail: 495 if (irqfd->resampler) 496 irqfd_resampler_shutdown(irqfd); 497 498 if (resamplefd && !IS_ERR(resamplefd)) 499 eventfd_ctx_put(resamplefd); 500 501 if (eventfd && !IS_ERR(eventfd)) 502 eventfd_ctx_put(eventfd); 503 504 out: 505 kfree(irqfd); 506 return ret; 507 } 508 509 bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) 510 { 511 struct kvm_irq_ack_notifier *kian; 512 int gsi, idx; 513 514 idx = srcu_read_lock(&kvm->irq_srcu); 515 gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); 516 if (gsi != -1) 517 hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list, 518 link, srcu_read_lock_held(&kvm->irq_srcu)) 519 if (kian->gsi == gsi) { 520 srcu_read_unlock(&kvm->irq_srcu, idx); 521 return true; 522 } 523 524 srcu_read_unlock(&kvm->irq_srcu, idx); 525 526 return false; 527 } 528 EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); 529 530 void kvm_notify_acked_gsi(struct kvm *kvm, int gsi) 531 { 532 struct kvm_irq_ack_notifier *kian; 533 534 hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list, 535 link, srcu_read_lock_held(&kvm->irq_srcu)) 536 if (kian->gsi == gsi) 537 kian->irq_acked(kian); 538 } 539 540 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) 541 { 542 int gsi, idx; 543 544 trace_kvm_ack_irq(irqchip, pin); 545 546 idx = srcu_read_lock(&kvm->irq_srcu); 547 gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); 548 if (gsi != -1) 549 kvm_notify_acked_gsi(kvm, gsi); 550 srcu_read_unlock(&kvm->irq_srcu, idx); 551 } 552 553 void kvm_register_irq_ack_notifier(struct kvm *kvm, 554 struct kvm_irq_ack_notifier *kian) 555 { 556 mutex_lock(&kvm->irq_lock); 557 hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); 558 mutex_unlock(&kvm->irq_lock); 559 kvm_arch_post_irq_ack_notifier_list_update(kvm); 560 } 561 562 void kvm_unregister_irq_ack_notifier(struct kvm *kvm, 563 struct kvm_irq_ack_notifier *kian) 564 { 565 mutex_lock(&kvm->irq_lock); 566 hlist_del_init_rcu(&kian->link); 567 mutex_unlock(&kvm->irq_lock); 568 synchronize_srcu_expedited(&kvm->irq_srcu); 569 kvm_arch_post_irq_ack_notifier_list_update(kvm); 570 } 571 572 /* 573 * shutdown any irqfd's that match fd+gsi 574 */ 575 static int 576 kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) 577 { 578 struct kvm_kernel_irqfd *irqfd, *tmp; 579 struct eventfd_ctx *eventfd; 580 581 eventfd = eventfd_ctx_fdget(args->fd); 582 if (IS_ERR(eventfd)) 583 return PTR_ERR(eventfd); 584 585 spin_lock_irq(&kvm->irqfds.lock); 586 587 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { 588 if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) { 589 /* 590 * This clearing of irq_entry.type is needed for when 591 * another thread calls kvm_irq_routing_update before 592 * we flush workqueue below (we synchronize with 593 * kvm_irq_routing_update using irqfds.lock). 594 */ 595 write_seqcount_begin(&irqfd->irq_entry_sc); 596 irqfd->irq_entry.type = 0; 597 write_seqcount_end(&irqfd->irq_entry_sc); 598 irqfd_deactivate(irqfd); 599 } 600 } 601 602 spin_unlock_irq(&kvm->irqfds.lock); 603 eventfd_ctx_put(eventfd); 604 605 /* 606 * Block until we know all outstanding shutdown jobs have completed 607 * so that we guarantee there will not be any more interrupts on this 608 * gsi once this deassign function returns. 609 */ 610 flush_workqueue(irqfd_cleanup_wq); 611 612 return 0; 613 } 614 615 int 616 kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) 617 { 618 if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE)) 619 return -EINVAL; 620 621 if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) 622 return kvm_irqfd_deassign(kvm, args); 623 624 return kvm_irqfd_assign(kvm, args); 625 } 626 627 /* 628 * This function is called as the kvm VM fd is being released. Shutdown all 629 * irqfds that still remain open 630 */ 631 void 632 kvm_irqfd_release(struct kvm *kvm) 633 { 634 struct kvm_kernel_irqfd *irqfd, *tmp; 635 636 spin_lock_irq(&kvm->irqfds.lock); 637 638 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) 639 irqfd_deactivate(irqfd); 640 641 spin_unlock_irq(&kvm->irqfds.lock); 642 643 /* 644 * Block until we know all outstanding shutdown jobs have completed 645 * since we do not take a kvm* reference. 646 */ 647 flush_workqueue(irqfd_cleanup_wq); 648 649 } 650 651 /* 652 * Take note of a change in irq routing. 653 * Caller must invoke synchronize_srcu_expedited(&kvm->irq_srcu) afterwards. 654 */ 655 void kvm_irq_routing_update(struct kvm *kvm) 656 { 657 struct kvm_kernel_irqfd *irqfd; 658 659 spin_lock_irq(&kvm->irqfds.lock); 660 661 list_for_each_entry(irqfd, &kvm->irqfds.items, list) { 662 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) 663 /* Under irqfds.lock, so can read irq_entry safely */ 664 struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry; 665 #endif 666 667 irqfd_update(kvm, irqfd); 668 669 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) 670 if (irqfd->producer) 671 kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry); 672 #endif 673 } 674 675 spin_unlock_irq(&kvm->irqfds.lock); 676 } 677 678 bool kvm_notify_irqfd_resampler(struct kvm *kvm, 679 unsigned int irqchip, 680 unsigned int pin) 681 { 682 struct kvm_kernel_irqfd_resampler *resampler; 683 int gsi, idx; 684 685 idx = srcu_read_lock(&kvm->irq_srcu); 686 gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); 687 if (gsi != -1) { 688 list_for_each_entry_srcu(resampler, 689 &kvm->irqfds.resampler_list, link, 690 srcu_read_lock_held(&kvm->irq_srcu)) { 691 if (resampler->notifier.gsi == gsi) { 692 irqfd_resampler_notify(resampler); 693 srcu_read_unlock(&kvm->irq_srcu, idx); 694 return true; 695 } 696 } 697 } 698 srcu_read_unlock(&kvm->irq_srcu, idx); 699 700 return false; 701 } 702 703 /* 704 * create a host-wide workqueue for issuing deferred shutdown requests 705 * aggregated from all vm* instances. We need our own isolated 706 * queue to ease flushing work items when a VM exits. 707 */ 708 int kvm_irqfd_init(void) 709 { 710 irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0); 711 if (!irqfd_cleanup_wq) 712 return -ENOMEM; 713 714 return 0; 715 } 716 717 void kvm_irqfd_exit(void) 718 { 719 destroy_workqueue(irqfd_cleanup_wq); 720 } 721 #endif 722 723 /* 724 * -------------------------------------------------------------------- 725 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. 726 * 727 * userspace can register a PIO/MMIO address with an eventfd for receiving 728 * notification when the memory has been touched. 729 * -------------------------------------------------------------------- 730 */ 731 732 struct _ioeventfd { 733 struct list_head list; 734 u64 addr; 735 int length; 736 struct eventfd_ctx *eventfd; 737 u64 datamatch; 738 struct kvm_io_device dev; 739 u8 bus_idx; 740 bool wildcard; 741 }; 742 743 static inline struct _ioeventfd * 744 to_ioeventfd(struct kvm_io_device *dev) 745 { 746 return container_of(dev, struct _ioeventfd, dev); 747 } 748 749 static void 750 ioeventfd_release(struct _ioeventfd *p) 751 { 752 eventfd_ctx_put(p->eventfd); 753 list_del(&p->list); 754 kfree(p); 755 } 756 757 static bool 758 ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val) 759 { 760 u64 _val; 761 762 if (addr != p->addr) 763 /* address must be precise for a hit */ 764 return false; 765 766 if (!p->length) 767 /* length = 0 means only look at the address, so always a hit */ 768 return true; 769 770 if (len != p->length) 771 /* address-range must be precise for a hit */ 772 return false; 773 774 if (p->wildcard) 775 /* all else equal, wildcard is always a hit */ 776 return true; 777 778 /* otherwise, we have to actually compare the data */ 779 780 BUG_ON(!IS_ALIGNED((unsigned long)val, len)); 781 782 switch (len) { 783 case 1: 784 _val = *(u8 *)val; 785 break; 786 case 2: 787 _val = *(u16 *)val; 788 break; 789 case 4: 790 _val = *(u32 *)val; 791 break; 792 case 8: 793 _val = *(u64 *)val; 794 break; 795 default: 796 return false; 797 } 798 799 return _val == p->datamatch; 800 } 801 802 /* MMIO/PIO writes trigger an event if the addr/val match */ 803 static int 804 ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, 805 int len, const void *val) 806 { 807 struct _ioeventfd *p = to_ioeventfd(this); 808 809 if (!ioeventfd_in_range(p, addr, len, val)) 810 return -EOPNOTSUPP; 811 812 eventfd_signal(p->eventfd); 813 return 0; 814 } 815 816 /* 817 * This function is called as KVM is completely shutting down. We do not 818 * need to worry about locking just nuke anything we have as quickly as possible 819 */ 820 static void 821 ioeventfd_destructor(struct kvm_io_device *this) 822 { 823 struct _ioeventfd *p = to_ioeventfd(this); 824 825 ioeventfd_release(p); 826 } 827 828 static const struct kvm_io_device_ops ioeventfd_ops = { 829 .write = ioeventfd_write, 830 .destructor = ioeventfd_destructor, 831 }; 832 833 /* assumes kvm->slots_lock held */ 834 static bool 835 ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p) 836 { 837 struct _ioeventfd *_p; 838 839 list_for_each_entry(_p, &kvm->ioeventfds, list) 840 if (_p->bus_idx == p->bus_idx && 841 _p->addr == p->addr && 842 (!_p->length || !p->length || 843 (_p->length == p->length && 844 (_p->wildcard || p->wildcard || 845 _p->datamatch == p->datamatch)))) 846 return true; 847 848 return false; 849 } 850 851 static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags) 852 { 853 if (flags & KVM_IOEVENTFD_FLAG_PIO) 854 return KVM_PIO_BUS; 855 if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY) 856 return KVM_VIRTIO_CCW_NOTIFY_BUS; 857 return KVM_MMIO_BUS; 858 } 859 860 static int kvm_assign_ioeventfd_idx(struct kvm *kvm, 861 enum kvm_bus bus_idx, 862 struct kvm_ioeventfd *args) 863 { 864 865 struct eventfd_ctx *eventfd; 866 struct _ioeventfd *p; 867 int ret; 868 869 eventfd = eventfd_ctx_fdget(args->fd); 870 if (IS_ERR(eventfd)) 871 return PTR_ERR(eventfd); 872 873 p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT); 874 if (!p) { 875 ret = -ENOMEM; 876 goto fail; 877 } 878 879 INIT_LIST_HEAD(&p->list); 880 p->addr = args->addr; 881 p->bus_idx = bus_idx; 882 p->length = args->len; 883 p->eventfd = eventfd; 884 885 /* The datamatch feature is optional, otherwise this is a wildcard */ 886 if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH) 887 p->datamatch = args->datamatch; 888 else 889 p->wildcard = true; 890 891 mutex_lock(&kvm->slots_lock); 892 893 /* Verify that there isn't a match already */ 894 if (ioeventfd_check_collision(kvm, p)) { 895 ret = -EEXIST; 896 goto unlock_fail; 897 } 898 899 kvm_iodevice_init(&p->dev, &ioeventfd_ops); 900 901 ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length, 902 &p->dev); 903 if (ret < 0) 904 goto unlock_fail; 905 906 kvm_get_bus(kvm, bus_idx)->ioeventfd_count++; 907 list_add_tail(&p->list, &kvm->ioeventfds); 908 909 mutex_unlock(&kvm->slots_lock); 910 911 return 0; 912 913 unlock_fail: 914 mutex_unlock(&kvm->slots_lock); 915 kfree(p); 916 917 fail: 918 eventfd_ctx_put(eventfd); 919 920 return ret; 921 } 922 923 static int 924 kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, 925 struct kvm_ioeventfd *args) 926 { 927 struct _ioeventfd *p; 928 struct eventfd_ctx *eventfd; 929 struct kvm_io_bus *bus; 930 int ret = -ENOENT; 931 bool wildcard; 932 933 eventfd = eventfd_ctx_fdget(args->fd); 934 if (IS_ERR(eventfd)) 935 return PTR_ERR(eventfd); 936 937 wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); 938 939 mutex_lock(&kvm->slots_lock); 940 941 list_for_each_entry(p, &kvm->ioeventfds, list) { 942 if (p->bus_idx != bus_idx || 943 p->eventfd != eventfd || 944 p->addr != args->addr || 945 p->length != args->len || 946 p->wildcard != wildcard) 947 continue; 948 949 if (!p->wildcard && p->datamatch != args->datamatch) 950 continue; 951 952 kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); 953 bus = kvm_get_bus(kvm, bus_idx); 954 if (bus) 955 bus->ioeventfd_count--; 956 ret = 0; 957 break; 958 } 959 960 mutex_unlock(&kvm->slots_lock); 961 962 eventfd_ctx_put(eventfd); 963 964 return ret; 965 } 966 967 static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) 968 { 969 enum kvm_bus bus_idx = ioeventfd_bus_from_flags(args->flags); 970 int ret = kvm_deassign_ioeventfd_idx(kvm, bus_idx, args); 971 972 if (!args->len && bus_idx == KVM_MMIO_BUS) 973 kvm_deassign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args); 974 975 return ret; 976 } 977 978 static int 979 kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) 980 { 981 enum kvm_bus bus_idx; 982 int ret; 983 984 bus_idx = ioeventfd_bus_from_flags(args->flags); 985 /* must be natural-word sized, or 0 to ignore length */ 986 switch (args->len) { 987 case 0: 988 case 1: 989 case 2: 990 case 4: 991 case 8: 992 break; 993 default: 994 return -EINVAL; 995 } 996 997 /* check for range overflow */ 998 if (args->addr + args->len < args->addr) 999 return -EINVAL; 1000 1001 /* check for extra flags that we don't understand */ 1002 if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK) 1003 return -EINVAL; 1004 1005 /* ioeventfd with no length can't be combined with DATAMATCH */ 1006 if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)) 1007 return -EINVAL; 1008 1009 ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args); 1010 if (ret) 1011 goto fail; 1012 1013 /* When length is ignored, MMIO is also put on a separate bus, for 1014 * faster lookups. 1015 */ 1016 if (!args->len && bus_idx == KVM_MMIO_BUS) { 1017 ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args); 1018 if (ret < 0) 1019 goto fast_fail; 1020 } 1021 1022 return 0; 1023 1024 fast_fail: 1025 kvm_deassign_ioeventfd_idx(kvm, bus_idx, args); 1026 fail: 1027 return ret; 1028 } 1029 1030 int 1031 kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) 1032 { 1033 if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN) 1034 return kvm_deassign_ioeventfd(kvm, args); 1035 1036 return kvm_assign_ioeventfd(kvm, args); 1037 } 1038 1039 void 1040 kvm_eventfd_init(struct kvm *kvm) 1041 { 1042 #ifdef CONFIG_HAVE_KVM_IRQCHIP 1043 spin_lock_init(&kvm->irqfds.lock); 1044 INIT_LIST_HEAD(&kvm->irqfds.items); 1045 INIT_LIST_HEAD(&kvm->irqfds.resampler_list); 1046 mutex_init(&kvm->irqfds.resampler_lock); 1047 #endif 1048 INIT_LIST_HEAD(&kvm->ioeventfds); 1049 } 1050