// SPDX-License-Identifier: GPL-2.0-only
/*
 * kvm eventfd support - use eventfd objects to signal various KVM events
 *
 * Copyright 2009 Novell. All Rights Reserved.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Author:
 *	Gregory Haskins <ghaskins@novell.com>
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/seqlock.h>
#include <linux/irqbypass.h>
#include <trace/events/kvm.h>

#include <kvm/iodev.h>

#ifdef CONFIG_HAVE_KVM_IRQCHIP

static struct workqueue_struct *irqfd_cleanup_wq;

bool __attribute__((weak))
kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
{
	return true;
}

static void
irqfd_inject(struct work_struct *work)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(work, struct kvm_kernel_irqfd, inject);
	struct kvm *kvm = irqfd->kvm;

	if (!irqfd->resampler) {
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1,
				false);
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0,
				false);
	} else
		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
			    irqfd->gsi, 1, false);
}

static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler)
{
	struct kvm_kernel_irqfd *irqfd;

	list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
				 srcu_read_lock_held(&resampler->kvm->irq_srcu))
		eventfd_signal(irqfd->resamplefd);
}

/*
 * Since resampler irqfds share an IRQ source ID, we de-assert once
 * then notify all of the resampler irqfds using this GSI.  We can't
 * do multiple de-asserts or we risk racing with incoming re-asserts.
 */
static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
	struct kvm_kernel_irqfd_resampler *resampler;
	struct kvm *kvm;
	int idx;

	resampler = container_of(kian,
			struct kvm_kernel_irqfd_resampler, notifier);
	kvm = resampler->kvm;

	kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
		    resampler->notifier.gsi, 0, false);

	idx = srcu_read_lock(&kvm->irq_srcu);
	irqfd_resampler_notify(resampler);
	srcu_read_unlock(&kvm->irq_srcu, idx);
}

static void
irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
	struct kvm *kvm = resampler->kvm;

	mutex_lock(&kvm->irqfds.resampler_lock);

	list_del_rcu(&irqfd->resampler_link);

	if (list_empty(&resampler->list)) {
		list_del_rcu(&resampler->link);
		kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
		/*
		 * synchronize_srcu_expedited(&kvm->irq_srcu) already called
		 * in kvm_unregister_irq_ack_notifier().
		 */
		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
			    resampler->notifier.gsi, 0, false);
		kfree(resampler);
	} else {
		synchronize_srcu_expedited(&kvm->irq_srcu);
	}

	mutex_unlock(&kvm->irqfds.resampler_lock);
}

/*
 * Race-free decouple logic (ordering is critical)
 */
static void
irqfd_shutdown(struct work_struct *work)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(work, struct kvm_kernel_irqfd, shutdown);
	struct kvm *kvm = irqfd->kvm;
	u64 cnt;

	/* Make sure irqfd has been initialized in assign path. */
	synchronize_srcu_expedited(&kvm->irq_srcu);

	/*
	 * Synchronize with the wait-queue and unhook ourselves to prevent
	 * further events.
	 */
	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);

	/*
	 * We know no new events will be scheduled at this point, so block
	 * until all previously outstanding events have completed
	 */
	flush_work(&irqfd->inject);

	if (irqfd->resampler) {
		irqfd_resampler_shutdown(irqfd);
		eventfd_ctx_put(irqfd->resamplefd);
	}

	/*
	 * It is now safe to release the object's resources
	 */
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
	irq_bypass_unregister_consumer(&irqfd->consumer);
#endif
	eventfd_ctx_put(irqfd->eventfd);
	kfree(irqfd);
}


static bool irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
{
	/*
	 * Assert that either irqfds.lock or SRCU is held, as irqfds.lock must
	 * be held to prevent false positives (on the irqfd being active), and
	 * while false negatives are impossible as irqfds are never added back
	 * to the list once they're deactivated, the caller must at least hold
	 * SRCU to guard against routing changes if the irqfd is deactivated.
	 */
	lockdep_assert_once(lockdep_is_held(&irqfd->kvm->irqfds.lock) ||
			    srcu_read_lock_held(&irqfd->kvm->irq_srcu));

	return list_empty(&irqfd->list) ? false : true;
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 */
static void irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
{
	lockdep_assert_held(&irqfd->kvm->irqfds.lock);

	BUG_ON(!irqfd_is_active(irqfd));

	list_del_init(&irqfd->list);

	queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}

int __attribute__((weak)) kvm_arch_set_irq_inatomic(
				struct kvm_kernel_irq_routing_entry *irq,
				struct kvm *kvm, int irq_source_id,
				int level,
				bool line_status)
{
	return -EWOULDBLOCK;
}

/*
 * Called with wqh->lock held and interrupts disabled
 */
static int
irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(wait, struct kvm_kernel_irqfd, wait);
	__poll_t flags = key_to_poll(key);
	struct kvm_kernel_irq_routing_entry irq;
	struct kvm *kvm = irqfd->kvm;
	unsigned seq;
	int idx;
	int ret = 0;

	if (flags & EPOLLIN) {
		/*
		 * WARNING: Do NOT take irqfds.lock in any path except EPOLLHUP,
		 * as KVM holds irqfds.lock when registering the irqfd with the
		 * eventfd.
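		 *
		 * This handler runs under wqh->lock, while kvm_irqfd_register()
		 * acquires wqh->lock (via add_wait_queue_priority_exclusive())
		 * while already holding irqfds.lock; taking irqfds.lock here
		 * would invert that ordering and risk deadlock.  The EPOLLHUP
		 * path below is the one exception, as it cannot run
		 * concurrently with registration (see the comment there).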
		 */
		u64 cnt;
		eventfd_ctx_do_read(irqfd->eventfd, &cnt);

		idx = srcu_read_lock(&kvm->irq_srcu);
		do {
			seq = read_seqcount_begin(&irqfd->irq_entry_sc);
			irq = irqfd->irq_entry;
		} while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));

		/*
		 * An event has been signaled, inject an interrupt unless the
		 * irqfd is being deassigned (isn't active), in which case the
		 * routing information may be stale (once the irqfd is removed
		 * from the list, it will stop receiving routing updates).
		 */
		if (unlikely(!irqfd_is_active(irqfd)) ||
		    kvm_arch_set_irq_inatomic(&irq, kvm,
					      KVM_USERSPACE_IRQ_SOURCE_ID, 1,
					      false) == -EWOULDBLOCK)
			schedule_work(&irqfd->inject);
		srcu_read_unlock(&kvm->irq_srcu, idx);
		ret = 1;
	}

	if (flags & EPOLLHUP) {
		/* The eventfd is closing, detach from KVM */
		unsigned long iflags;

		/*
		 * Taking irqfds.lock is safe here, as KVM holds a reference to
		 * the eventfd when registering the irqfd, i.e. this path can't
		 * be reached while kvm_irqfd_add() is running.
		 */
		spin_lock_irqsave(&kvm->irqfds.lock, iflags);

		/*
		 * We must check if someone deactivated the irqfd before
		 * we could acquire the irqfds.lock since the item is
		 * deactivated from the KVM side before it is unhooked from
		 * the wait-queue.  If it is already deactivated, we can
		 * simply return knowing the other side will cleanup for us.
		 * We cannot race against the irqfd going away since the
		 * other side is required to acquire wqh->lock, which we hold
		 */
		if (irqfd_is_active(irqfd))
			irqfd_deactivate(irqfd);

		spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
	}

	return ret;
}

static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_kernel_irq_routing_entry *e;
	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
	int n_entries;

	lockdep_assert_held(&kvm->irqfds.lock);

	n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);

	write_seqcount_begin(&irqfd->irq_entry_sc);

	e = entries;
	if (n_entries == 1)
		irqfd->irq_entry = *e;
	else
		irqfd->irq_entry.type = 0;

	write_seqcount_end(&irqfd->irq_entry_sc);
}

struct kvm_irqfd_pt {
	struct kvm_kernel_irqfd *irqfd;
	struct kvm *kvm;
	poll_table pt;
	int ret;
};

static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh,
			       poll_table *pt)
{
	struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt);
	struct kvm_kernel_irqfd *irqfd = p->irqfd;
	struct kvm *kvm = p->kvm;

	/*
	 * Note, irqfds.lock protects the irqfd's irq_entry, i.e. its routing,
	 * and irqfds.items.  It does NOT protect registering with the eventfd.
	 */
	spin_lock_irq(&kvm->irqfds.lock);

	/*
	 * Initialize the routing information prior to adding the irqfd to the
	 * eventfd's waitqueue, as irqfd_wakeup() can be invoked as soon as the
	 * irqfd is registered.
	 */
	irqfd_update(kvm, irqfd);

	/*
	 * Add the irqfd as a priority waiter on the eventfd, with a custom
	 * wake-up handler, so that KVM *and only KVM* is notified whenever the
	 * underlying eventfd is signaled.
	 */
	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);

	/*
	 * Temporarily lie to lockdep about holding irqfds.lock to avoid a
	 * false positive regarding potential deadlock with irqfd_wakeup()
	 * (see irqfd_wakeup() for details).
	 *
	 * Adding to the wait queue will fail if there is already a priority
	 * waiter, i.e. if the eventfd is associated with another irqfd (in any
	 * VM).  Note, kvm_irqfd_deassign() waits for all in-flight shutdown
	 * jobs to complete, i.e. ensures the irqfd has been removed from the
	 * eventfd's waitqueue before returning to userspace.
	 */
	spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_);
	p->ret = add_wait_queue_priority_exclusive(wqh, &irqfd->wait);
	spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_);
	if (p->ret)
		goto out;

	list_add_tail(&irqfd->list, &kvm->irqfds.items);

out:
	spin_unlock_irq(&kvm->irqfds.lock);
}

#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
void __attribute__((weak)) kvm_arch_irq_bypass_stop(
				struct irq_bypass_consumer *cons)
{
}

void __attribute__((weak)) kvm_arch_irq_bypass_start(
				struct irq_bypass_consumer *cons)
{
}

void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
					  struct kvm_kernel_irq_routing_entry *old,
					  struct kvm_kernel_irq_routing_entry *new)
{

}
#endif

static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
	struct kvm_kernel_irqfd *irqfd;
	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
	struct kvm_irqfd_pt irqfd_pt;
	int ret;
	__poll_t events;
	int idx;

	if (!kvm_arch_intc_initialized(kvm))
		return -EAGAIN;

	if (!kvm_arch_irqfd_allowed(kvm, args))
		return -EINVAL;

	irqfd = kzalloc_obj(*irqfd, GFP_KERNEL_ACCOUNT);
	if (!irqfd)
		return -ENOMEM;

	irqfd->kvm = kvm;
	irqfd->gsi = args->gsi;
	INIT_LIST_HEAD(&irqfd->list);
	INIT_WORK(&irqfd->inject, irqfd_inject);
	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
	seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);

	CLASS(fd, f)(args->fd);
	if (fd_empty(f)) {
		ret = -EBADF;
		goto out;
	}

	eventfd = eventfd_ctx_fileget(fd_file(f));
	if (IS_ERR(eventfd)) {
		ret = PTR_ERR(eventfd);
		goto out;
	}

	irqfd->eventfd = eventfd;

	if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
		struct kvm_kernel_irqfd_resampler *resampler;

		resamplefd = eventfd_ctx_fdget(args->resamplefd);
		if (IS_ERR(resamplefd)) {
			ret = PTR_ERR(resamplefd);
			goto fail;
		}

		irqfd->resamplefd = resamplefd;
		INIT_LIST_HEAD(&irqfd->resampler_link);

		mutex_lock(&kvm->irqfds.resampler_lock);

		list_for_each_entry(resampler,
				    &kvm->irqfds.resampler_list, link) {
			if (resampler->notifier.gsi == irqfd->gsi) {
				irqfd->resampler = resampler;
				break;
			}
		}

		if (!irqfd->resampler) {
			resampler = kzalloc_obj(*resampler, GFP_KERNEL_ACCOUNT);
			if (!resampler) {
				ret = -ENOMEM;
				mutex_unlock(&kvm->irqfds.resampler_lock);
				goto fail;
			}

			resampler->kvm = kvm;
			INIT_LIST_HEAD(&resampler->list);
			resampler->notifier.gsi = irqfd->gsi;
			resampler->notifier.irq_acked = irqfd_resampler_ack;
			INIT_LIST_HEAD(&resampler->link);

			list_add_rcu(&resampler->link, &kvm->irqfds.resampler_list);
			kvm_register_irq_ack_notifier(kvm,
						      &resampler->notifier);
			irqfd->resampler = resampler;
		}

		list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
		synchronize_srcu_expedited(&kvm->irq_srcu);

		mutex_unlock(&kvm->irqfds.resampler_lock);
	}

	/*
	 * Set the irqfd routing and add it to KVM's list before registering
	 * the irqfd with the eventfd, so that the routing information is valid
	 * and stays valid, e.g. if there are GSI routing changes, prior to
	 * making the irqfd visible, i.e. before it might be signaled.
	 *
	 * Note, holding SRCU ensures a stable read of routing information, and
	 * also prevents irqfd_shutdown() from freeing the irqfd before it's
	 * fully initialized.
	 */
	idx = srcu_read_lock(&kvm->irq_srcu);

	/*
	 * Register the irqfd with the eventfd by polling on the eventfd, and
	 * simultaneously add the irqfd to KVM's list.  If there was an event
	 * pending on the eventfd prior to registering, manually trigger IRQ
	 * injection.
	 */
	irqfd_pt.irqfd = irqfd;
	irqfd_pt.kvm = kvm;
	init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register);

	events = vfs_poll(fd_file(f), &irqfd_pt.pt);

	ret = irqfd_pt.ret;
	if (ret)
		goto fail_poll;

	if (events & EPOLLIN)
		schedule_work(&irqfd->inject);

#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
	if (kvm_arch_has_irq_bypass()) {
		irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
		irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
		irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
		irqfd->consumer.start = kvm_arch_irq_bypass_start;
		ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd);
		if (ret)
			pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n",
				irqfd->eventfd, ret);
	}
#endif

	srcu_read_unlock(&kvm->irq_srcu, idx);
	return 0;

fail_poll:
	srcu_read_unlock(&kvm->irq_srcu, idx);
fail:
	if (irqfd->resampler)
		irqfd_resampler_shutdown(irqfd);

	if (resamplefd && !IS_ERR(resamplefd))
		eventfd_ctx_put(resamplefd);

	if (eventfd && !IS_ERR(eventfd))
		eventfd_ctx_put(eventfd);

out:
	kfree(irqfd);
	return ret;
}

bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
	struct kvm_irq_ack_notifier *kian;
	int gsi, idx;

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1)
		hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
					  link, srcu_read_lock_held(&kvm->irq_srcu))
			if (kian->gsi == gsi) {
				srcu_read_unlock(&kvm->irq_srcu, idx);
				return true;
			}

	srcu_read_unlock(&kvm->irq_srcu, idx);

	return false;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_irq_has_notifier);

void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
{
	struct kvm_irq_ack_notifier *kian;

	hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
				  link, srcu_read_lock_held(&kvm->irq_srcu))
		if (kian->gsi == gsi)
			kian->irq_acked(kian);
}

void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
	int gsi, idx;

	trace_kvm_ack_irq(irqchip, pin);

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1)
		kvm_notify_acked_gsi(kvm, gsi);
	srcu_read_unlock(&kvm->irq_srcu, idx);
}

void kvm_register_irq_ack_notifier(struct kvm *kvm,
				   struct kvm_irq_ack_notifier *kian)
{
	mutex_lock(&kvm->irq_lock);
	hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
	mutex_unlock(&kvm->irq_lock);
	kvm_arch_post_irq_ack_notifier_list_update(kvm);
}

void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
				     struct kvm_irq_ack_notifier *kian)
{
	mutex_lock(&kvm->irq_lock);
	hlist_del_init_rcu(&kian->link);
	mutex_unlock(&kvm->irq_lock);
	synchronize_srcu_expedited(&kvm->irq_srcu);
	kvm_arch_post_irq_ack_notifier_list_update(kvm);
}

/*
 * Shut down any irqfds that match fd+gsi
 */
static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;
	struct eventfd_ctx *eventfd;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
		if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi)
			irqfd_deactivate(irqfd);
	}

	spin_unlock_irq(&kvm->irqfds.lock);
	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * so that we guarantee there will not be any more interrupts on this
	 * gsi once this deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}

int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
	if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
		return -EINVAL;

	if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
		return kvm_irqfd_deassign(kvm, args);

	return kvm_irqfd_assign(kvm, args);
}

/*
 * This function is called as the kvm VM fd is being released.  Shut down all
 * irqfds that still remain open.
 */
void
kvm_irqfd_release(struct kvm *kvm)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
		irqfd_deactivate(irqfd);

	spin_unlock_irq(&kvm->irqfds.lock);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * since we do not take a kvm* reference.
	 */
	flush_workqueue(irqfd_cleanup_wq);

}

/*
 * Take note of a change in irq routing.
 * Caller must invoke synchronize_srcu_expedited(&kvm->irq_srcu) afterwards.
 */
void kvm_irq_routing_update(struct kvm *kvm)
{
	struct kvm_kernel_irqfd *irqfd;

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
		/* Under irqfds.lock, so can read irq_entry safely */
		struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry;
#endif

		irqfd_update(kvm, irqfd);

#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
		if (irqfd->producer)
			kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry);
#endif
	}

	spin_unlock_irq(&kvm->irqfds.lock);
}

bool kvm_notify_irqfd_resampler(struct kvm *kvm,
				unsigned int irqchip,
				unsigned int pin)
{
	struct kvm_kernel_irqfd_resampler *resampler;
	int gsi, idx;

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1) {
		list_for_each_entry_srcu(resampler,
					 &kvm->irqfds.resampler_list, link,
					 srcu_read_lock_held(&kvm->irq_srcu)) {
			if (resampler->notifier.gsi == gsi) {
				irqfd_resampler_notify(resampler);
				srcu_read_unlock(&kvm->irq_srcu, idx);
				return true;
			}
		}
	}
	srcu_read_unlock(&kvm->irq_srcu, idx);

	return false;
}

/*
 * create a host-wide workqueue for issuing deferred shutdown requests
 * aggregated from all vm* instances. We need our own isolated
 * queue to ease flushing work items when a VM exits.
 */
int kvm_irqfd_init(void)
{
	irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", WQ_PERCPU, 0);
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

void kvm_irqfd_exit(void)
{
	destroy_workqueue(irqfd_cleanup_wq);
}
#endif

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 *
 * userspace can register a PIO/MMIO address with an eventfd for receiving
 * notification when the memory has been touched.
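 *
 * As an illustrative sketch only (the values and the "efd"/"vm_fd" names
 * below are hypothetical, not taken from this file), a userspace VMM would
 * typically wire this up roughly as:
 *
 *	struct kvm_ioeventfd io = {
 *		.addr      = 0xfe003000,
 *		.len       = 4,
 *		.fd        = efd,
 *		.flags     = KVM_IOEVENTFD_FLAG_DATAMATCH,
 *		.datamatch = 0x1,
 *	};
 *	ioctl(vm_fd, KVM_IOEVENTFD, &io);
 *
 * kvm_ioeventfd() below dispatches such requests to the assign or deassign
 * path based on KVM_IOEVENTFD_FLAG_DEASSIGN; see kvm_assign_ioeventfd() for
 * the accepted lengths (0, 1, 2, 4 or 8 bytes) and flag validation.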
 * --------------------------------------------------------------------
 */

struct _ioeventfd {
	struct list_head list;
	u64 addr;
	int length;
	struct eventfd_ctx *eventfd;
	u64 datamatch;
	struct kvm_io_device dev;
	u8 bus_idx;
	bool wildcard;
};

static inline struct _ioeventfd *
to_ioeventfd(struct kvm_io_device *dev)
{
	return container_of(dev, struct _ioeventfd, dev);
}

static void
ioeventfd_release(struct _ioeventfd *p)
{
	eventfd_ctx_put(p->eventfd);
	list_del(&p->list);
	kfree(p);
}

static bool
ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
{
	u64 _val;

	if (addr != p->addr)
		/* address must be precise for a hit */
		return false;

	if (!p->length)
		/* length = 0 means only look at the address, so always a hit */
		return true;

	if (len != p->length)
		/* address-range must be precise for a hit */
		return false;

	if (p->wildcard)
		/* all else equal, wildcard is always a hit */
		return true;

	/* otherwise, we have to actually compare the data */

	BUG_ON(!IS_ALIGNED((unsigned long)val, len));

	switch (len) {
	case 1:
		_val = *(u8 *)val;
		break;
	case 2:
		_val = *(u16 *)val;
		break;
	case 4:
		_val = *(u32 *)val;
		break;
	case 8:
		_val = *(u64 *)val;
		break;
	default:
		return false;
	}

	return _val == p->datamatch;
}

/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
		int len, const void *val)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	if (!ioeventfd_in_range(p, addr, len, val))
		return -EOPNOTSUPP;

	eventfd_signal(p->eventfd);
	return 0;
}

/*
 * This function is called as KVM is completely shutting down.  We do not
 * need to worry about locking; just nuke anything we have as quickly as possible
 */
static void
ioeventfd_destructor(struct kvm_io_device *this)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	ioeventfd_release(p);
}

static const struct kvm_io_device_ops ioeventfd_ops = {
	.write      = ioeventfd_write,
	.destructor = ioeventfd_destructor,
};

/* assumes kvm->slots_lock held */
static bool
ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
{
	struct _ioeventfd *_p;

	list_for_each_entry(_p, &kvm->ioeventfds, list)
		if (_p->bus_idx == p->bus_idx &&
		    _p->addr == p->addr &&
		    (!_p->length || !p->length ||
		     (_p->length == p->length &&
		      (_p->wildcard || p->wildcard ||
		       _p->datamatch == p->datamatch))))
			return true;

	return false;
}

static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags)
{
	if (flags & KVM_IOEVENTFD_FLAG_PIO)
		return KVM_PIO_BUS;
	if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
		return KVM_VIRTIO_CCW_NOTIFY_BUS;
	return KVM_MMIO_BUS;
}

static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
				    enum kvm_bus bus_idx,
				    struct kvm_ioeventfd *args)
{

	struct eventfd_ctx *eventfd;
	struct _ioeventfd *p;
	int ret;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	p = kzalloc_obj(*p, GFP_KERNEL_ACCOUNT);
	if (!p) {
		ret = -ENOMEM;
		goto fail;
	}

	INIT_LIST_HEAD(&p->list);
	p->addr = args->addr;
	p->bus_idx = bus_idx;
	p->length = args->len;
	p->eventfd = eventfd;

	/* The datamatch feature is optional, otherwise this is a wildcard */
	if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
		p->datamatch = args->datamatch;
	else
		p->wildcard = true;

	mutex_lock(&kvm->slots_lock);

	/* Verify that there isn't a match already */
	if (ioeventfd_check_collision(kvm, p)) {
		ret = -EEXIST;
		goto unlock_fail;
	}

	kvm_iodevice_init(&p->dev, &ioeventfd_ops);

	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
				      &p->dev);
	if (ret < 0)
		goto unlock_fail;

	kvm_get_bus(kvm, bus_idx)->ioeventfd_count++;
	list_add_tail(&p->list, &kvm->ioeventfds);

	mutex_unlock(&kvm->slots_lock);

	return 0;

unlock_fail:
	mutex_unlock(&kvm->slots_lock);
	kfree(p);

fail:
	eventfd_ctx_put(eventfd);

	return ret;
}

static int
kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
			   struct kvm_ioeventfd *args)
{
	struct _ioeventfd *p;
	struct eventfd_ctx *eventfd;
	struct kvm_io_bus *bus;
	int ret = -ENOENT;
	bool wildcard;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);

	mutex_lock(&kvm->slots_lock);

	list_for_each_entry(p, &kvm->ioeventfds, list) {
		if (p->bus_idx != bus_idx ||
		    p->eventfd != eventfd ||
		    p->addr != args->addr ||
		    p->length != args->len ||
		    p->wildcard != wildcard)
			continue;

		if (!p->wildcard && p->datamatch != args->datamatch)
			continue;

		kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
		bus = kvm_get_bus(kvm, bus_idx);
		if (bus)
			bus->ioeventfd_count--;
		ret = 0;
		break;
	}

	mutex_unlock(&kvm->slots_lock);

	eventfd_ctx_put(eventfd);

	return ret;
}
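
/*
 * Note: a zero-length MMIO ioeventfd is registered on both KVM_MMIO_BUS and
 * KVM_FAST_MMIO_BUS (see kvm_assign_ioeventfd()), so the deassign path below
 * mirrors that and drops the KVM_FAST_MMIO_BUS entry as well.
 */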

static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	enum kvm_bus bus_idx = ioeventfd_bus_from_flags(args->flags);
	int ret = kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);

	if (!args->len && bus_idx == KVM_MMIO_BUS)
		kvm_deassign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);

	return ret;
}

static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	enum kvm_bus bus_idx;
	int ret;

	bus_idx = ioeventfd_bus_from_flags(args->flags);
	/* must be natural-word sized, or 0 to ignore length */
	switch (args->len) {
	case 0:
	case 1:
	case 2:
	case 4:
	case 8:
		break;
	default:
		return -EINVAL;
	}

	/* check for range overflow */
	if (args->addr + args->len < args->addr)
		return -EINVAL;

	/* check for extra flags that we don't understand */
	if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
		return -EINVAL;

	/* ioeventfd with no length can't be combined with DATAMATCH */
	if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
		return -EINVAL;

	ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
	if (ret)
		goto fail;

	/* When length is ignored, MMIO is also put on a separate bus, for
	 * faster lookups.
	 */
	if (!args->len && bus_idx == KVM_MMIO_BUS) {
		ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
		if (ret < 0)
			goto fast_fail;
	}

	return 0;

fast_fail:
	kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);
fail:
	return ret;
}

int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
		return kvm_deassign_ioeventfd(kvm, args);

	return kvm_assign_ioeventfd(kvm, args);
}

void
kvm_eventfd_init(struct kvm *kvm)
{
#ifdef CONFIG_HAVE_KVM_IRQCHIP
	spin_lock_init(&kvm->irqfds.lock);
	INIT_LIST_HEAD(&kvm->irqfds.items);
	INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
	mutex_init(&kvm->irqfds.resampler_lock);
#endif
	INIT_LIST_HEAD(&kvm->ioeventfds);
}