// SPDX-License-Identifier: GPL-2.0-only
/*
 * eventfd support for mshv
 *
 * Heavily inspired by the KVM implementation of irqfd/ioeventfd. The basic
 * framework code is taken from the KVM implementation.
 *
 * All credit to the KVM developers.
 */

#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/eventfd.h>

#if IS_ENABLED(CONFIG_X86_64)
#include <asm/apic.h>
#endif
#include <asm/mshyperv.h>

#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"

static struct workqueue_struct *irqfd_cleanup_wq;

void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
				    struct mshv_irq_ack_notifier *mian)
{
	mutex_lock(&partition->pt_irq_lock);
	hlist_add_head_rcu(&mian->link, &partition->irq_ack_notifier_list);
	mutex_unlock(&partition->pt_irq_lock);
}

void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition,
				      struct mshv_irq_ack_notifier *mian)
{
	mutex_lock(&partition->pt_irq_lock);
	hlist_del_init_rcu(&mian->link);
	mutex_unlock(&partition->pt_irq_lock);
	synchronize_rcu();
}

bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi)
{
	struct mshv_irq_ack_notifier *mian;
	bool acked = false;

	rcu_read_lock();
	hlist_for_each_entry_rcu(mian, &partition->irq_ack_notifier_list,
				 link) {
		if (mian->irq_ack_gsi == gsi) {
			mian->irq_acked(mian);
			acked = true;
		}
	}
	rcu_read_unlock();

	return acked;
}

#if IS_ENABLED(CONFIG_ARM64)
static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
{
	return false;
}
#elif IS_ENABLED(CONFIG_X86_64)
static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
{
	return type == HV_X64_INTERRUPT_TYPE_EXTINT;
}
#endif

static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian)
{
	struct mshv_irqfd_resampler *resampler;
	struct mshv_partition *partition;
	struct mshv_irqfd *irqfd;
	int idx;

	resampler = container_of(mian, struct mshv_irqfd_resampler,
				 rsmplr_notifier);
	partition = resampler->rsmplr_partn;

	idx = srcu_read_lock(&partition->pt_irq_srcu);

	hlist_for_each_entry_rcu(irqfd, &resampler->rsmplr_irqfd_list,
				 irqfd_resampler_hnode) {
		if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type))
			hv_call_clear_virtual_interrupt(partition->pt_id);

		eventfd_signal(irqfd->irqfd_resamplefd);
	}

	srcu_read_unlock(&partition->pt_irq_srcu, idx);
}

#if IS_ENABLED(CONFIG_X86_64)
static bool
mshv_vp_irq_vector_injected(union hv_vp_register_page_interrupt_vectors iv,
			    u32 vector)
{
	int i;

	for (i = 0; i < iv.vector_count; i++) {
		if (iv.vector[i] == vector)
			return true;
	}

	return false;
}

static int mshv_vp_irq_try_set_vector(struct mshv_vp *vp, u32 vector)
{
	union hv_vp_register_page_interrupt_vectors iv, new_iv;

	iv = vp->vp_register_page->interrupt_vectors;
	new_iv = iv;

	if (mshv_vp_irq_vector_injected(iv, vector))
		return 0;

	if (iv.vector_count >= HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT)
		return -ENOSPC;

	new_iv.vector[new_iv.vector_count++] = vector;

	if (cmpxchg(&vp->vp_register_page->interrupt_vectors.as_uint64,
		    iv.as_uint64, new_iv.as_uint64) != iv.as_uint64)
		return -EAGAIN;

	return 0;
}
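
/*
 * Retry the lock-free update of the shared vector array until it takes
 * effect, the array is full, or a reschedule is pending; in the latter
 * two cases the caller falls back to the slow (hypercall) path.
 */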
static int mshv_vp_irq_set_vector(struct mshv_vp *vp, u32 vector)
{
	int ret;

	do {
		ret = mshv_vp_irq_try_set_vector(vp, vector);
	} while (ret == -EAGAIN && !need_resched());

	return ret;
}

/*
 * Try to raise an irq for the guest via the shared vector array; the
 * hypervisor does the actual injection of the interrupt.
 */
static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
{
	struct mshv_partition *partition = irqfd->irqfd_partn;
	struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
	struct mshv_vp *vp;

	if (!(ms_hyperv.ext_features &
	      HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE))
		return -EOPNOTSUPP;

	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return -EOPNOTSUPP;

	if (irq->lapic_control.logical_dest_mode)
		return -EOPNOTSUPP;

	vp = partition->pt_vp_array[irq->lapic_apic_id];

	if (!vp->vp_register_page)
		return -EOPNOTSUPP;

	if (mshv_vp_irq_set_vector(vp, irq->lapic_vector))
		return -EINVAL;

	if (vp->run.flags.root_sched_dispatched &&
	    vp->vp_register_page->interrupt_vectors.as_uint64)
		return -EBUSY;

	wake_up(&vp->run.vp_suspend_queue);

	return 0;
}
#else /* CONFIG_X86_64 */
static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
{
	return -EOPNOTSUPP;
}
#endif

static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
{
	struct mshv_partition *partition = irqfd->irqfd_partn;
	struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
	unsigned int seq;
	int idx;

	WARN_ON(irqfd->irqfd_resampler &&
		!irq->lapic_control.level_triggered);

	idx = srcu_read_lock(&partition->pt_irq_srcu);
	if (irqfd->irqfd_girq_ent.guest_irq_num) {
		if (!irqfd->irqfd_girq_ent.girq_entry_valid) {
			srcu_read_unlock(&partition->pt_irq_srcu, idx);
			return;
		}

		do {
			seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
		} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
	}

	hv_call_assert_virtual_interrupt(irqfd->irqfd_partn->pt_id,
					 irq->lapic_vector, irq->lapic_apic_id,
					 irq->lapic_control);
	srcu_read_unlock(&partition->pt_irq_srcu, idx);
}
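
/*
 * Detach the irqfd from its resampler's list. If the resampler has no
 * irqfds left, unregister its irq ack notifier and free it.
 */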
static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd)
{
	struct mshv_irqfd_resampler *rp = irqfd->irqfd_resampler;
	struct mshv_partition *pt = rp->rsmplr_partn;

	mutex_lock(&pt->irqfds_resampler_lock);

	hlist_del_rcu(&irqfd->irqfd_resampler_hnode);
	synchronize_srcu(&pt->pt_irq_srcu);

	if (hlist_empty(&rp->rsmplr_irqfd_list)) {
		hlist_del(&rp->rsmplr_hnode);
		mshv_unregister_irq_ack_notifier(pt, &rp->rsmplr_notifier);
		kfree(rp);
	}

	mutex_unlock(&pt->irqfds_resampler_lock);
}

/*
 * Race-free decouple logic (ordering is critical)
 */
static void mshv_irqfd_shutdown(struct work_struct *work)
{
	struct mshv_irqfd *irqfd =
		container_of(work, struct mshv_irqfd, irqfd_shutdown);

	/*
	 * Synchronize with the wait-queue and unhook ourselves to prevent
	 * further events.
	 */
	remove_wait_queue(irqfd->irqfd_wqh, &irqfd->irqfd_wait);

	if (irqfd->irqfd_resampler) {
		mshv_irqfd_resampler_shutdown(irqfd);
		eventfd_ctx_put(irqfd->irqfd_resamplefd);
	}

	/*
	 * It is now safe to release the object's resources
	 */
	eventfd_ctx_put(irqfd->irqfd_eventfd_ctx);
	kfree(irqfd);
}

/* assumes partition->pt_irqfds_lock is held */
static bool mshv_irqfd_is_active(struct mshv_irqfd *irqfd)
{
	return !hlist_unhashed(&irqfd->irqfd_hnode);
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 *
 * assumes partition->pt_irqfds_lock is held
 */
static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd)
{
	if (!mshv_irqfd_is_active(irqfd))
		return;

	hlist_del(&irqfd->irqfd_hnode);

	queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown);
}

/*
 * Called with wqh->lock held and interrupts disabled
 */
static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
			     int sync, void *key)
{
	struct mshv_irqfd *irqfd = container_of(wait, struct mshv_irqfd,
						irqfd_wait);
	unsigned long flags = (unsigned long)key;
	int idx;
	unsigned int seq;
	struct mshv_partition *pt = irqfd->irqfd_partn;
	int ret = 0;

	if (flags & POLLIN) {
		u64 cnt;

		eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt);
		idx = srcu_read_lock(&pt->pt_irq_srcu);
		do {
			seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
		} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));

		/* An event has been signaled, raise an interrupt */
		ret = mshv_try_assert_irq_fast(irqfd);
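		/* Fall back to the hypercall-based slow path on failure */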
		if (ret)
			mshv_assert_irq_slow(irqfd);

		srcu_read_unlock(&pt->pt_irq_srcu, idx);

		ret = 1;
	}

	if (flags & POLLHUP) {
		/* The eventfd is closing, detach from the partition */
		unsigned long flags;

		spin_lock_irqsave(&pt->pt_irqfds_lock, flags);

		/*
		 * We must check if someone deactivated the irqfd before
		 * we could acquire the pt_irqfds_lock since the item is
		 * deactivated from the mshv side before it is unhooked from
		 * the wait-queue. If it is already deactivated, we can
		 * simply return knowing the other side will clean up for us.
		 * We cannot race against the irqfd going away since the
		 * other side is required to acquire wqh->lock, which we hold.
		 */
		if (mshv_irqfd_is_active(irqfd))
			mshv_irqfd_deactivate(irqfd);

		spin_unlock_irqrestore(&pt->pt_irqfds_lock, flags);
	}

	return ret;
}

/* Must be called under pt_irqfds_lock */
static void mshv_irqfd_update(struct mshv_partition *pt,
			      struct mshv_irqfd *irqfd)
{
	write_seqcount_begin(&irqfd->irqfd_irqe_sc);
	irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt,
						    irqfd->irqfd_irqnum);
	mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq);
	write_seqcount_end(&irqfd->irqfd_irqe_sc);
}

void mshv_irqfd_routing_update(struct mshv_partition *pt)
{
	struct mshv_irqfd *irqfd;

	spin_lock_irq(&pt->pt_irqfds_lock);
	hlist_for_each_entry(irqfd, &pt->pt_irqfds_list, irqfd_hnode)
		mshv_irqfd_update(pt, irqfd);
	spin_unlock_irq(&pt->pt_irqfds_lock);
}

static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh,
				  poll_table *polltbl)
{
	struct mshv_irqfd *irqfd =
		container_of(polltbl, struct mshv_irqfd, irqfd_polltbl);

	irqfd->irqfd_wqh = wqh;
	add_wait_queue_priority(wqh, &irqfd->irqfd_wait);
}
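
/*
 * Wire a new irqfd up: hook the eventfd's wait-queue through poll so that
 * a signal on the eventfd asserts the configured guest interrupt.
 */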
static int mshv_irqfd_assign(struct mshv_partition *pt,
			     struct mshv_user_irqfd *args)
{
	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
	struct mshv_irqfd *irqfd, *tmp;
	unsigned int events;
	struct fd f;
	int ret;
	int idx;

	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
	if (!irqfd)
		return -ENOMEM;

	irqfd->irqfd_partn = pt;
	irqfd->irqfd_irqnum = args->gsi;
	INIT_WORK(&irqfd->irqfd_shutdown, mshv_irqfd_shutdown);
	seqcount_spinlock_init(&irqfd->irqfd_irqe_sc, &pt->pt_irqfds_lock);

	f = fdget(args->fd);
	if (!fd_file(f)) {
		ret = -EBADF;
		goto out;
	}

	eventfd = eventfd_ctx_fileget(fd_file(f));
	if (IS_ERR(eventfd)) {
		ret = PTR_ERR(eventfd);
		goto fail;
	}

	irqfd->irqfd_eventfd_ctx = eventfd;

	if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE)) {
		struct mshv_irqfd_resampler *rp;

		resamplefd = eventfd_ctx_fdget(args->resamplefd);
		if (IS_ERR(resamplefd)) {
			ret = PTR_ERR(resamplefd);
			goto fail;
		}

		irqfd->irqfd_resamplefd = resamplefd;

		mutex_lock(&pt->irqfds_resampler_lock);

		hlist_for_each_entry(rp, &pt->irqfds_resampler_list,
				     rsmplr_hnode) {
			if (rp->rsmplr_notifier.irq_ack_gsi ==
			    irqfd->irqfd_irqnum) {
				irqfd->irqfd_resampler = rp;
				break;
			}
		}

		if (!irqfd->irqfd_resampler) {
			rp = kzalloc(sizeof(*rp), GFP_KERNEL_ACCOUNT);
			if (!rp) {
				ret = -ENOMEM;
				mutex_unlock(&pt->irqfds_resampler_lock);
				goto fail;
			}

			rp->rsmplr_partn = pt;
			INIT_HLIST_HEAD(&rp->rsmplr_irqfd_list);
			rp->rsmplr_notifier.irq_ack_gsi = irqfd->irqfd_irqnum;
			rp->rsmplr_notifier.irq_acked =
						mshv_irqfd_resampler_ack;

			hlist_add_head(&rp->rsmplr_hnode,
				       &pt->irqfds_resampler_list);
			mshv_register_irq_ack_notifier(pt,
						       &rp->rsmplr_notifier);
			irqfd->irqfd_resampler = rp;
		}

		hlist_add_head_rcu(&irqfd->irqfd_resampler_hnode,
				   &irqfd->irqfd_resampler->rsmplr_irqfd_list);

		mutex_unlock(&pt->irqfds_resampler_lock);
	}

	/*
	 * Install our own custom wake-up handling so we are notified via
	 * a callback whenever someone signals the underlying eventfd
	 */
	init_waitqueue_func_entry(&irqfd->irqfd_wait, mshv_irqfd_wakeup);
	init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc);

	spin_lock_irq(&pt->pt_irqfds_lock);
	if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) &&
	    !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) {
		/*
		 * A resample fd is only valid for a level-triggered
		 * interrupt; fail otherwise.
		 */
		spin_unlock_irq(&pt->pt_irqfds_lock);
		ret = -EINVAL;
		goto fail;
	}
	ret = 0;
	hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
		if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
			continue;
		/* This fd is used for another irq already. */
		ret = -EBUSY;
		spin_unlock_irq(&pt->pt_irqfds_lock);
		goto fail;
	}

	idx = srcu_read_lock(&pt->pt_irq_srcu);
	mshv_irqfd_update(pt, irqfd);
	hlist_add_head(&irqfd->irqfd_hnode, &pt->pt_irqfds_list);
	spin_unlock_irq(&pt->pt_irqfds_lock);

	/*
	 * Check if there was an event already pending on the eventfd
	 * before we registered, and trigger it as if we didn't miss it.
	 */
	events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl);

	if (events & POLLIN)
		mshv_assert_irq_slow(irqfd);

	srcu_read_unlock(&pt->pt_irq_srcu, idx);
	/*
	 * Do not drop the file until the irqfd is fully initialized;
	 * otherwise we might race against the POLLHUP.
	 */
	fdput(f);

	return 0;

fail:
	if (irqfd->irqfd_resampler)
		mshv_irqfd_resampler_shutdown(irqfd);

	if (resamplefd && !IS_ERR(resamplefd))
		eventfd_ctx_put(resamplefd);

	if (eventfd && !IS_ERR(eventfd))
		eventfd_ctx_put(eventfd);

	fdput(f);

out:
	kfree(irqfd);
	return ret;
}

/*
 * Shut down any irqfds that match fd+gsi
 */
static int mshv_irqfd_deassign(struct mshv_partition *pt,
			       struct mshv_user_irqfd *args)
{
	struct mshv_irqfd *irqfd;
	struct hlist_node *n;
	struct eventfd_ctx *eventfd;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list,
				  irqfd_hnode) {
		if (irqfd->irqfd_eventfd_ctx == eventfd &&
		    irqfd->irqfd_irqnum == args->gsi)
			mshv_irqfd_deactivate(irqfd);
	}

	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * so that we guarantee there will not be any more interrupts on this
	 * gsi once this deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}

int mshv_set_unset_irqfd(struct mshv_partition *pt,
			 struct mshv_user_irqfd *args)
{
	if (args->flags & ~MSHV_IRQFD_FLAGS_MASK)
		return -EINVAL;

	if (args->flags & BIT(MSHV_IRQFD_BIT_DEASSIGN))
		return mshv_irqfd_deassign(pt, args);

	return mshv_irqfd_assign(pt, args);
}

/*
 * This function is called as the mshv VM fd is being released.
 * Shut down all irqfds that still remain open.
 */
static void mshv_irqfd_release(struct mshv_partition *pt)
{
	struct mshv_irqfd *irqfd;
	struct hlist_node *n;

	spin_lock_irq(&pt->pt_irqfds_lock);

	hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, irqfd_hnode)
		mshv_irqfd_deactivate(irqfd);

	spin_unlock_irq(&pt->pt_irqfds_lock);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * since we do not take a mshv_partition* reference.
	 */
	flush_workqueue(irqfd_cleanup_wq);
}
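
/*
 * The wait-queue callback runs in atomic context, so irqfd shutdown is
 * deferred to this dedicated workqueue.
 */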
int mshv_irqfd_wq_init(void)
{
	irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", 0, 0);
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

void mshv_irqfd_wq_cleanup(void)
{
	destroy_workqueue(irqfd_cleanup_wq);
}

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate an MMIO write into an eventfd signal.
 *
 * Userspace can register an MMIO address with an eventfd to receive a
 * notification when the memory has been written.
 * --------------------------------------------------------------------
 */

static void ioeventfd_release(struct mshv_ioeventfd *p, u64 partition_id)
{
	if (p->iovntfd_doorbell_id > 0)
		mshv_unregister_doorbell(partition_id, p->iovntfd_doorbell_id);
	eventfd_ctx_put(p->iovntfd_eventfd);
	kfree(p);
}

/* MMIO writes trigger an event if the addr/val match */
static void ioeventfd_mmio_write(int doorbell_id, void *data)
{
	struct mshv_partition *partition = (struct mshv_partition *)data;
	struct mshv_ioeventfd *p;

	rcu_read_lock();
	hlist_for_each_entry_rcu(p, &partition->ioeventfds_list, iovntfd_hnode)
		if (p->iovntfd_doorbell_id == doorbell_id) {
			eventfd_signal(p->iovntfd_eventfd);
			break;
		}

	rcu_read_unlock();
}

static bool ioeventfd_check_collision(struct mshv_partition *pt,
				      struct mshv_ioeventfd *p)
	__must_hold(&pt->mutex)
{
	struct mshv_ioeventfd *_p;

	hlist_for_each_entry(_p, &pt->ioeventfds_list, iovntfd_hnode)
		if (_p->iovntfd_addr == p->iovntfd_addr &&
		    _p->iovntfd_length == p->iovntfd_length &&
		    (_p->iovntfd_wildcard || p->iovntfd_wildcard ||
		     _p->iovntfd_datamatch == p->iovntfd_datamatch))
			return true;

	return false;
}
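
/*
 * Register a doorbell with the hypervisor for the guest address (and the
 * optional datamatch value); the doorbell callback signals the
 * user-supplied eventfd.
 */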
static int mshv_assign_ioeventfd(struct mshv_partition *pt,
				 struct mshv_user_ioeventfd *args)
	__must_hold(&pt->mutex)
{
	struct mshv_ioeventfd *p;
	struct eventfd_ctx *eventfd;
	u64 doorbell_flags = 0;
	int ret;

	/* pt_mutex is currently protecting the ioeventfds_list */
	WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));

	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
		return -EOPNOTSUPP;

	/* must be natural-word sized */
	switch (args->len) {
	case 0:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_ANY;
		break;
	case 1:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_BYTE;
		break;
	case 2:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_WORD;
		break;
	case 4:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_DWORD;
		break;
	case 8:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_QWORD;
		break;
	default:
		return -EINVAL;
	}

	/* check for range overflow */
	if (args->addr + args->len < args->addr)
		return -EINVAL;

	/* check for extra flags that we don't understand */
	if (args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK)
		return -EINVAL;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p) {
		ret = -ENOMEM;
		goto fail;
	}

	p->iovntfd_addr = args->addr;
	p->iovntfd_length = args->len;
	p->iovntfd_eventfd = eventfd;

	/* The datamatch feature is optional, otherwise this is a wildcard */
	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)) {
		p->iovntfd_datamatch = args->datamatch;
	} else {
		p->iovntfd_wildcard = true;
		doorbell_flags |= HV_DOORBELL_FLAG_TRIGGER_ANY_VALUE;
	}

	if (ioeventfd_check_collision(pt, p)) {
		ret = -EEXIST;
		goto unlock_fail;
	}

	ret = mshv_register_doorbell(pt->pt_id, ioeventfd_mmio_write,
				     (void *)pt, p->iovntfd_addr,
				     p->iovntfd_datamatch, doorbell_flags);
	if (ret < 0)
		goto unlock_fail;

	p->iovntfd_doorbell_id = ret;

	hlist_add_head_rcu(&p->iovntfd_hnode, &pt->ioeventfds_list);

	return 0;

unlock_fail:
	kfree(p);

fail:
	eventfd_ctx_put(eventfd);

	return ret;
}

static int mshv_deassign_ioeventfd(struct mshv_partition *pt,
				   struct mshv_user_ioeventfd *args)
	__must_hold(&pt->mutex)
{
	struct mshv_ioeventfd *p;
	struct eventfd_ctx *eventfd;
	struct hlist_node *n;
	int ret = -ENOENT;

	/* pt_mutex is currently protecting the ioeventfds_list */
	WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	hlist_for_each_entry_safe(p, n, &pt->ioeventfds_list, iovntfd_hnode) {
		bool wildcard = !(args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH));

		if (p->iovntfd_eventfd != eventfd ||
		    p->iovntfd_addr != args->addr ||
		    p->iovntfd_length != args->len ||
		    p->iovntfd_wildcard != wildcard)
			continue;

		if (!p->iovntfd_wildcard &&
		    p->iovntfd_datamatch != args->datamatch)
			continue;

		hlist_del_rcu(&p->iovntfd_hnode);
		synchronize_rcu();
		ioeventfd_release(p, pt->pt_id);
		ret = 0;
		break;
	}

	eventfd_ctx_put(eventfd);

	return ret;
}

int mshv_set_unset_ioeventfd(struct mshv_partition *pt,
			     struct mshv_user_ioeventfd *args)
	__must_hold(&pt->mutex)
{
	if ((args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) ||
	    mshv_field_nonzero(*args, rsvd))
		return -EINVAL;

	/* PIO not yet implemented */
	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
		return -EOPNOTSUPP;

	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DEASSIGN))
		return mshv_deassign_ioeventfd(pt, args);

	return mshv_assign_ioeventfd(pt, args);
}

void mshv_eventfd_init(struct mshv_partition *pt)
{
	spin_lock_init(&pt->pt_irqfds_lock);
	INIT_HLIST_HEAD(&pt->pt_irqfds_list);

	INIT_HLIST_HEAD(&pt->irqfds_resampler_list);
	mutex_init(&pt->irqfds_resampler_lock);

	INIT_HLIST_HEAD(&pt->ioeventfds_list);
}

void mshv_eventfd_release(struct mshv_partition *pt)
{
	struct hlist_head items;
	struct hlist_node *n;
	struct mshv_ioeventfd *p;

	hlist_move_list(&pt->ioeventfds_list, &items);
	synchronize_rcu();

	hlist_for_each_entry_safe(p, n, &items, iovntfd_hnode) {
		hlist_del(&p->iovntfd_hnode);
		ioeventfd_release(p, pt->pt_id);
	}

	mshv_irqfd_release(pt);
}