/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2017 Joyent, Inc.
 * Copyright 2024 Oxide Computer Company
 */

/*
 * Support for the eventfd facility, a Linux-borne facility for user-generated
 * file descriptor-based events.
 */

#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/eventfd.h>
#include <sys/conf.h>
#include <sys/vmem.h>
#include <sys/sysmacros.h>
#include <sys/filio.h>
#include <sys/stat.h>
#include <sys/file.h>

struct eventfd_state;
typedef struct eventfd_state eventfd_state_t;

struct eventfd_state {
	kmutex_t efd_lock;		/* lock protecting state */
	boolean_t efd_semaphore;	/* boolean: sema. semantics */
	kcondvar_t efd_cv;		/* condvar */
	pollhead_t efd_pollhd;		/* poll head */
	uint64_t efd_value;		/* value */
	size_t efd_bwriters;		/* count of blocked writers */
	eventfd_state_t *efd_next;	/* next state on global list */
};

/*
 * Internal global variables.
 */
static kmutex_t		eventfd_lock;		/* lock protecting state */
static dev_info_t	*eventfd_devi;		/* device info */
static vmem_t		*eventfd_minor;		/* minor number arena */
static void		*eventfd_softstate;	/* softstate pointer */
static eventfd_state_t	*eventfd_state;		/* global list of state */

static int
eventfd_open(dev_t *devp, int flag __unused, int otyp __unused,
    cred_t *cr __unused)
{
	eventfd_state_t *state;
	major_t major = getemajor(*devp);
	minor_t minor = getminor(*devp);

	if (minor != EVENTFDMNRN_EVENTFD)
		return (ENXIO);

	mutex_enter(&eventfd_lock);

	minor = (minor_t)(uintptr_t)vmem_alloc(eventfd_minor, 1,
	    VM_BESTFIT | VM_SLEEP);

	if (ddi_soft_state_zalloc(eventfd_softstate, minor) != DDI_SUCCESS) {
		vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
		mutex_exit(&eventfd_lock);
		return (ENXIO);
	}

	state = ddi_get_soft_state(eventfd_softstate, minor);
	*devp = makedevice(major, minor);

	state->efd_next = eventfd_state;
	eventfd_state = state;

	mutex_exit(&eventfd_lock);

	return (0);
}

static int
eventfd_read(dev_t dev, uio_t *uio, cred_t *cr __unused)
{
	eventfd_state_t *state;
	minor_t minor = getminor(dev);
	uint64_t val, oval;
	int err;

	if (uio->uio_resid < sizeof (val))
		return (EINVAL);

	state = ddi_get_soft_state(eventfd_softstate, minor);

	mutex_enter(&state->efd_lock);

	while (state->efd_value == 0) {
		if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
			mutex_exit(&state->efd_lock);
			return (EAGAIN);
		}

		if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
			mutex_exit(&state->efd_lock);
			return (EINTR);
		}
	}

	/*
	 * We have a non-zero value and we own the lock; our behavior now
	 * depends on whether or not EFD_SEMAPHORE was set when the eventfd
	 * was created.
	 */
	val = oval = state->efd_value;

	if (state->efd_semaphore) {
		state->efd_value--;
		val = 1;
	} else {
		state->efd_value = 0;
	}

	err = uiomove(&val, sizeof (val), UIO_READ, uio);

	/*
	 * Wake any writers blocked on this eventfd as this read operation may
	 * have created adequate capacity for their values.
	 */
	if (state->efd_bwriters != 0) {
		cv_broadcast(&state->efd_cv);
	}
	mutex_exit(&state->efd_lock);

	/*
	 * It is necessary to emit POLLOUT events only when the eventfd
	 * transitions from EVENTFD_VALMAX to a lower value.  At all other
	 * times, it is already considered writable by poll.
	 */
	if (oval == EVENTFD_VALMAX) {
		pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT);
	}

	return (err);
}
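/*
 * Illustrative sketch (not part of the driver): the eventfd value is a
 * 64-bit counter, and the EFD_SEMAPHORE setting only changes how much of
 * it a read consumes.  Assuming a userland descriptor obtained through a
 * wrapper such as eventfd(3C), the two modes behave roughly as follows:
 *
 *	uint64_t add = 3, out;
 *	(void) write(fd, &add, sizeof (add));	// counter becomes 3
 *
 *	// Default (counter) semantics: read returns 3, counter resets to 0.
 *	// EFD_SEMAPHORE semantics: read returns 1, counter drops to 2.
 *	(void) read(fd, &out, sizeof (out));
 */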
static int
eventfd_write(dev_t dev, struct uio *uio, cred_t *cr __unused)
{
	eventfd_state_t *state;
	minor_t minor = getminor(dev);
	uint64_t val, oval;
	int err;

	if (uio->uio_resid < sizeof (val))
		return (EINVAL);

	if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0)
		return (err);

	if (val > EVENTFD_VALMAX)
		return (EINVAL);

	state = ddi_get_soft_state(eventfd_softstate, minor);

	mutex_enter(&state->efd_lock);

	while (val > EVENTFD_VALMAX - state->efd_value) {
		if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
			mutex_exit(&state->efd_lock);
			return (EAGAIN);
		}

		state->efd_bwriters++;
		if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
			state->efd_bwriters--;
			mutex_exit(&state->efd_lock);
			return (EINTR);
		}
		state->efd_bwriters--;
	}

	/*
	 * We now know that we can add the value without overflowing.
	 */
	state->efd_value = (oval = state->efd_value) + val;

	/*
	 * If the value was previously "empty", notify blocked readers that
	 * data is available.
	 */
	if (oval == 0) {
		cv_broadcast(&state->efd_cv);
	}

	mutex_exit(&state->efd_lock);

	/*
	 * Notify pollers that something has changed.
	 */
	pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN);

	return (0);
}

static int
eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	eventfd_state_t *state;
	minor_t minor = getminor(dev);
	short revents = 0;

	state = ddi_get_soft_state(eventfd_softstate, minor);

	mutex_enter(&state->efd_lock);

	if (state->efd_value > 0)
		revents |= POLLRDNORM | POLLIN;

	if (state->efd_value < EVENTFD_VALMAX)
		revents |= POLLWRNORM | POLLOUT;

	*reventsp = revents & events;
	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
		*phpp = &state->efd_pollhd;
	}

	mutex_exit(&state->efd_lock);

	return (0);
}

static int
eventfd_ioctl(dev_t dev, int cmd, intptr_t arg __unused, int md __unused,
    cred_t *cr __unused, int *rv __unused)
{
	eventfd_state_t *state;
	minor_t minor = getminor(dev);

	state = ddi_get_soft_state(eventfd_softstate, minor);

	switch (cmd) {
	case EVENTFDIOC_SEMAPHORE: {
		mutex_enter(&state->efd_lock);
		state->efd_semaphore ^= 1;
		mutex_exit(&state->efd_lock);

		return (0);
	}

	default:
		break;
	}

	return (ENOTTY);
}

static int
eventfd_close(dev_t dev, int flag __unused, int otyp __unused,
    cred_t *cr __unused)
{
	eventfd_state_t *state, **sp;
	minor_t minor = getminor(dev);

	state = ddi_get_soft_state(eventfd_softstate, minor);

	if (state->efd_pollhd.ph_list != NULL) {
		pollwakeup(&state->efd_pollhd, POLLERR);
		pollhead_clean(&state->efd_pollhd);
	}

	mutex_enter(&eventfd_lock);

	/*
	 * Remove our state from our global list.
	 */
	for (sp = &eventfd_state; *sp != state; sp = &((*sp)->efd_next))
		VERIFY(*sp != NULL);

	*sp = (*sp)->efd_next;

	ddi_soft_state_free(eventfd_softstate, minor);
	vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);

	mutex_exit(&eventfd_lock);

	return (0);
}
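/*
 * Illustrative sketch (not part of the driver): a userland wrapper is
 * assumed to obtain a descriptor by opening the /dev/eventfd minor node
 * created below (the open routine then clones a per-instance minor), and
 * to arm semaphore semantics by issuing EVENTFDIOC_SEMAPHORE once:
 *
 *	int fd = open("/dev/eventfd", O_RDWR);
 *	if (fd >= 0 && (flags & EFD_SEMAPHORE))
 *		(void) ioctl(fd, EVENTFDIOC_SEMAPHORE, 0);
 */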
static int
eventfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	switch (cmd) {
	case DDI_ATTACH:
		break;

	case DDI_RESUME:
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

	mutex_enter(&eventfd_lock);

	if (ddi_soft_state_init(&eventfd_softstate,
	    sizeof (eventfd_state_t), 0) != 0) {
		cmn_err(CE_NOTE, "/dev/eventfd failed to create soft state");
		mutex_exit(&eventfd_lock);
		return (DDI_FAILURE);
	}

	if (ddi_create_minor_node(devi, "eventfd", S_IFCHR,
	    EVENTFDMNRN_EVENTFD, DDI_PSEUDO, 0) == DDI_FAILURE) {
		cmn_err(CE_NOTE, "/dev/eventfd couldn't create minor node");
		ddi_soft_state_fini(&eventfd_softstate);
		mutex_exit(&eventfd_lock);
		return (DDI_FAILURE);
	}

	ddi_report_dev(devi);
	eventfd_devi = devi;

	eventfd_minor = vmem_create("eventfd_minor", (void *)EVENTFDMNRN_CLONE,
	    UINT32_MAX - EVENTFDMNRN_CLONE, 1, NULL, NULL, NULL, 0,
	    VM_SLEEP | VMC_IDENTIFIER);

	mutex_exit(&eventfd_lock);

	return (DDI_SUCCESS);
}

static int
eventfd_detach(dev_info_t *dip __unused, ddi_detach_cmd_t cmd)
{
	switch (cmd) {
	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

	mutex_enter(&eventfd_lock);
	vmem_destroy(eventfd_minor);

	ddi_remove_minor_node(eventfd_devi, NULL);
	eventfd_devi = NULL;

	ddi_soft_state_fini(&eventfd_softstate);
	mutex_exit(&eventfd_lock);

	return (DDI_SUCCESS);
}

static int
eventfd_info(dev_info_t *dip __unused, ddi_info_cmd_t infocmd,
    void *arg __unused, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)eventfd_devi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}

	return (error);
}

static struct cb_ops eventfd_cb_ops = {
	eventfd_open,		/* open */
	eventfd_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	eventfd_read,		/* read */
	eventfd_write,		/* write */
	eventfd_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	eventfd_poll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab */
	D_NEW | D_MP		/* Driver compatibility flag */
};

static struct dev_ops eventfd_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	eventfd_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	eventfd_attach,		/* attach */
	eventfd_detach,		/* detach */
	nodev,			/* reset */
	&eventfd_cb_ops,	/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* dev power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,		/* module type (this is a pseudo driver) */
	"eventfd support",	/* name of module */
	&eventfd_ops,		/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}