1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2017 Joyent, Inc. 14 * Copyright 2024 Oxide Computer Company 15 */ 16 17 /* 18 * Support for the eventfd facility, a Linux-borne facility for user-generated 19 * file descriptor-based events. 20 */ 21 22 #include <sys/ddi.h> 23 #include <sys/sunddi.h> 24 #include <sys/eventfd.h> 25 #include <sys/conf.h> 26 #include <sys/vmem.h> 27 #include <sys/sysmacros.h> 28 #include <sys/filio.h> 29 #include <sys/stat.h> 30 #include <sys/file.h> 31 32 struct eventfd_state; 33 typedef struct eventfd_state eventfd_state_t; 34 35 struct eventfd_state { 36 kmutex_t efd_lock; /* lock protecting state */ 37 boolean_t efd_semaphore; /* boolean: sema. semantics */ 38 kcondvar_t efd_cv; /* condvar */ 39 pollhead_t efd_pollhd; /* poll head */ 40 uint64_t efd_value; /* value */ 41 size_t efd_bwriters; /* count of blocked writers */ 42 eventfd_state_t *efd_next; /* next state on global list */ 43 }; 44 45 /* 46 * Internal global variables. 47 */ 48 static kmutex_t eventfd_lock; /* lock protecting state */ 49 static dev_info_t *eventfd_devi; /* device info */ 50 static vmem_t *eventfd_minor; /* minor number arena */ 51 static void *eventfd_softstate; /* softstate pointer */ 52 static eventfd_state_t *eventfd_state; /* global list of state */ 53 54 static int 55 eventfd_open(dev_t *devp, int flag __unused, int otyp __unused, 56 cred_t *cr __unused) 57 { 58 eventfd_state_t *state; 59 major_t major = getemajor(*devp); 60 minor_t minor = getminor(*devp); 61 62 if (minor != EVENTFDMNRN_EVENTFD) 63 return (ENXIO); 64 65 mutex_enter(&eventfd_lock); 66 67 minor = (minor_t)(uintptr_t)vmem_alloc(eventfd_minor, 1, 68 VM_BESTFIT | VM_SLEEP); 69 70 if (ddi_soft_state_zalloc(eventfd_softstate, minor) != DDI_SUCCESS) { 71 vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1); 72 mutex_exit(&eventfd_lock); 73 return (ENXIO); 74 } 75 76 state = ddi_get_soft_state(eventfd_softstate, minor); 77 *devp = makedevice(major, minor); 78 79 state->efd_next = eventfd_state; 80 eventfd_state = state; 81 82 mutex_exit(&eventfd_lock); 83 84 return (0); 85 } 86 87 static int 88 eventfd_read(dev_t dev, uio_t *uio, cred_t *cr __unused) 89 { 90 eventfd_state_t *state; 91 minor_t minor = getminor(dev); 92 uint64_t val, oval; 93 int err; 94 95 if (uio->uio_resid < sizeof (val)) 96 return (EINVAL); 97 98 state = ddi_get_soft_state(eventfd_softstate, minor); 99 100 mutex_enter(&state->efd_lock); 101 102 while (state->efd_value == 0) { 103 if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { 104 mutex_exit(&state->efd_lock); 105 return (EAGAIN); 106 } 107 108 if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) { 109 mutex_exit(&state->efd_lock); 110 return (EINTR); 111 } 112 } 113 114 /* 115 * We have a non-zero value and we own the lock; our behavior now 116 * depends on whether or not EFD_SEMAPHORE was set when the eventfd 117 * was created. 118 */ 119 val = oval = state->efd_value; 120 121 if (state->efd_semaphore) { 122 state->efd_value--; 123 val = 1; 124 } else { 125 state->efd_value = 0; 126 } 127 128 err = uiomove(&val, sizeof (val), UIO_READ, uio); 129 130 /* 131 * Wake any writers blocked on this eventfd as this read operation may 132 * have created adequate capacity for their values. 133 */ 134 if (state->efd_bwriters != 0) { 135 cv_broadcast(&state->efd_cv); 136 } 137 mutex_exit(&state->efd_lock); 138 139 /* 140 * It is necessary to emit POLLOUT events only when the eventfd 141 * transitions from EVENTFD_VALMAX to a lower value. At all other 142 * times, it is already considered writable by poll. 143 */ 144 if (oval == EVENTFD_VALMAX) { 145 pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT); 146 } 147 148 return (err); 149 } 150 151 static int 152 eventfd_write(dev_t dev, struct uio *uio, cred_t *cr __unused) 153 { 154 eventfd_state_t *state; 155 minor_t minor = getminor(dev); 156 uint64_t val, oval; 157 int err; 158 159 if (uio->uio_resid < sizeof (val)) 160 return (EINVAL); 161 162 if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0) 163 return (err); 164 165 if (val > EVENTFD_VALMAX) 166 return (EINVAL); 167 168 state = ddi_get_soft_state(eventfd_softstate, minor); 169 170 mutex_enter(&state->efd_lock); 171 172 while (val > EVENTFD_VALMAX - state->efd_value) { 173 if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { 174 mutex_exit(&state->efd_lock); 175 return (EAGAIN); 176 } 177 178 state->efd_bwriters++; 179 if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) { 180 state->efd_bwriters--; 181 mutex_exit(&state->efd_lock); 182 return (EINTR); 183 } 184 state->efd_bwriters--; 185 } 186 187 /* 188 * We now know that we can add the value without overflowing. 189 */ 190 state->efd_value = (oval = state->efd_value) + val; 191 192 /* 193 * If the value was previously "empty", notify blocked readers that 194 * data is available. 195 */ 196 if (oval == 0) { 197 cv_broadcast(&state->efd_cv); 198 } 199 mutex_exit(&state->efd_lock); 200 201 /* 202 * Notify pollers that something has changed. 203 */ 204 pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN); 205 206 return (0); 207 } 208 209 static int 210 eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp, 211 struct pollhead **phpp) 212 { 213 eventfd_state_t *state; 214 minor_t minor = getminor(dev); 215 short revents = 0; 216 217 state = ddi_get_soft_state(eventfd_softstate, minor); 218 219 mutex_enter(&state->efd_lock); 220 221 if (state->efd_value > 0) 222 revents |= POLLRDNORM | POLLIN; 223 224 if (state->efd_value < EVENTFD_VALMAX) 225 revents |= POLLWRNORM | POLLOUT; 226 227 *reventsp = revents & events; 228 if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { 229 *phpp = &state->efd_pollhd; 230 } 231 232 mutex_exit(&state->efd_lock); 233 234 return (0); 235 } 236 237 static int 238 eventfd_ioctl(dev_t dev, int cmd, intptr_t arg __unused, int md __unused, 239 cred_t *cr __unused, int *rv __unused) 240 { 241 eventfd_state_t *state; 242 minor_t minor = getminor(dev); 243 244 state = ddi_get_soft_state(eventfd_softstate, minor); 245 246 switch (cmd) { 247 case EVENTFDIOC_SEMAPHORE: { 248 mutex_enter(&state->efd_lock); 249 state->efd_semaphore ^= 1; 250 mutex_exit(&state->efd_lock); 251 252 return (0); 253 } 254 255 default: 256 break; 257 } 258 259 return (ENOTTY); 260 } 261 262 static int 263 eventfd_close(dev_t dev, int flag __unused, int otyp __unused, 264 cred_t *cr __unused) 265 { 266 eventfd_state_t *state, **sp; 267 minor_t minor = getminor(dev); 268 269 state = ddi_get_soft_state(eventfd_softstate, minor); 270 271 if (state->efd_pollhd.ph_list != NULL) { 272 pollwakeup(&state->efd_pollhd, POLLERR); 273 pollhead_clean(&state->efd_pollhd); 274 } 275 276 mutex_enter(&eventfd_lock); 277 278 /* 279 * Remove our state from our global list. 280 */ 281 for (sp = &eventfd_state; *sp != state; sp = &((*sp)->efd_next)) 282 VERIFY(*sp != NULL); 283 284 *sp = (*sp)->efd_next; 285 286 ddi_soft_state_free(eventfd_softstate, minor); 287 vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1); 288 289 mutex_exit(&eventfd_lock); 290 291 return (0); 292 } 293 294 static int 295 eventfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) 296 { 297 switch (cmd) { 298 case DDI_ATTACH: 299 break; 300 301 case DDI_RESUME: 302 return (DDI_SUCCESS); 303 304 default: 305 return (DDI_FAILURE); 306 } 307 308 mutex_enter(&eventfd_lock); 309 310 if (ddi_soft_state_init(&eventfd_softstate, 311 sizeof (eventfd_state_t), 0) != 0) { 312 cmn_err(CE_NOTE, "/dev/eventfd failed to create soft state"); 313 mutex_exit(&eventfd_lock); 314 return (DDI_FAILURE); 315 } 316 317 if (ddi_create_minor_node(devi, "eventfd", S_IFCHR, 318 EVENTFDMNRN_EVENTFD, DDI_PSEUDO, 0) == DDI_FAILURE) { 319 cmn_err(CE_NOTE, "/dev/eventfd couldn't create minor node"); 320 ddi_soft_state_fini(&eventfd_softstate); 321 mutex_exit(&eventfd_lock); 322 return (DDI_FAILURE); 323 } 324 325 ddi_report_dev(devi); 326 eventfd_devi = devi; 327 328 eventfd_minor = vmem_create("eventfd_minor", (void *)EVENTFDMNRN_CLONE, 329 UINT32_MAX - EVENTFDMNRN_CLONE, 1, NULL, NULL, NULL, 0, 330 VM_SLEEP | VMC_IDENTIFIER); 331 332 mutex_exit(&eventfd_lock); 333 334 return (DDI_SUCCESS); 335 } 336 337 static int 338 eventfd_detach(dev_info_t *dip __unused, ddi_detach_cmd_t cmd) 339 { 340 switch (cmd) { 341 case DDI_DETACH: 342 break; 343 344 case DDI_SUSPEND: 345 return (DDI_SUCCESS); 346 347 default: 348 return (DDI_FAILURE); 349 } 350 351 mutex_enter(&eventfd_lock); 352 vmem_destroy(eventfd_minor); 353 354 ddi_remove_minor_node(eventfd_devi, NULL); 355 eventfd_devi = NULL; 356 357 ddi_soft_state_fini(&eventfd_softstate); 358 mutex_exit(&eventfd_lock); 359 360 return (DDI_SUCCESS); 361 } 362 363 static int 364 eventfd_info(dev_info_t *dip __unused, ddi_info_cmd_t infocmd, 365 void *arg __unused, void **result) 366 { 367 int error; 368 369 switch (infocmd) { 370 case DDI_INFO_DEVT2DEVINFO: 371 *result = (void *)eventfd_devi; 372 error = DDI_SUCCESS; 373 break; 374 case DDI_INFO_DEVT2INSTANCE: 375 *result = (void *)0; 376 error = DDI_SUCCESS; 377 break; 378 default: 379 error = DDI_FAILURE; 380 } 381 return (error); 382 } 383 384 static struct cb_ops eventfd_cb_ops = { 385 eventfd_open, /* open */ 386 eventfd_close, /* close */ 387 nulldev, /* strategy */ 388 nulldev, /* print */ 389 nodev, /* dump */ 390 eventfd_read, /* read */ 391 eventfd_write, /* write */ 392 eventfd_ioctl, /* ioctl */ 393 nodev, /* devmap */ 394 nodev, /* mmap */ 395 nodev, /* segmap */ 396 eventfd_poll, /* poll */ 397 ddi_prop_op, /* cb_prop_op */ 398 0, /* streamtab */ 399 D_NEW | D_MP /* Driver compatibility flag */ 400 }; 401 402 static struct dev_ops eventfd_ops = { 403 DEVO_REV, /* devo_rev */ 404 0, /* refcnt */ 405 eventfd_info, /* get_dev_info */ 406 nulldev, /* identify */ 407 nulldev, /* probe */ 408 eventfd_attach, /* attach */ 409 eventfd_detach, /* detach */ 410 nodev, /* reset */ 411 &eventfd_cb_ops, /* driver operations */ 412 NULL, /* bus operations */ 413 nodev, /* dev power */ 414 ddi_quiesce_not_needed, /* quiesce */ 415 }; 416 417 static struct modldrv modldrv = { 418 &mod_driverops, /* module type (this is a pseudo driver) */ 419 "eventfd support", /* name of module */ 420 &eventfd_ops, /* driver ops */ 421 }; 422 423 static struct modlinkage modlinkage = { 424 MODREV_1, 425 (void *)&modldrv, 426 NULL 427 }; 428 429 int 430 _init(void) 431 { 432 return (mod_install(&modlinkage)); 433 } 434 435 int 436 _info(struct modinfo *modinfop) 437 { 438 return (mod_info(&modlinkage, modinfop)); 439 } 440 441 int 442 _fini(void) 443 { 444 return (mod_remove(&modlinkage)); 445 } 446