1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2017 Joyent, Inc. 14 */ 15 16 /* 17 * Support for the eventfd facility, a Linux-borne facility for user-generated 18 * file descriptor-based events. 19 */ 20 21 #include <sys/ddi.h> 22 #include <sys/sunddi.h> 23 #include <sys/eventfd.h> 24 #include <sys/conf.h> 25 #include <sys/vmem.h> 26 #include <sys/sysmacros.h> 27 #include <sys/filio.h> 28 #include <sys/stat.h> 29 #include <sys/file.h> 30 31 struct eventfd_state; 32 typedef struct eventfd_state eventfd_state_t; 33 34 struct eventfd_state { 35 kmutex_t efd_lock; /* lock protecting state */ 36 boolean_t efd_semaphore; /* boolean: sema. semantics */ 37 kcondvar_t efd_cv; /* condvar */ 38 pollhead_t efd_pollhd; /* poll head */ 39 uint64_t efd_value; /* value */ 40 size_t efd_bwriters; /* count of blocked writers */ 41 eventfd_state_t *efd_next; /* next state on global list */ 42 }; 43 44 /* 45 * Internal global variables. 46 */ 47 static kmutex_t eventfd_lock; /* lock protecting state */ 48 static dev_info_t *eventfd_devi; /* device info */ 49 static vmem_t *eventfd_minor; /* minor number arena */ 50 static void *eventfd_softstate; /* softstate pointer */ 51 static eventfd_state_t *eventfd_state; /* global list of state */ 52 53 /*ARGSUSED*/ 54 static int 55 eventfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) 56 { 57 eventfd_state_t *state; 58 major_t major = getemajor(*devp); 59 minor_t minor = getminor(*devp); 60 61 if (minor != EVENTFDMNRN_EVENTFD) 62 return (ENXIO); 63 64 mutex_enter(&eventfd_lock); 65 66 minor = (minor_t)(uintptr_t)vmem_alloc(eventfd_minor, 1, 67 VM_BESTFIT | VM_SLEEP); 68 69 if (ddi_soft_state_zalloc(eventfd_softstate, minor) != DDI_SUCCESS) { 70 vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1); 71 mutex_exit(&eventfd_lock); 72 return (ENXIO); 73 } 74 75 state = ddi_get_soft_state(eventfd_softstate, minor); 76 *devp = makedevice(major, minor); 77 78 state->efd_next = eventfd_state; 79 eventfd_state = state; 80 81 mutex_exit(&eventfd_lock); 82 83 return (0); 84 } 85 86 /*ARGSUSED*/ 87 static int 88 eventfd_read(dev_t dev, uio_t *uio, cred_t *cr) 89 { 90 eventfd_state_t *state; 91 minor_t minor = getminor(dev); 92 uint64_t val, oval; 93 int err; 94 95 if (uio->uio_resid < sizeof (val)) 96 return (EINVAL); 97 98 state = ddi_get_soft_state(eventfd_softstate, minor); 99 100 mutex_enter(&state->efd_lock); 101 102 while (state->efd_value == 0) { 103 if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { 104 mutex_exit(&state->efd_lock); 105 return (EAGAIN); 106 } 107 108 if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) { 109 mutex_exit(&state->efd_lock); 110 return (EINTR); 111 } 112 } 113 114 /* 115 * We have a non-zero value and we own the lock; our behavior now 116 * depends on whether or not EFD_SEMAPHORE was set when the eventfd 117 * was created. 118 */ 119 val = oval = state->efd_value; 120 121 if (state->efd_semaphore) { 122 state->efd_value--; 123 val = 1; 124 } else { 125 state->efd_value = 0; 126 } 127 128 err = uiomove(&val, sizeof (val), UIO_READ, uio); 129 130 /* 131 * Wake any writers blocked on this eventfd as this read operation may 132 * have created adequate capacity for their values. 133 */ 134 if (state->efd_bwriters != 0) { 135 cv_broadcast(&state->efd_cv); 136 } 137 mutex_exit(&state->efd_lock); 138 139 /* 140 * It is necessary to emit POLLOUT events only when the eventfd 141 * transitions from EVENTFD_VALMAX to a lower value. At all other 142 * times, it is already considered writable by poll. 143 */ 144 if (oval == EVENTFD_VALMAX) { 145 pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT); 146 } 147 148 return (err); 149 } 150 151 /*ARGSUSED*/ 152 static int 153 eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) 154 { 155 eventfd_state_t *state; 156 minor_t minor = getminor(dev); 157 uint64_t val, oval; 158 int err; 159 160 if (uio->uio_resid < sizeof (val)) 161 return (EINVAL); 162 163 if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0) 164 return (err); 165 166 if (val > EVENTFD_VALMAX) 167 return (EINVAL); 168 169 state = ddi_get_soft_state(eventfd_softstate, minor); 170 171 mutex_enter(&state->efd_lock); 172 173 while (val > EVENTFD_VALMAX - state->efd_value) { 174 if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { 175 mutex_exit(&state->efd_lock); 176 return (EAGAIN); 177 } 178 179 state->efd_bwriters++; 180 if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) { 181 state->efd_bwriters--; 182 mutex_exit(&state->efd_lock); 183 return (EINTR); 184 } 185 state->efd_bwriters--; 186 } 187 188 /* 189 * We now know that we can add the value without overflowing. 190 */ 191 state->efd_value = (oval = state->efd_value) + val; 192 193 /* 194 * If the value was previously "empty", notify blocked readers that 195 * data is available. 196 */ 197 if (oval == 0) { 198 cv_broadcast(&state->efd_cv); 199 } 200 mutex_exit(&state->efd_lock); 201 202 /* 203 * Notify pollers as well if the eventfd is now readable. 204 */ 205 if (oval == 0) { 206 pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN); 207 } 208 209 return (0); 210 } 211 212 /*ARGSUSED*/ 213 static int 214 eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp, 215 struct pollhead **phpp) 216 { 217 eventfd_state_t *state; 218 minor_t minor = getminor(dev); 219 short revents = 0; 220 221 state = ddi_get_soft_state(eventfd_softstate, minor); 222 223 mutex_enter(&state->efd_lock); 224 225 if (state->efd_value > 0) 226 revents |= POLLRDNORM | POLLIN; 227 228 if (state->efd_value < EVENTFD_VALMAX) 229 revents |= POLLWRNORM | POLLOUT; 230 231 *reventsp = revents & events; 232 if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { 233 *phpp = &state->efd_pollhd; 234 } 235 236 mutex_exit(&state->efd_lock); 237 238 return (0); 239 } 240 241 /*ARGSUSED*/ 242 static int 243 eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) 244 { 245 eventfd_state_t *state; 246 minor_t minor = getminor(dev); 247 248 state = ddi_get_soft_state(eventfd_softstate, minor); 249 250 switch (cmd) { 251 case EVENTFDIOC_SEMAPHORE: { 252 mutex_enter(&state->efd_lock); 253 state->efd_semaphore ^= 1; 254 mutex_exit(&state->efd_lock); 255 256 return (0); 257 } 258 259 default: 260 break; 261 } 262 263 return (ENOTTY); 264 } 265 266 /*ARGSUSED*/ 267 static int 268 eventfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p) 269 { 270 eventfd_state_t *state, **sp; 271 minor_t minor = getminor(dev); 272 273 state = ddi_get_soft_state(eventfd_softstate, minor); 274 275 if (state->efd_pollhd.ph_list != NULL) { 276 pollwakeup(&state->efd_pollhd, POLLERR); 277 pollhead_clean(&state->efd_pollhd); 278 } 279 280 mutex_enter(&eventfd_lock); 281 282 /* 283 * Remove our state from our global list. 284 */ 285 for (sp = &eventfd_state; *sp != state; sp = &((*sp)->efd_next)) 286 VERIFY(*sp != NULL); 287 288 *sp = (*sp)->efd_next; 289 290 ddi_soft_state_free(eventfd_softstate, minor); 291 vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1); 292 293 mutex_exit(&eventfd_lock); 294 295 return (0); 296 } 297 298 static int 299 eventfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) 300 { 301 switch (cmd) { 302 case DDI_ATTACH: 303 break; 304 305 case DDI_RESUME: 306 return (DDI_SUCCESS); 307 308 default: 309 return (DDI_FAILURE); 310 } 311 312 mutex_enter(&eventfd_lock); 313 314 if (ddi_soft_state_init(&eventfd_softstate, 315 sizeof (eventfd_state_t), 0) != 0) { 316 cmn_err(CE_NOTE, "/dev/eventfd failed to create soft state"); 317 mutex_exit(&eventfd_lock); 318 return (DDI_FAILURE); 319 } 320 321 if (ddi_create_minor_node(devi, "eventfd", S_IFCHR, 322 EVENTFDMNRN_EVENTFD, DDI_PSEUDO, 0) == DDI_FAILURE) { 323 cmn_err(CE_NOTE, "/dev/eventfd couldn't create minor node"); 324 ddi_soft_state_fini(&eventfd_softstate); 325 mutex_exit(&eventfd_lock); 326 return (DDI_FAILURE); 327 } 328 329 ddi_report_dev(devi); 330 eventfd_devi = devi; 331 332 eventfd_minor = vmem_create("eventfd_minor", (void *)EVENTFDMNRN_CLONE, 333 UINT32_MAX - EVENTFDMNRN_CLONE, 1, NULL, NULL, NULL, 0, 334 VM_SLEEP | VMC_IDENTIFIER); 335 336 mutex_exit(&eventfd_lock); 337 338 return (DDI_SUCCESS); 339 } 340 341 /*ARGSUSED*/ 342 static int 343 eventfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 344 { 345 switch (cmd) { 346 case DDI_DETACH: 347 break; 348 349 case DDI_SUSPEND: 350 return (DDI_SUCCESS); 351 352 default: 353 return (DDI_FAILURE); 354 } 355 356 mutex_enter(&eventfd_lock); 357 vmem_destroy(eventfd_minor); 358 359 ddi_remove_minor_node(eventfd_devi, NULL); 360 eventfd_devi = NULL; 361 362 ddi_soft_state_fini(&eventfd_softstate); 363 mutex_exit(&eventfd_lock); 364 365 return (DDI_SUCCESS); 366 } 367 368 /*ARGSUSED*/ 369 static int 370 eventfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) 371 { 372 int error; 373 374 switch (infocmd) { 375 case DDI_INFO_DEVT2DEVINFO: 376 *result = (void *)eventfd_devi; 377 error = DDI_SUCCESS; 378 break; 379 case DDI_INFO_DEVT2INSTANCE: 380 *result = (void *)0; 381 error = DDI_SUCCESS; 382 break; 383 default: 384 error = DDI_FAILURE; 385 } 386 return (error); 387 } 388 389 static struct cb_ops eventfd_cb_ops = { 390 eventfd_open, /* open */ 391 eventfd_close, /* close */ 392 nulldev, /* strategy */ 393 nulldev, /* print */ 394 nodev, /* dump */ 395 eventfd_read, /* read */ 396 eventfd_write, /* write */ 397 eventfd_ioctl, /* ioctl */ 398 nodev, /* devmap */ 399 nodev, /* mmap */ 400 nodev, /* segmap */ 401 eventfd_poll, /* poll */ 402 ddi_prop_op, /* cb_prop_op */ 403 0, /* streamtab */ 404 D_NEW | D_MP /* Driver compatibility flag */ 405 }; 406 407 static struct dev_ops eventfd_ops = { 408 DEVO_REV, /* devo_rev */ 409 0, /* refcnt */ 410 eventfd_info, /* get_dev_info */ 411 nulldev, /* identify */ 412 nulldev, /* probe */ 413 eventfd_attach, /* attach */ 414 eventfd_detach, /* detach */ 415 nodev, /* reset */ 416 &eventfd_cb_ops, /* driver operations */ 417 NULL, /* bus operations */ 418 nodev, /* dev power */ 419 ddi_quiesce_not_needed, /* quiesce */ 420 }; 421 422 static struct modldrv modldrv = { 423 &mod_driverops, /* module type (this is a pseudo driver) */ 424 "eventfd support", /* name of module */ 425 &eventfd_ops, /* driver ops */ 426 }; 427 428 static struct modlinkage modlinkage = { 429 MODREV_1, 430 (void *)&modldrv, 431 NULL 432 }; 433 434 int 435 _init(void) 436 { 437 return (mod_install(&modlinkage)); 438 } 439 440 int 441 _info(struct modinfo *modinfop) 442 { 443 return (mod_info(&modlinkage, modinfop)); 444 } 445 446 int 447 _fini(void) 448 { 449 return (mod_remove(&modlinkage)); 450 } 451