/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2017 Joyent, Inc.
 * Copyright 2024 Oxide Computer Company
 */

/*
 * Support for the eventfd facility, a Linux-borne facility for user-generated
 * file descriptor-based events.
 */
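
/*
 * For orientation, a minimal userland usage sketch (assuming the eventfd(3C)
 * libc wrapper; this snippet is illustrative and not part of the driver):
 *
 *	#include <sys/eventfd.h>
 *	#include <unistd.h>
 *
 *	int fd = eventfd(0, 0);			// counter starts at 0
 *	uint64_t one = 1;
 *	(void) write(fd, &one, sizeof (one));	// add 1 to the counter
 *	uint64_t val;
 *	(void) read(fd, &val, sizeof (val));	// val == 1; counter resets to 0
 *	(void) close(fd);
 */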

#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/eventfd.h>
#include <sys/conf.h>
#include <sys/vmem.h>
#include <sys/sysmacros.h>
#include <sys/filio.h>
#include <sys/stat.h>
#include <sys/file.h>

struct eventfd_state;
typedef struct eventfd_state eventfd_state_t;

struct eventfd_state {
	kmutex_t efd_lock;		/* lock protecting state */
	boolean_t efd_semaphore;	/* boolean: sema. semantics */
	kcondvar_t efd_cv;		/* condvar */
	pollhead_t efd_pollhd;		/* poll head */
	uint64_t efd_value;		/* value */
	size_t efd_bwriters;		/* count of blocked writers */
	eventfd_state_t *efd_next;	/* next state on global list */
};

/*
 * Internal global variables.
 */
static kmutex_t eventfd_lock;		/* lock protecting state */
static dev_info_t *eventfd_devi;	/* device info */
static vmem_t *eventfd_minor;		/* minor number arena */
static void *eventfd_softstate;		/* softstate pointer */
static eventfd_state_t *eventfd_state;	/* global list of state */

static int
eventfd_open(dev_t *devp, int flag __unused, int otyp __unused,
    cred_t *cr __unused)
{
	eventfd_state_t *state;
	major_t major = getemajor(*devp);
	minor_t minor = getminor(*devp);

	if (minor != EVENTFDMNRN_EVENTFD)
		return (ENXIO);

	mutex_enter(&eventfd_lock);

	minor = (minor_t)(uintptr_t)vmem_alloc(eventfd_minor, 1,
	    VM_BESTFIT | VM_SLEEP);

	if (ddi_soft_state_zalloc(eventfd_softstate, minor) != DDI_SUCCESS) {
		vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
		mutex_exit(&eventfd_lock);
		return (ENXIO);
	}

	state = ddi_get_soft_state(eventfd_softstate, minor);
	*devp = makedevice(major, minor);

	state->efd_next = eventfd_state;
	eventfd_state = state;

	mutex_exit(&eventfd_lock);

	return (0);
}

static int
eventfd_read(dev_t dev, uio_t *uio, cred_t *cr __unused)
{
	eventfd_state_t *state;
	minor_t minor = getminor(dev);
	uint64_t val, oval;
	int err;

	if (uio->uio_resid < sizeof (val))
		return (EINVAL);

	state = ddi_get_soft_state(eventfd_softstate, minor);

	mutex_enter(&state->efd_lock);

	while (state->efd_value == 0) {
		if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
			mutex_exit(&state->efd_lock);
			return (EAGAIN);
		}

		if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
			mutex_exit(&state->efd_lock);
			return (EINTR);
		}
	}

	/*
	 * We have a non-zero value and we own the lock; our behavior now
	 * depends on whether or not EFD_SEMAPHORE was set when the eventfd
	 * was created.
	 */
	val = oval = state->efd_value;

	if (state->efd_semaphore) {
		state->efd_value--;
		val = 1;
	} else {
		state->efd_value = 0;
	}

	err = uiomove(&val, sizeof (val), UIO_READ, uio);

	/*
	 * Wake any writers blocked on this eventfd as this read operation may
	 * have created adequate capacity for their values.
	 */
	if (state->efd_bwriters != 0) {
		cv_broadcast(&state->efd_cv);
	}
	mutex_exit(&state->efd_lock);

	/*
	 * It is necessary to emit POLLOUT events only when the eventfd
	 * transitions from EVENTFD_VALMAX to a lower value. At all other
	 * times, it is already considered writable by poll.
	 */
	if (oval == EVENTFD_VALMAX) {
		pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT);
	}

	return (err);
}
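
/*
 * To illustrate the two read modes above (a hypothetical userland snippet,
 * not part of the driver): with the counter at 3, a default eventfd returns
 * 3 and resets to 0 on a single read, while one created with EFD_SEMAPHORE
 * returns 1 per read and takes three reads to drain:
 *
 *	int sfd = eventfd(3, EFD_SEMAPHORE);
 *	uint64_t v;
 *	(void) read(sfd, &v, sizeof (v));	// v == 1; counter is now 2
 */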

static int
eventfd_write(dev_t dev, struct uio *uio, cred_t *cr __unused)
{
	eventfd_state_t *state;
	minor_t minor = getminor(dev);
	uint64_t val, oval;
	int err;

	if (uio->uio_resid < sizeof (val))
		return (EINVAL);

	if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0)
		return (err);

	if (val > EVENTFD_VALMAX)
		return (EINVAL);

	state = ddi_get_soft_state(eventfd_softstate, minor);

	mutex_enter(&state->efd_lock);

	while (val > EVENTFD_VALMAX - state->efd_value) {
		if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
			mutex_exit(&state->efd_lock);
			return (EAGAIN);
		}

		state->efd_bwriters++;
		if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
			state->efd_bwriters--;
			mutex_exit(&state->efd_lock);
			return (EINTR);
		}
		state->efd_bwriters--;
	}

	/*
	 * We now know that we can add the value without overflowing.
	 */
	state->efd_value = (oval = state->efd_value) + val;

	/*
	 * If the value was previously "empty", notify blocked readers that
	 * data is available.
	 */
	if (oval == 0) {
		cv_broadcast(&state->efd_cv);
	}
	mutex_exit(&state->efd_lock);

	/*
	 * Notify pollers that something has changed.
	 */
	pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN);

	return (0);
}
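
/*
 * For illustration (a hypothetical userland snippet, assuming the
 * Linux-compatible maximum counter value of UINT64_MAX - 1): a write that
 * would push the counter past EVENTFD_VALMAX blocks until a read drains the
 * counter, or fails with EAGAIN on a non-blocking descriptor:
 *
 *	int nfd = eventfd(0, EFD_NONBLOCK);
 *	uint64_t max = UINT64_MAX - 1;		// EVENTFD_VALMAX
 *	(void) write(nfd, &max, sizeof (max));	// counter is now full
 *	(void) write(nfd, &max, sizeof (max));	// fails; errno == EAGAIN
 */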

static int
eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	eventfd_state_t *state;
	minor_t minor = getminor(dev);
	short revents = 0;

	state = ddi_get_soft_state(eventfd_softstate, minor);

	mutex_enter(&state->efd_lock);

	if (state->efd_value > 0)
		revents |= POLLRDNORM | POLLIN;

	if (state->efd_value < EVENTFD_VALMAX)
		revents |= POLLWRNORM | POLLOUT;

	*reventsp = revents & events;
	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
		*phpp = &state->efd_pollhd;
	}

	mutex_exit(&state->efd_lock);

	return (0);
}
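
/*
 * A sketch of the consumer-side interaction this routine supports
 * (a hypothetical userland snippet, not part of the driver):
 *
 *	struct pollfd pfd = { .fd = efd, .events = POLLIN | POLLOUT };
 *	if (poll(&pfd, 1, -1) > 0) {
 *		// POLLIN: the counter is non-zero, so a read will not block
 *		// POLLOUT: the counter is below EVENTFD_VALMAX, so a write
 *		// of 1 will not block
 *	}
 */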

static int
eventfd_ioctl(dev_t dev, int cmd, intptr_t arg __unused, int md __unused,
    cred_t *cr __unused, int *rv __unused)
{
	eventfd_state_t *state;
	minor_t minor = getminor(dev);

	state = ddi_get_soft_state(eventfd_softstate, minor);

	switch (cmd) {
	case EVENTFDIOC_SEMAPHORE: {
		mutex_enter(&state->efd_lock);
		state->efd_semaphore ^= 1;
		mutex_exit(&state->efd_lock);

		return (0);
	}

	default:
		break;
	}

	return (ENOTTY);
}

static int
eventfd_close(dev_t dev, int flag __unused, int otyp __unused,
    cred_t *cr __unused)
{
	eventfd_state_t *state, **sp;
	minor_t minor = getminor(dev);

	state = ddi_get_soft_state(eventfd_softstate, minor);

	if (state->efd_pollhd.ph_list != NULL) {
		pollwakeup(&state->efd_pollhd, POLLERR);
		pollhead_clean(&state->efd_pollhd);
	}

	mutex_enter(&eventfd_lock);

	/*
	 * Remove our state from our global list.
	 */
	for (sp = &eventfd_state; *sp != state; sp = &((*sp)->efd_next))
		VERIFY(*sp != NULL);

	*sp = (*sp)->efd_next;

	ddi_soft_state_free(eventfd_softstate, minor);
	vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);

	mutex_exit(&eventfd_lock);

	return (0);
}

static int
eventfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	switch (cmd) {
	case DDI_ATTACH:
		break;

	case DDI_RESUME:
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

	mutex_enter(&eventfd_lock);

	if (ddi_soft_state_init(&eventfd_softstate,
	    sizeof (eventfd_state_t), 0) != 0) {
		cmn_err(CE_NOTE, "/dev/eventfd failed to create soft state");
		mutex_exit(&eventfd_lock);
		return (DDI_FAILURE);
	}

	if (ddi_create_minor_node(devi, "eventfd", S_IFCHR,
	    EVENTFDMNRN_EVENTFD, DDI_PSEUDO, 0) == DDI_FAILURE) {
		cmn_err(CE_NOTE, "/dev/eventfd couldn't create minor node");
		ddi_soft_state_fini(&eventfd_softstate);
		mutex_exit(&eventfd_lock);
		return (DDI_FAILURE);
	}

	ddi_report_dev(devi);
	eventfd_devi = devi;

	eventfd_minor = vmem_create("eventfd_minor", (void *)EVENTFDMNRN_CLONE,
	    UINT32_MAX - EVENTFDMNRN_CLONE, 1, NULL, NULL, NULL, 0,
	    VM_SLEEP | VMC_IDENTIFIER);

	mutex_exit(&eventfd_lock);

	return (DDI_SUCCESS);
}

static int
eventfd_detach(dev_info_t *dip __unused, ddi_detach_cmd_t cmd)
{
	switch (cmd) {
	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

	mutex_enter(&eventfd_lock);
	vmem_destroy(eventfd_minor);

	ddi_remove_minor_node(eventfd_devi, NULL);
	eventfd_devi = NULL;

	ddi_soft_state_fini(&eventfd_softstate);
	mutex_exit(&eventfd_lock);

	return (DDI_SUCCESS);
}

static int
eventfd_info(dev_info_t *dip __unused, ddi_info_cmd_t infocmd,
    void *arg __unused, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)eventfd_devi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

static struct cb_ops eventfd_cb_ops = {
	eventfd_open,		/* open */
	eventfd_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	eventfd_read,		/* read */
	eventfd_write,		/* write */
	eventfd_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	eventfd_poll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab */
	D_NEW | D_MP		/* Driver compatibility flag */
};

static struct dev_ops eventfd_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	eventfd_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	eventfd_attach,		/* attach */
	eventfd_detach,		/* detach */
	nodev,			/* reset */
	&eventfd_cb_ops,	/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* dev power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,		/* module type (this is a pseudo driver) */
	"eventfd support",	/* name of module */
	&eventfd_ops,		/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}