xref: /illumos-gate/usr/src/uts/common/io/eventfd.c (revision 2833423dc59f4c35fe4713dbb942950c82df0437)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2017 Joyent, Inc.
14  * Copyright 2024 Oxide Computer Company
15  */
16 
17 /*
18  * Support for the eventfd facility, a Linux-borne facility for user-generated
19  * file descriptor-based events.
20  */
21 
22 #include <sys/ddi.h>
23 #include <sys/sunddi.h>
24 #include <sys/eventfd.h>
25 #include <sys/conf.h>
26 #include <sys/vmem.h>
27 #include <sys/sysmacros.h>
28 #include <sys/filio.h>
29 #include <sys/stat.h>
30 #include <sys/file.h>
31 
32 struct eventfd_state;
33 typedef struct eventfd_state eventfd_state_t;
34 
35 struct eventfd_state {
36 	kmutex_t efd_lock;			/* lock protecting state */
37 	boolean_t efd_semaphore;		/* boolean: sema. semantics */
38 	kcondvar_t efd_cv;			/* condvar */
39 	pollhead_t efd_pollhd;			/* poll head */
40 	uint64_t efd_value;			/* value */
41 	size_t efd_bwriters;			/* count of blocked writers */
42 	eventfd_state_t *efd_next;		/* next state on global list */
43 };
44 
45 /*
46  * Internal global variables.
47  */
48 static kmutex_t		eventfd_lock;		/* lock protecting state */
49 static dev_info_t	*eventfd_devi;		/* device info */
50 static vmem_t		*eventfd_minor;		/* minor number arena */
51 static void		*eventfd_softstate;	/* softstate pointer */
52 static eventfd_state_t	*eventfd_state;		/* global list of state */
53 
54 static int
55 eventfd_open(dev_t *devp, int flag __unused, int otyp __unused,
56     cred_t *cr __unused)
57 {
58 	eventfd_state_t *state;
59 	major_t major = getemajor(*devp);
60 	minor_t minor = getminor(*devp);
61 
62 	if (minor != EVENTFDMNRN_EVENTFD)
63 		return (ENXIO);
64 
65 	mutex_enter(&eventfd_lock);
66 
67 	minor = (minor_t)(uintptr_t)vmem_alloc(eventfd_minor, 1,
68 	    VM_BESTFIT | VM_SLEEP);
69 
70 	if (ddi_soft_state_zalloc(eventfd_softstate, minor) != DDI_SUCCESS) {
71 		vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
72 		mutex_exit(&eventfd_lock);
73 		return (ENXIO);
74 	}
75 
76 	state = ddi_get_soft_state(eventfd_softstate, minor);
77 	*devp = makedevice(major, minor);
78 
79 	state->efd_next = eventfd_state;
80 	eventfd_state = state;
81 
82 	mutex_exit(&eventfd_lock);
83 
84 	return (0);
85 }
86 
87 static int
88 eventfd_read(dev_t dev, uio_t *uio, cred_t *cr __unused)
89 {
90 	eventfd_state_t *state;
91 	minor_t minor = getminor(dev);
92 	uint64_t val, oval;
93 	int err;
94 
95 	if (uio->uio_resid < sizeof (val))
96 		return (EINVAL);
97 
98 	state = ddi_get_soft_state(eventfd_softstate, minor);
99 
100 	mutex_enter(&state->efd_lock);
101 
102 	while (state->efd_value == 0) {
103 		if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
104 			mutex_exit(&state->efd_lock);
105 			return (EAGAIN);
106 		}
107 
108 		if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
109 			mutex_exit(&state->efd_lock);
110 			return (EINTR);
111 		}
112 	}
113 
114 	/*
115 	 * We have a non-zero value and we own the lock; our behavior now
116 	 * depends on whether or not EFD_SEMAPHORE was set when the eventfd
117 	 * was created.
118 	 */
119 	val = oval = state->efd_value;
120 
121 	if (state->efd_semaphore) {
122 		state->efd_value--;
123 		val = 1;
124 	} else {
125 		state->efd_value = 0;
126 	}
127 
128 	err = uiomove(&val, sizeof (val), UIO_READ, uio);
129 
130 	/*
131 	 * Wake any writers blocked on this eventfd as this read operation may
132 	 * have created adequate capacity for their values.
133 	 */
134 	if (state->efd_bwriters != 0) {
135 		cv_broadcast(&state->efd_cv);
136 	}
137 	mutex_exit(&state->efd_lock);
138 
139 	/*
140 	 * It is necessary to emit POLLOUT events only when the eventfd
141 	 * transitions from EVENTFD_VALMAX to a lower value.  At all other
142 	 * times, it is already considered writable by poll.
143 	 */
144 	if (oval == EVENTFD_VALMAX) {
145 		pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT);
146 	}
147 
148 	return (err);
149 }
150 
151 static int
152 eventfd_write(dev_t dev, struct uio *uio, cred_t *cr __unused)
153 {
154 	eventfd_state_t *state;
155 	minor_t minor = getminor(dev);
156 	uint64_t val, oval;
157 	int err;
158 
159 	if (uio->uio_resid < sizeof (val))
160 		return (EINVAL);
161 
162 	if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0)
163 		return (err);
164 
165 	if (val > EVENTFD_VALMAX)
166 		return (EINVAL);
167 
168 	state = ddi_get_soft_state(eventfd_softstate, minor);
169 
170 	mutex_enter(&state->efd_lock);
171 
172 	while (val > EVENTFD_VALMAX - state->efd_value) {
173 		if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
174 			mutex_exit(&state->efd_lock);
175 			return (EAGAIN);
176 		}
177 
178 		state->efd_bwriters++;
179 		if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
180 			state->efd_bwriters--;
181 			mutex_exit(&state->efd_lock);
182 			return (EINTR);
183 		}
184 		state->efd_bwriters--;
185 	}
186 
187 	/*
188 	 * We now know that we can add the value without overflowing.
189 	 */
190 	state->efd_value = (oval = state->efd_value) + val;
191 
192 	/*
193 	 * If the value was previously "empty", notify blocked readers that
194 	 * data is available.
195 	 */
196 	if (oval == 0) {
197 		cv_broadcast(&state->efd_cv);
198 	}
199 	mutex_exit(&state->efd_lock);
200 
201 	/*
202 	 * Notify pollers that something has changed.
203 	 */
204 	pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN);
205 
206 	return (0);
207 }
208 
209 static int
210 eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
211     struct pollhead **phpp)
212 {
213 	eventfd_state_t *state;
214 	minor_t minor = getminor(dev);
215 	short revents = 0;
216 
217 	state = ddi_get_soft_state(eventfd_softstate, minor);
218 
219 	mutex_enter(&state->efd_lock);
220 
221 	if (state->efd_value > 0)
222 		revents |= POLLRDNORM | POLLIN;
223 
224 	if (state->efd_value < EVENTFD_VALMAX)
225 		revents |= POLLWRNORM | POLLOUT;
226 
227 	*reventsp = revents & events;
228 	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
229 		*phpp = &state->efd_pollhd;
230 	}
231 
232 	mutex_exit(&state->efd_lock);
233 
234 	return (0);
235 }
236 
237 static int
238 eventfd_ioctl(dev_t dev, int cmd, intptr_t arg __unused, int md __unused,
239     cred_t *cr __unused, int *rv __unused)
240 {
241 	eventfd_state_t *state;
242 	minor_t minor = getminor(dev);
243 
244 	state = ddi_get_soft_state(eventfd_softstate, minor);
245 
246 	switch (cmd) {
247 	case EVENTFDIOC_SEMAPHORE: {
248 		mutex_enter(&state->efd_lock);
249 		state->efd_semaphore ^= 1;
250 		mutex_exit(&state->efd_lock);
251 
252 		return (0);
253 	}
254 
255 	default:
256 		break;
257 	}
258 
259 	return (ENOTTY);
260 }
261 
262 static int
263 eventfd_close(dev_t dev, int flag __unused, int otyp __unused,
264     cred_t *cr __unused)
265 {
266 	eventfd_state_t *state, **sp;
267 	minor_t minor = getminor(dev);
268 
269 	state = ddi_get_soft_state(eventfd_softstate, minor);
270 
271 	if (state->efd_pollhd.ph_list != NULL) {
272 		pollwakeup(&state->efd_pollhd, POLLERR);
273 		pollhead_clean(&state->efd_pollhd);
274 	}
275 
276 	mutex_enter(&eventfd_lock);
277 
278 	/*
279 	 * Remove our state from our global list.
280 	 */
281 	for (sp = &eventfd_state; *sp != state; sp = &((*sp)->efd_next))
282 		VERIFY(*sp != NULL);
283 
284 	*sp = (*sp)->efd_next;
285 
286 	ddi_soft_state_free(eventfd_softstate, minor);
287 	vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
288 
289 	mutex_exit(&eventfd_lock);
290 
291 	return (0);
292 }
293 
294 static int
295 eventfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
296 {
297 	switch (cmd) {
298 	case DDI_ATTACH:
299 		break;
300 
301 	case DDI_RESUME:
302 		return (DDI_SUCCESS);
303 
304 	default:
305 		return (DDI_FAILURE);
306 	}
307 
308 	mutex_enter(&eventfd_lock);
309 
310 	if (ddi_soft_state_init(&eventfd_softstate,
311 	    sizeof (eventfd_state_t), 0) != 0) {
312 		cmn_err(CE_NOTE, "/dev/eventfd failed to create soft state");
313 		mutex_exit(&eventfd_lock);
314 		return (DDI_FAILURE);
315 	}
316 
317 	if (ddi_create_minor_node(devi, "eventfd", S_IFCHR,
318 	    EVENTFDMNRN_EVENTFD, DDI_PSEUDO, 0) == DDI_FAILURE) {
319 		cmn_err(CE_NOTE, "/dev/eventfd couldn't create minor node");
320 		ddi_soft_state_fini(&eventfd_softstate);
321 		mutex_exit(&eventfd_lock);
322 		return (DDI_FAILURE);
323 	}
324 
325 	ddi_report_dev(devi);
326 	eventfd_devi = devi;
327 
328 	eventfd_minor = vmem_create("eventfd_minor", (void *)EVENTFDMNRN_CLONE,
329 	    UINT32_MAX - EVENTFDMNRN_CLONE, 1, NULL, NULL, NULL, 0,
330 	    VM_SLEEP | VMC_IDENTIFIER);
331 
332 	mutex_exit(&eventfd_lock);
333 
334 	return (DDI_SUCCESS);
335 }
336 
337 static int
338 eventfd_detach(dev_info_t *dip __unused, ddi_detach_cmd_t cmd)
339 {
340 	switch (cmd) {
341 	case DDI_DETACH:
342 		break;
343 
344 	case DDI_SUSPEND:
345 		return (DDI_SUCCESS);
346 
347 	default:
348 		return (DDI_FAILURE);
349 	}
350 
351 	mutex_enter(&eventfd_lock);
352 	vmem_destroy(eventfd_minor);
353 
354 	ddi_remove_minor_node(eventfd_devi, NULL);
355 	eventfd_devi = NULL;
356 
357 	ddi_soft_state_fini(&eventfd_softstate);
358 	mutex_exit(&eventfd_lock);
359 
360 	return (DDI_SUCCESS);
361 }
362 
363 static int
364 eventfd_info(dev_info_t *dip __unused, ddi_info_cmd_t infocmd,
365     void *arg __unused, void **result)
366 {
367 	int error;
368 
369 	switch (infocmd) {
370 	case DDI_INFO_DEVT2DEVINFO:
371 		*result = (void *)eventfd_devi;
372 		error = DDI_SUCCESS;
373 		break;
374 	case DDI_INFO_DEVT2INSTANCE:
375 		*result = (void *)0;
376 		error = DDI_SUCCESS;
377 		break;
378 	default:
379 		error = DDI_FAILURE;
380 	}
381 	return (error);
382 }
383 
384 static struct cb_ops eventfd_cb_ops = {
385 	eventfd_open,		/* open */
386 	eventfd_close,		/* close */
387 	nulldev,		/* strategy */
388 	nulldev,		/* print */
389 	nodev,			/* dump */
390 	eventfd_read,		/* read */
391 	eventfd_write,		/* write */
392 	eventfd_ioctl,		/* ioctl */
393 	nodev,			/* devmap */
394 	nodev,			/* mmap */
395 	nodev,			/* segmap */
396 	eventfd_poll,		/* poll */
397 	ddi_prop_op,		/* cb_prop_op */
398 	0,			/* streamtab  */
399 	D_NEW | D_MP		/* Driver compatibility flag */
400 };
401 
402 static struct dev_ops eventfd_ops = {
403 	DEVO_REV,		/* devo_rev */
404 	0,			/* refcnt */
405 	eventfd_info,		/* get_dev_info */
406 	nulldev,		/* identify */
407 	nulldev,		/* probe */
408 	eventfd_attach,		/* attach */
409 	eventfd_detach,		/* detach */
410 	nodev,			/* reset */
411 	&eventfd_cb_ops,	/* driver operations */
412 	NULL,			/* bus operations */
413 	nodev,			/* dev power */
414 	ddi_quiesce_not_needed,	/* quiesce */
415 };
416 
417 static struct modldrv modldrv = {
418 	&mod_driverops,		/* module type (this is a pseudo driver) */
419 	"eventfd support",	/* name of module */
420 	&eventfd_ops,		/* driver ops */
421 };
422 
423 static struct modlinkage modlinkage = {
424 	MODREV_1,
425 	(void *)&modldrv,
426 	NULL
427 };
428 
429 int
430 _init(void)
431 {
432 	return (mod_install(&modlinkage));
433 }
434 
435 int
436 _info(struct modinfo *modinfop)
437 {
438 	return (mod_info(&modlinkage, modinfop));
439 }
440 
441 int
442 _fini(void)
443 {
444 	return (mod_remove(&modlinkage));
445 }
446