xref: /illumos-gate/usr/src/uts/common/io/eventfd.c (revision ed093b41a93e8563e6e1e5dae0768dda2a7bcc27)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2017 Joyent, Inc.
14  */
15 
16 /*
17  * Support for the eventfd facility, a Linux-borne facility for user-generated
18  * file descriptor-based events.
19  */
20 
21 #include <sys/ddi.h>
22 #include <sys/sunddi.h>
23 #include <sys/eventfd.h>
24 #include <sys/conf.h>
25 #include <sys/vmem.h>
26 #include <sys/sysmacros.h>
27 #include <sys/filio.h>
28 #include <sys/stat.h>
29 #include <sys/file.h>
30 
31 struct eventfd_state;
32 typedef struct eventfd_state eventfd_state_t;
33 
34 struct eventfd_state {
35 	kmutex_t efd_lock;			/* lock protecting state */
36 	boolean_t efd_semaphore;		/* boolean: sema. semantics */
37 	kcondvar_t efd_cv;			/* condvar */
38 	pollhead_t efd_pollhd;			/* poll head */
39 	uint64_t efd_value;			/* value */
40 	size_t efd_bwriters;			/* count of blocked writers */
41 	eventfd_state_t *efd_next;		/* next state on global list */
42 };
43 
44 /*
45  * Internal global variables.
46  */
47 static kmutex_t		eventfd_lock;		/* lock protecting state */
48 static dev_info_t	*eventfd_devi;		/* device info */
49 static vmem_t		*eventfd_minor;		/* minor number arena */
50 static void		*eventfd_softstate;	/* softstate pointer */
51 static eventfd_state_t	*eventfd_state;		/* global list of state */
52 
53 /*ARGSUSED*/
54 static int
55 eventfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
56 {
57 	eventfd_state_t *state;
58 	major_t major = getemajor(*devp);
59 	minor_t minor = getminor(*devp);
60 
61 	if (minor != EVENTFDMNRN_EVENTFD)
62 		return (ENXIO);
63 
64 	mutex_enter(&eventfd_lock);
65 
66 	minor = (minor_t)(uintptr_t)vmem_alloc(eventfd_minor, 1,
67 	    VM_BESTFIT | VM_SLEEP);
68 
69 	if (ddi_soft_state_zalloc(eventfd_softstate, minor) != DDI_SUCCESS) {
70 		vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
71 		mutex_exit(&eventfd_lock);
72 		return (ENXIO);
73 	}
74 
75 	state = ddi_get_soft_state(eventfd_softstate, minor);
76 	*devp = makedevice(major, minor);
77 
78 	state->efd_next = eventfd_state;
79 	eventfd_state = state;
80 
81 	mutex_exit(&eventfd_lock);
82 
83 	return (0);
84 }
85 
86 /*ARGSUSED*/
87 static int
88 eventfd_read(dev_t dev, uio_t *uio, cred_t *cr)
89 {
90 	eventfd_state_t *state;
91 	minor_t minor = getminor(dev);
92 	uint64_t val, oval;
93 	int err;
94 
95 	if (uio->uio_resid < sizeof (val))
96 		return (EINVAL);
97 
98 	state = ddi_get_soft_state(eventfd_softstate, minor);
99 
100 	mutex_enter(&state->efd_lock);
101 
102 	while (state->efd_value == 0) {
103 		if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
104 			mutex_exit(&state->efd_lock);
105 			return (EAGAIN);
106 		}
107 
108 		if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
109 			mutex_exit(&state->efd_lock);
110 			return (EINTR);
111 		}
112 	}
113 
114 	/*
115 	 * We have a non-zero value and we own the lock; our behavior now
116 	 * depends on whether or not EFD_SEMAPHORE was set when the eventfd
117 	 * was created.
118 	 */
119 	val = oval = state->efd_value;
120 
121 	if (state->efd_semaphore) {
122 		state->efd_value--;
123 		val = 1;
124 	} else {
125 		state->efd_value = 0;
126 	}
127 
128 	err = uiomove(&val, sizeof (val), UIO_READ, uio);
129 
130 	/*
131 	 * Wake any writers blocked on this eventfd as this read operation may
132 	 * have created adequate capacity for their values.
133 	 */
134 	if (state->efd_bwriters != 0) {
135 		cv_broadcast(&state->efd_cv);
136 	}
137 	mutex_exit(&state->efd_lock);
138 
139 	/*
140 	 * It is necessary to emit POLLOUT events only when the eventfd
141 	 * transitions from EVENTFD_VALMAX to a lower value.  At all other
142 	 * times, it is already considered writable by poll.
143 	 */
144 	if (oval == EVENTFD_VALMAX) {
145 		pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT);
146 	}
147 
148 	return (err);
149 }
150 
151 /*ARGSUSED*/
152 static int
153 eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
154 {
155 	eventfd_state_t *state;
156 	minor_t minor = getminor(dev);
157 	uint64_t val, oval;
158 	int err;
159 
160 	if (uio->uio_resid < sizeof (val))
161 		return (EINVAL);
162 
163 	if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0)
164 		return (err);
165 
166 	if (val > EVENTFD_VALMAX)
167 		return (EINVAL);
168 
169 	state = ddi_get_soft_state(eventfd_softstate, minor);
170 
171 	mutex_enter(&state->efd_lock);
172 
173 	while (val > EVENTFD_VALMAX - state->efd_value) {
174 		if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
175 			mutex_exit(&state->efd_lock);
176 			return (EAGAIN);
177 		}
178 
179 		state->efd_bwriters++;
180 		if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
181 			state->efd_bwriters--;
182 			mutex_exit(&state->efd_lock);
183 			return (EINTR);
184 		}
185 		state->efd_bwriters--;
186 	}
187 
188 	/*
189 	 * We now know that we can add the value without overflowing.
190 	 */
191 	state->efd_value = (oval = state->efd_value) + val;
192 
193 	/*
194 	 * If the value was previously "empty", notify blocked readers that
195 	 * data is available.
196 	 */
197 	if (oval == 0) {
198 		cv_broadcast(&state->efd_cv);
199 	}
200 	mutex_exit(&state->efd_lock);
201 
202 	/*
203 	 * Notify pollers as well if the eventfd is now readable.
204 	 */
205 	if (oval == 0) {
206 		pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN);
207 	}
208 
209 	return (0);
210 }
211 
212 /*ARGSUSED*/
213 static int
214 eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
215     struct pollhead **phpp)
216 {
217 	eventfd_state_t *state;
218 	minor_t minor = getminor(dev);
219 	short revents = 0;
220 
221 	state = ddi_get_soft_state(eventfd_softstate, minor);
222 
223 	mutex_enter(&state->efd_lock);
224 
225 	if (state->efd_value > 0)
226 		revents |= POLLRDNORM | POLLIN;
227 
228 	if (state->efd_value < EVENTFD_VALMAX)
229 		revents |= POLLWRNORM | POLLOUT;
230 
231 	*reventsp = revents & events;
232 	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
233 		*phpp = &state->efd_pollhd;
234 	}
235 
236 	mutex_exit(&state->efd_lock);
237 
238 	return (0);
239 }
240 
241 /*ARGSUSED*/
242 static int
243 eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
244 {
245 	eventfd_state_t *state;
246 	minor_t minor = getminor(dev);
247 
248 	state = ddi_get_soft_state(eventfd_softstate, minor);
249 
250 	switch (cmd) {
251 	case EVENTFDIOC_SEMAPHORE: {
252 		mutex_enter(&state->efd_lock);
253 		state->efd_semaphore ^= 1;
254 		mutex_exit(&state->efd_lock);
255 
256 		return (0);
257 	}
258 
259 	default:
260 		break;
261 	}
262 
263 	return (ENOTTY);
264 }
265 
266 /*ARGSUSED*/
267 static int
268 eventfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
269 {
270 	eventfd_state_t *state, **sp;
271 	minor_t minor = getminor(dev);
272 
273 	state = ddi_get_soft_state(eventfd_softstate, minor);
274 
275 	if (state->efd_pollhd.ph_list != NULL) {
276 		pollwakeup(&state->efd_pollhd, POLLERR);
277 		pollhead_clean(&state->efd_pollhd);
278 	}
279 
280 	mutex_enter(&eventfd_lock);
281 
282 	/*
283 	 * Remove our state from our global list.
284 	 */
285 	for (sp = &eventfd_state; *sp != state; sp = &((*sp)->efd_next))
286 		VERIFY(*sp != NULL);
287 
288 	*sp = (*sp)->efd_next;
289 
290 	ddi_soft_state_free(eventfd_softstate, minor);
291 	vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
292 
293 	mutex_exit(&eventfd_lock);
294 
295 	return (0);
296 }
297 
298 static int
299 eventfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
300 {
301 	switch (cmd) {
302 	case DDI_ATTACH:
303 		break;
304 
305 	case DDI_RESUME:
306 		return (DDI_SUCCESS);
307 
308 	default:
309 		return (DDI_FAILURE);
310 	}
311 
312 	mutex_enter(&eventfd_lock);
313 
314 	if (ddi_soft_state_init(&eventfd_softstate,
315 	    sizeof (eventfd_state_t), 0) != 0) {
316 		cmn_err(CE_NOTE, "/dev/eventfd failed to create soft state");
317 		mutex_exit(&eventfd_lock);
318 		return (DDI_FAILURE);
319 	}
320 
321 	if (ddi_create_minor_node(devi, "eventfd", S_IFCHR,
322 	    EVENTFDMNRN_EVENTFD, DDI_PSEUDO, 0) == DDI_FAILURE) {
323 		cmn_err(CE_NOTE, "/dev/eventfd couldn't create minor node");
324 		ddi_soft_state_fini(&eventfd_softstate);
325 		mutex_exit(&eventfd_lock);
326 		return (DDI_FAILURE);
327 	}
328 
329 	ddi_report_dev(devi);
330 	eventfd_devi = devi;
331 
332 	eventfd_minor = vmem_create("eventfd_minor", (void *)EVENTFDMNRN_CLONE,
333 	    UINT32_MAX - EVENTFDMNRN_CLONE, 1, NULL, NULL, NULL, 0,
334 	    VM_SLEEP | VMC_IDENTIFIER);
335 
336 	mutex_exit(&eventfd_lock);
337 
338 	return (DDI_SUCCESS);
339 }
340 
341 /*ARGSUSED*/
342 static int
343 eventfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
344 {
345 	switch (cmd) {
346 	case DDI_DETACH:
347 		break;
348 
349 	case DDI_SUSPEND:
350 		return (DDI_SUCCESS);
351 
352 	default:
353 		return (DDI_FAILURE);
354 	}
355 
356 	mutex_enter(&eventfd_lock);
357 	vmem_destroy(eventfd_minor);
358 
359 	ddi_remove_minor_node(eventfd_devi, NULL);
360 	eventfd_devi = NULL;
361 
362 	ddi_soft_state_fini(&eventfd_softstate);
363 	mutex_exit(&eventfd_lock);
364 
365 	return (DDI_SUCCESS);
366 }
367 
368 /*ARGSUSED*/
369 static int
370 eventfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
371 {
372 	int error;
373 
374 	switch (infocmd) {
375 	case DDI_INFO_DEVT2DEVINFO:
376 		*result = (void *)eventfd_devi;
377 		error = DDI_SUCCESS;
378 		break;
379 	case DDI_INFO_DEVT2INSTANCE:
380 		*result = (void *)0;
381 		error = DDI_SUCCESS;
382 		break;
383 	default:
384 		error = DDI_FAILURE;
385 	}
386 	return (error);
387 }
388 
389 static struct cb_ops eventfd_cb_ops = {
390 	eventfd_open,		/* open */
391 	eventfd_close,		/* close */
392 	nulldev,		/* strategy */
393 	nulldev,		/* print */
394 	nodev,			/* dump */
395 	eventfd_read,		/* read */
396 	eventfd_write,		/* write */
397 	eventfd_ioctl,		/* ioctl */
398 	nodev,			/* devmap */
399 	nodev,			/* mmap */
400 	nodev,			/* segmap */
401 	eventfd_poll,		/* poll */
402 	ddi_prop_op,		/* cb_prop_op */
403 	0,			/* streamtab  */
404 	D_NEW | D_MP		/* Driver compatibility flag */
405 };
406 
407 static struct dev_ops eventfd_ops = {
408 	DEVO_REV,		/* devo_rev */
409 	0,			/* refcnt */
410 	eventfd_info,		/* get_dev_info */
411 	nulldev,		/* identify */
412 	nulldev,		/* probe */
413 	eventfd_attach,		/* attach */
414 	eventfd_detach,		/* detach */
415 	nodev,			/* reset */
416 	&eventfd_cb_ops,	/* driver operations */
417 	NULL,			/* bus operations */
418 	nodev,			/* dev power */
419 	ddi_quiesce_not_needed,	/* quiesce */
420 };
421 
422 static struct modldrv modldrv = {
423 	&mod_driverops,		/* module type (this is a pseudo driver) */
424 	"eventfd support",	/* name of module */
425 	&eventfd_ops,		/* driver ops */
426 };
427 
428 static struct modlinkage modlinkage = {
429 	MODREV_1,
430 	(void *)&modldrv,
431 	NULL
432 };
433 
434 int
435 _init(void)
436 {
437 	return (mod_install(&modlinkage));
438 }
439 
440 int
441 _info(struct modinfo *modinfop)
442 {
443 	return (mod_info(&modlinkage, modinfop));
444 }
445 
446 int
447 _fini(void)
448 {
449 	return (mod_remove(&modlinkage));
450 }
451