xref: /freebsd/usr.sbin/bhyve/mevent.c (revision fd45b686f9d92f583366c75b22c04c7ee49709c0)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 /*
30  * Micro event library for FreeBSD, designed for a single i/o thread
31  * using kqueue, and having events be persistent by default.
32  */
33 
34 #include <sys/cdefs.h>
35 #include <assert.h>
36 #ifndef WITHOUT_CAPSICUM
37 #include <capsicum_helpers.h>
38 #endif
39 #include <err.h>
40 #include <errno.h>
41 #include <stdbool.h>
42 #include <stdlib.h>
43 #include <stdio.h>
44 #include <string.h>
45 #include <sysexits.h>
46 #include <unistd.h>
47 
48 #include <sys/types.h>
49 #ifndef WITHOUT_CAPSICUM
50 #include <sys/capsicum.h>
51 #endif
52 #include <sys/event.h>
53 #include <sys/time.h>
54 
55 #include <pthread.h>
56 #include <pthread_np.h>
57 
58 #include "mevent.h"
59 
/*
 * Capacity of the kevent change/event arrays used by the dispatch loop and
 * of the buffer used to drain the wakeup pipe.
 */
#define	MEVENT_MAX	64

/* Descriptor returned by kqueue(); created once by mevent_init(). */
static int mfd;
/* Ensures mevent_init() runs only once. */
static pthread_once_t mevent_once = PTHREAD_ONCE_INIT;
/* Serializes access to the event lists and per-event state. */
static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;

/* Thread that called mevent_dispatch(); recorded when the loop starts. */
static pthread_t mevent_tid;
/* Wakeup pipe: [0] is monitored by the i/o thread, [1] is written to it. */
static int mevent_pipefd[2];
/* Next kevent ident to hand out to a timer event. */
static int mevent_timid = 43;
68 
69 struct mevent {
70 	void	(*me_func)(int, enum ev_type, void *);
71 #define me_msecs me_fd
72 	int	me_fd;
73 	int	me_timid;
74 	enum ev_type me_type;
75 	void    *me_param;
76 	int	me_cq;
77 	int	me_state; /* Desired kevent flags. */
78 	int	me_closefd;
79 	int	me_fflags;
80 	LIST_ENTRY(mevent) me_list;
81 };
82 
83 static LIST_HEAD(listhead, mevent) global_head, change_head;
84 
85 static void
86 mevent_qlock(void)
87 {
88 	pthread_mutex_lock(&mevent_lmutex);
89 }
90 
91 static void
92 mevent_qunlock(void)
93 {
94 	pthread_mutex_unlock(&mevent_lmutex);
95 }
96 
97 static void
98 mevent_pipe_read(int fd, enum ev_type type __unused, void *param __unused)
99 {
100 	char buf[MEVENT_MAX];
101 	int status;
102 
103 	/*
104 	 * Drain the pipe read side. The fd is non-blocking so this is
105 	 * safe to do.
106 	 */
107 	do {
108 		status = read(fd, buf, sizeof(buf));
109 	} while (status == MEVENT_MAX);
110 }
111 
112 static void
113 mevent_notify(void)
114 {
115 	char c = '\0';
116 
117 	/*
118 	 * If calling from outside the i/o thread, write a byte on the
119 	 * pipe to force the i/o thread to exit the blocking kevent call.
120 	 */
121 	if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
122 		write(mevent_pipefd[1], &c, 1);
123 	}
124 }
125 
126 static void
127 mevent_init(void)
128 {
129 #ifndef WITHOUT_CAPSICUM
130 	cap_rights_t rights;
131 #endif
132 
133 	mfd = kqueue();
134 	assert(mfd > 0);
135 
136 #ifndef WITHOUT_CAPSICUM
137 	cap_rights_init(&rights, CAP_KQUEUE);
138 	if (caph_rights_limit(mfd, &rights) == -1)
139 		errx(EX_OSERR, "Unable to apply rights for sandbox");
140 #endif
141 
142 	LIST_INIT(&change_head);
143 	LIST_INIT(&global_head);
144 }
145 
146 static int
147 mevent_kq_filter(struct mevent *mevp)
148 {
149 	int retval;
150 
151 	retval = 0;
152 
153 	if (mevp->me_type == EVF_READ)
154 		retval = EVFILT_READ;
155 
156 	if (mevp->me_type == EVF_WRITE)
157 		retval = EVFILT_WRITE;
158 
159 	if (mevp->me_type == EVF_TIMER)
160 		retval = EVFILT_TIMER;
161 
162 	if (mevp->me_type == EVF_SIGNAL)
163 		retval = EVFILT_SIGNAL;
164 
165 	if (mevp->me_type == EVF_VNODE)
166 		retval = EVFILT_VNODE;
167 
168 	return (retval);
169 }
170 
171 static int
172 mevent_kq_flags(struct mevent *mevp)
173 {
174 	int retval;
175 
176 	retval = mevp->me_state;
177 
178 	if (mevp->me_type == EVF_VNODE)
179 		retval |= EV_CLEAR;
180 
181 	return (retval);
182 }
183 
184 static int
185 mevent_kq_fflags(struct mevent *mevp)
186 {
187 	int retval;
188 
189 	retval = 0;
190 
191 	switch (mevp->me_type) {
192 	case EVF_VNODE:
193 		if ((mevp->me_fflags & EVFF_ATTRIB) != 0)
194 			retval |= NOTE_ATTRIB;
195 		break;
196 	case EVF_READ:
197 	case EVF_WRITE:
198 	case EVF_TIMER:
199 	case EVF_SIGNAL:
200 		break;
201 	}
202 
203 	return (retval);
204 }
205 
206 static void
207 mevent_populate(struct mevent *mevp, struct kevent *kev)
208 {
209 	if (mevp->me_type == EVF_TIMER) {
210 		kev->ident = mevp->me_timid;
211 		kev->data = mevp->me_msecs;
212 	} else {
213 		kev->ident = mevp->me_fd;
214 		kev->data = 0;
215 	}
216 	kev->filter = mevent_kq_filter(mevp);
217 	kev->flags = mevent_kq_flags(mevp);
218 	kev->fflags = mevent_kq_fflags(mevp);
219 	kev->udata = mevp;
220 }
221 
222 static int
223 mevent_build(struct kevent *kev)
224 {
225 	struct mevent *mevp, *tmpp;
226 	int i;
227 
228 	i = 0;
229 
230 	mevent_qlock();
231 
232 	LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
233 		if (mevp->me_closefd) {
234 			/*
235 			 * A close of the file descriptor will remove the
236 			 * event
237 			 */
238 			close(mevp->me_fd);
239 		} else {
240 			assert((mevp->me_state & EV_ADD) == 0);
241 			mevent_populate(mevp, &kev[i]);
242 			i++;
243 		}
244 
245 		mevp->me_cq = 0;
246 		LIST_REMOVE(mevp, me_list);
247 
248 		if (mevp->me_state & EV_DELETE) {
249 			free(mevp);
250 		} else {
251 			LIST_INSERT_HEAD(&global_head, mevp, me_list);
252 		}
253 
254 		assert(i < MEVENT_MAX);
255 	}
256 
257 	mevent_qunlock();
258 
259 	return (i);
260 }
261 
262 static void
263 mevent_handle(struct kevent *kev, int numev)
264 {
265 	struct mevent *mevp;
266 	int i;
267 
268 	for (i = 0; i < numev; i++) {
269 		mevp = kev[i].udata;
270 
271 		/* XXX check for EV_ERROR ? */
272 
273 		(*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
274 	}
275 }
276 
277 static struct mevent *
278 mevent_add_state(int tfd, enum ev_type type,
279 	   void (*func)(int, enum ev_type, void *), void *param,
280 	   int state, int fflags)
281 {
282 	struct kevent kev;
283 	struct mevent *lp, *mevp;
284 	int ret;
285 
286 	if (tfd < 0 || func == NULL) {
287 		return (NULL);
288 	}
289 
290 	mevp = NULL;
291 
292 	pthread_once(&mevent_once, mevent_init);
293 
294 	mevent_qlock();
295 
296 	/*
297 	 * Verify that the fd/type tuple is not present in any list
298 	 */
299 	LIST_FOREACH(lp, &global_head, me_list) {
300 		if (type != EVF_TIMER && lp->me_fd == tfd &&
301 		    lp->me_type == type) {
302 			goto exit;
303 		}
304 	}
305 
306 	LIST_FOREACH(lp, &change_head, me_list) {
307 		if (type != EVF_TIMER && lp->me_fd == tfd &&
308 		    lp->me_type == type) {
309 			goto exit;
310 		}
311 	}
312 
313 	/*
314 	 * Allocate an entry and populate it.
315 	 */
316 	mevp = calloc(1, sizeof(struct mevent));
317 	if (mevp == NULL) {
318 		goto exit;
319 	}
320 
321 	if (type == EVF_TIMER) {
322 		mevp->me_msecs = tfd;
323 		mevp->me_timid = mevent_timid++;
324 	} else
325 		mevp->me_fd = tfd;
326 	mevp->me_type = type;
327 	mevp->me_func = func;
328 	mevp->me_param = param;
329 	mevp->me_state = state;
330 	mevp->me_fflags = fflags;
331 
332 	/*
333 	 * Try to add the event.  If this fails, report the failure to
334 	 * the caller.
335 	 */
336 	mevent_populate(mevp, &kev);
337 	ret = kevent(mfd, &kev, 1, NULL, 0, NULL);
338 	if (ret == -1) {
339 		free(mevp);
340 		mevp = NULL;
341 		goto exit;
342 	}
343 
344 	mevp->me_state &= ~EV_ADD;
345 	LIST_INSERT_HEAD(&global_head, mevp, me_list);
346 
347 exit:
348 	mevent_qunlock();
349 
350 	return (mevp);
351 }
352 
353 struct mevent *
354 mevent_add(int tfd, enum ev_type type,
355 	   void (*func)(int, enum ev_type, void *), void *param)
356 {
357 
358 	return (mevent_add_state(tfd, type, func, param, EV_ADD, 0));
359 }
360 
361 struct mevent *
362 mevent_add_flags(int tfd, enum ev_type type, int fflags,
363 		 void (*func)(int, enum ev_type, void *), void *param)
364 {
365 
366 	return (mevent_add_state(tfd, type, func, param, EV_ADD, fflags));
367 }
368 
369 struct mevent *
370 mevent_add_disabled(int tfd, enum ev_type type,
371 		    void (*func)(int, enum ev_type, void *), void *param)
372 {
373 
374 	return (mevent_add_state(tfd, type, func, param, EV_ADD | EV_DISABLE, 0));
375 }
376 
377 static int
378 mevent_update(struct mevent *evp, bool enable)
379 {
380 	int newstate;
381 
382 	mevent_qlock();
383 
384 	/*
385 	 * It's not possible to enable/disable a deleted event
386 	 */
387 	assert((evp->me_state & EV_DELETE) == 0);
388 
389 	newstate = evp->me_state;
390 	if (enable) {
391 		newstate |= EV_ENABLE;
392 		newstate &= ~EV_DISABLE;
393 	} else {
394 		newstate |= EV_DISABLE;
395 		newstate &= ~EV_ENABLE;
396 	}
397 
398 	/*
399 	 * No update needed if state isn't changing
400 	 */
401 	if (evp->me_state != newstate) {
402 		evp->me_state = newstate;
403 
404 		/*
405 		 * Place the entry onto the changed list if not
406 		 * already there.
407 		 */
408 		if (evp->me_cq == 0) {
409 			evp->me_cq = 1;
410 			LIST_REMOVE(evp, me_list);
411 			LIST_INSERT_HEAD(&change_head, evp, me_list);
412 			mevent_notify();
413 		}
414 	}
415 
416 	mevent_qunlock();
417 
418 	return (0);
419 }
420 
/*
 * Request that an event be enabled.  Always returns 0.
 */
int
mevent_enable(struct mevent *evp)
{
	const bool enable = true;

	return (mevent_update(evp, enable));
}
427 
/*
 * Request that an event be disabled.  Always returns 0.
 */
int
mevent_disable(struct mevent *evp)
{
	const bool enable = false;

	return (mevent_update(evp, enable));
}
434 
435 static int
436 mevent_delete_event(struct mevent *evp, int closefd)
437 {
438 	mevent_qlock();
439 
440 	/*
441          * Place the entry onto the changed list if not already there, and
442 	 * mark as to be deleted.
443          */
444         if (evp->me_cq == 0) {
445 		evp->me_cq = 1;
446 		LIST_REMOVE(evp, me_list);
447 		LIST_INSERT_HEAD(&change_head, evp, me_list);
448 		mevent_notify();
449         }
450 	evp->me_state = EV_DELETE;
451 
452 	if (closefd)
453 		evp->me_closefd = 1;
454 
455 	mevent_qunlock();
456 
457 	return (0);
458 }
459 
/*
 * Delete an event, leaving its descriptor open.  Always returns 0.
 */
int
mevent_delete(struct mevent *evp)
{
	const int closefd = 0;

	return (mevent_delete_event(evp, closefd));
}
466 
/*
 * Delete an event and have its descriptor closed.  Always returns 0.
 */
int
mevent_delete_close(struct mevent *evp)
{
	const int closefd = 1;

	return (mevent_delete_event(evp, closefd));
}
473 
474 static void
475 mevent_set_name(void)
476 {
477 
478 	pthread_set_name_np(mevent_tid, "mevent");
479 }
480 
481 void
482 mevent_dispatch(void)
483 {
484 	struct kevent changelist[MEVENT_MAX];
485 	struct kevent eventlist[MEVENT_MAX];
486 	struct mevent *pipev;
487 	int numev;
488 	int ret;
489 #ifndef WITHOUT_CAPSICUM
490 	cap_rights_t rights;
491 #endif
492 
493 	mevent_tid = pthread_self();
494 	mevent_set_name();
495 
496 	pthread_once(&mevent_once, mevent_init);
497 
498 	/*
499 	 * Open the pipe that will be used for other threads to force
500 	 * the blocking kqueue call to exit by writing to it. Set the
501 	 * descriptor to non-blocking.
502 	 */
503 	ret = pipe(mevent_pipefd);
504 	if (ret < 0) {
505 		perror("pipe");
506 		exit(0);
507 	}
508 
509 #ifndef WITHOUT_CAPSICUM
510 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
511 	if (caph_rights_limit(mevent_pipefd[0], &rights) == -1)
512 		errx(EX_OSERR, "Unable to apply rights for sandbox");
513 	if (caph_rights_limit(mevent_pipefd[1], &rights) == -1)
514 		errx(EX_OSERR, "Unable to apply rights for sandbox");
515 #endif
516 
517 	/*
518 	 * Add internal event handler for the pipe write fd
519 	 */
520 	pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
521 	assert(pipev != NULL);
522 
523 	for (;;) {
524 		/*
525 		 * Build changelist if required.
526 		 * XXX the changelist can be put into the blocking call
527 		 * to eliminate the extra syscall. Currently better for
528 		 * debug.
529 		 */
530 		numev = mevent_build(changelist);
531 		if (numev) {
532 			ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
533 			if (ret == -1) {
534 				perror("Error return from kevent change");
535 			}
536 		}
537 
538 		/*
539 		 * Block awaiting events
540 		 */
541 		ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
542 		if (ret == -1 && errno != EINTR) {
543 			perror("Error return from kevent monitor");
544 		}
545 
546 		/*
547 		 * Handle reported events
548 		 */
549 		mevent_handle(eventlist, ret);
550 	}
551 }
552