xref: /freebsd/usr.sbin/bhyve/mevent.c (revision 3cbb4cc200f8a0ad7ed08233425ea54524a21f1c)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 /*
32  * Micro event library for FreeBSD, designed for a single i/o thread
33  * using kqueue, and having events be persistent by default.
34  */
35 
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38 
39 #include <assert.h>
40 #ifndef WITHOUT_CAPSICUM
41 #include <capsicum_helpers.h>
42 #endif
43 #include <err.h>
44 #include <errno.h>
45 #include <stdbool.h>
46 #include <stdlib.h>
47 #include <stdio.h>
48 #include <string.h>
49 #include <sysexits.h>
50 #include <unistd.h>
51 
52 #include <sys/types.h>
53 #ifndef WITHOUT_CAPSICUM
54 #include <sys/capsicum.h>
55 #endif
56 #include <sys/event.h>
57 #include <sys/time.h>
58 
59 #include <pthread.h>
60 #include <pthread_np.h>
61 
62 #include "mevent.h"
63 
/*
 * Number of entries in the kevent change and event arrays used by
 * mevent_dispatch(); also the size of the buffer used to drain the
 * wakeup pipe.
 */
#define	MEVENT_MAX	64

/* Defined outside this file; not referenced by the code visible here. */
extern const char *vmname;

/* Thread running mevent_dispatch(); recorded when that function starts. */
static pthread_t mevent_tid;
/* Next kevent ident to hand to a timer event; bumped for each timer added. */
static int mevent_timid = 43;
/* Wakeup pipe: [0] is watched by the event loop, [1] is written by mevent_notify(). */
static int mevent_pipefd[2];
/* Taken by mevent_qlock() around every access to the event lists. */
static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
72 
73 struct mevent {
74 	void	(*me_func)(int, enum ev_type, void *);
75 #define me_msecs me_fd
76 	int	me_fd;
77 	int	me_timid;
78 	enum ev_type me_type;
79 	void    *me_param;
80 	int	me_cq;
81 	int	me_state; /* Desired kevent flags. */
82 	int	me_closefd;
83 	LIST_ENTRY(mevent) me_list;
84 };
85 
86 static LIST_HEAD(listhead, mevent) global_head, change_head;
87 
88 static void
89 mevent_qlock(void)
90 {
91 	pthread_mutex_lock(&mevent_lmutex);
92 }
93 
94 static void
95 mevent_qunlock(void)
96 {
97 	pthread_mutex_unlock(&mevent_lmutex);
98 }
99 
100 static void
101 mevent_pipe_read(int fd, enum ev_type type, void *param)
102 {
103 	char buf[MEVENT_MAX];
104 	int status;
105 
106 	/*
107 	 * Drain the pipe read side. The fd is non-blocking so this is
108 	 * safe to do.
109 	 */
110 	do {
111 		status = read(fd, buf, sizeof(buf));
112 	} while (status == MEVENT_MAX);
113 }
114 
115 static void
116 mevent_notify(void)
117 {
118 	char c = '\0';
119 
120 	/*
121 	 * If calling from outside the i/o thread, write a byte on the
122 	 * pipe to force the i/o thread to exit the blocking kevent call.
123 	 */
124 	if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
125 		write(mevent_pipefd[1], &c, 1);
126 	}
127 }
128 
/*
 * Translate an event's type into the matching kqueue filter.
 * Returns 0 for a type that has no mapping.
 */
static int
mevent_kq_filter(struct mevent *mevp)
{
	switch (mevp->me_type) {
	case EVF_READ:
		return (EVFILT_READ);
	case EVF_WRITE:
		return (EVFILT_WRITE);
	case EVF_TIMER:
		return (EVFILT_TIMER);
	case EVF_SIGNAL:
		return (EVFILT_SIGNAL);
	default:
		return (0);
	}
}
150 
151 static int
152 mevent_kq_flags(struct mevent *mevp)
153 {
154 	return (mevp->me_state);
155 }
156 
/*
 * Return the filter-specific flags for an event.  No event type uses any
 * yet, so this is always 0.
 */
static int
mevent_kq_fflags(struct mevent *mevp)
{
	(void)mevp;

	/* XXX nothing yet, perhaps EV_EOF for reads ? */
	return (0);
}
163 
/*
 * Turn every entry on the change list into a kevent change record.
 *
 * mfd: kqueue descriptor (not used by this function).
 * kev: output array with room for MEVENT_MAX entries.
 *
 * Returns the number of records written to kev.  Each processed entry is
 * taken off the change list; it is freed if it was marked for deletion and
 * moved to the global list otherwise.  Entries flagged for close have
 * their descriptor closed instead of producing a change record.
 */
static int
mevent_build(int mfd, struct kevent *kev)
{
	struct mevent *mevp, *tmpp;
	int i;

	i = 0;

	mevent_qlock();

	LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
		if (mevp->me_closefd) {
			/*
			 * A close of the file descriptor will remove the
			 * event
			 */
			close(mevp->me_fd);
		} else {
			/*
			 * Verify there is a free slot before writing it.
			 * Checking after the increment, as was done before,
			 * tripped when the array had just become exactly
			 * full even though nothing had overflowed.  Note
			 * this is still only an assertion: more than
			 * MEVENT_MAX queued changes are not handled
			 * gracefully.
			 */
			assert(i < MEVENT_MAX);

			if (mevp->me_type == EVF_TIMER) {
				kev[i].ident = mevp->me_timid;
				kev[i].data = mevp->me_msecs;
			} else {
				kev[i].ident = mevp->me_fd;
				kev[i].data = 0;
			}
			kev[i].filter = mevent_kq_filter(mevp);
			kev[i].flags = mevent_kq_flags(mevp);
			kev[i].fflags = mevent_kq_fflags(mevp);
			kev[i].udata = mevp;
			i++;
		}

		/* The entry is no longer pending a change. */
		mevp->me_cq = 0;
		LIST_REMOVE(mevp, me_list);

		if (mevp->me_state & EV_DELETE) {
			free(mevp);
		} else {
			/*
			 * We need to add the event only once, so we can
			 * reset the EV_ADD bit after it has been propagated
			 * to the kevent() arguments the first time.
			 */
			mevp->me_state &= ~EV_ADD;
			LIST_INSERT_HEAD(&global_head, mevp, me_list);
		}
	}

	mevent_qunlock();

	return (i);
}
218 
/*
 * Invoke the callback of each event returned by kevent().
 *
 * kev:   array of returned events; udata of each holds the struct mevent.
 * numev: number of valid entries; a value of zero or less does nothing.
 */
static void
mevent_handle(struct kevent *kev, int numev)
{
	struct mevent *ev;
	int idx;

	idx = 0;
	while (idx < numev) {
		ev = kev[idx].udata;
		idx++;

		/* XXX check for EV_ERROR ? */

		ev->me_func(ev->me_fd, ev->me_type, ev->me_param);
	}
}
233 
/*
 * Create an event and queue it for registration with the kqueue.
 *
 * tfd:   descriptor to watch, or the timer value for EVF_TIMER events.
 * type:  kind of event.
 * func:  callback to run when the event fires.
 * param: opaque argument handed to func.
 * state: initial kevent flags for the event.
 *
 * Returns the new event, or NULL if tfd is negative, func is NULL, an
 * event with the same descriptor and type already exists (timers are
 * exempt from this check), or memory could not be allocated.
 */
static struct mevent *
mevent_add_state(int tfd, enum ev_type type,
	   void (*func)(int, enum ev_type, void *), void *param,
	   int state)
{
	struct mevent *cur, *newev;

	if (tfd < 0 || func == NULL)
		return (NULL);

	newev = NULL;

	mevent_qlock();

	/*
	 * Reject a descriptor/type pair that is already known on either
	 * list.  Timers never collide, so they skip the search entirely.
	 */
	if (type != EVF_TIMER) {
		LIST_FOREACH(cur, &global_head, me_list) {
			if (cur->me_fd == tfd && cur->me_type == type)
				goto done;
		}

		LIST_FOREACH(cur, &change_head, me_list) {
			if (cur->me_fd == tfd && cur->me_type == type)
				goto done;
		}
	}

	/*
	 * Build the new entry completely, then put it on the change list.
	 */
	newev = calloc(1, sizeof(struct mevent));
	if (newev == NULL)
		goto done;

	newev->me_type = type;
	newev->me_func = func;
	newev->me_param = param;
	newev->me_state = state;
	newev->me_cq = 1;
	if (type == EVF_TIMER) {
		newev->me_msecs = tfd;
		newev->me_timid = mevent_timid++;
	} else {
		newev->me_fd = tfd;
	}

	LIST_INSERT_HEAD(&change_head, newev, me_list);
	mevent_notify();

done:
	mevent_qunlock();

	return (newev);
}
293 
/*
 * Register an event that is active as soon as it has been added.
 * Arguments and return value are those of mevent_add_state().
 */
struct mevent *
mevent_add(int tfd, enum ev_type type,
	   void (*func)(int, enum ev_type, void *), void *param)
{
	struct mevent *mevp;

	mevp = mevent_add_state(tfd, type, func, param, EV_ADD);
	return (mevp);
}
301 
/*
 * Register an event that starts out disabled.
 * Arguments and return value are those of mevent_add_state().
 */
struct mevent *
mevent_add_disabled(int tfd, enum ev_type type,
		    void (*func)(int, enum ev_type, void *), void *param)
{
	struct mevent *mevp;

	mevp = mevent_add_state(tfd, type, func, param, EV_ADD | EV_DISABLE);
	return (mevp);
}
309 
/*
 * Enable or disable an existing event.
 *
 * evp:    event to change; must not have been deleted.
 * enable: true to enable the event, false to disable it.
 *
 * Always returns 0.  If the requested state differs from the current
 * one, the event is queued on the change list (unless already there) and
 * the i/o thread is notified.
 */
static int
mevent_update(struct mevent *evp, bool enable)
{
	int wanted;

	mevent_qlock();

	/*
	 * It's not possible to enable/disable a deleted event
	 */
	assert((evp->me_state & EV_DELETE) == 0);

	if (enable)
		wanted = (evp->me_state | EV_ENABLE) & ~EV_DISABLE;
	else
		wanted = (evp->me_state | EV_DISABLE) & ~EV_ENABLE;

	/* Nothing to do when the state would not change. */
	if (wanted == evp->me_state)
		goto out;

	evp->me_state = wanted;

	/* Move the entry to the change list unless it is already queued. */
	if (evp->me_cq == 0) {
		evp->me_cq = 1;
		LIST_REMOVE(evp, me_list);
		LIST_INSERT_HEAD(&change_head, evp, me_list);
		mevent_notify();
	}

out:
	mevent_qunlock();

	return (0);
}
353 
/*
 * Enable a previously added event.  Returns the result of
 * mevent_update().
 */
int
mevent_enable(struct mevent *evp)
{
	int rc;

	rc = mevent_update(evp, true);
	return (rc);
}
360 
/*
 * Disable a previously added event.  Returns the result of
 * mevent_update().
 */
int
mevent_disable(struct mevent *evp)
{
	int rc;

	rc = mevent_update(evp, false);
	return (rc);
}
367 
/*
 * Mark an event for deletion and queue it for processing.
 *
 * evp:     event to delete.
 * closefd: non-zero to also have the event's descriptor closed when the
 *          change is processed.
 *
 * Always returns 0.
 */
static int
mevent_delete_event(struct mevent *evp, int closefd)
{
	mevent_qlock();

	/* Whatever state was pending is replaced by the delete request. */
	evp->me_state = EV_DELETE;
	if (closefd != 0)
		evp->me_closefd = 1;

	/* Move the entry to the change list unless it is already queued. */
	if (evp->me_cq == 0) {
		evp->me_cq = 1;
		LIST_REMOVE(evp, me_list);
		LIST_INSERT_HEAD(&change_head, evp, me_list);
		mevent_notify();
	}

	mevent_qunlock();

	return (0);
}
392 
/*
 * Delete an event, leaving its descriptor open.  Returns the result of
 * mevent_delete_event().
 */
int
mevent_delete(struct mevent *evp)
{
	int rc;

	rc = mevent_delete_event(evp, 0);
	return (rc);
}
399 
/*
 * Delete an event and have its descriptor closed.  Returns the result of
 * mevent_delete_event().
 */
int
mevent_delete_close(struct mevent *evp)
{
	int rc;

	rc = mevent_delete_event(evp, 1);
	return (rc);
}
406 
/*
 * Give the recorded i/o thread the name "mevent".
 */
static void
mevent_set_name(void)
{
	pthread_t tid = mevent_tid;

	pthread_set_name_np(tid, "mevent");
}
413 
/*
 * Run the event loop on the calling thread.  Does not return.
 *
 * Records the caller as the i/o thread, creates the kqueue and the wakeup
 * pipe, registers the pipe's read end as an internal event, and then loops
 * forever: push queued changes to the kqueue, block until events arrive,
 * and invoke the callbacks of the events that fired.  Failure to set up
 * the kqueue, the pipe, or the sandbox rights terminates the process with
 * EX_OSERR.
 */
void
mevent_dispatch(void)
{
	struct kevent changelist[MEVENT_MAX];
	struct kevent eventlist[MEVENT_MAX];
	struct mevent *pipev;
	int mfd;
	int numev;
	int ret;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	mevent_tid = pthread_self();
	mevent_set_name();

	/*
	 * kqueue() signals failure with -1; descriptor 0 is a legal result.
	 * This is a runtime condition, so it is checked explicitly rather
	 * than with an assertion that disappears under NDEBUG.
	 */
	mfd = kqueue();
	if (mfd == -1)
		err(EX_OSERR, "kqueue");

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_KQUEUE);
	if (caph_rights_limit(mfd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Open the pipe that will be used for other threads to force
	 * the blocking kqueue call to exit by writing to it.  Both ends
	 * are left in their default (blocking) mode.
	 */
	ret = pipe(mevent_pipefd);
	if (ret < 0) {
		/* A fatal setup error must not exit with a success status. */
		err(EX_OSERR, "pipe");
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(mevent_pipefd[0], &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
	if (caph_rights_limit(mevent_pipefd[1], &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Add internal event handler for the pipe read fd.  Without it
	 * other threads could not wake this loop, so failure is fatal.
	 */
	pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
	if (pipev == NULL)
		errx(EX_OSERR, "Unable to register wakeup pipe event");

	for (;;) {
		/*
		 * Build changelist if required.
		 * XXX the changelist can be put into the blocking call
		 * to eliminate the extra syscall. Currently better for
		 * debug.
		 */
		numev = mevent_build(mfd, changelist);
		if (numev) {
			ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
			if (ret == -1) {
				perror("Error return from kevent change");
			}
		}

		/*
		 * Block awaiting events
		 */
		ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
		if (ret == -1 && errno != EINTR) {
			perror("Error return from kevent monitor");
		}

		/*
		 * Handle reported events.  On error ret is -1 and the
		 * handler loop does nothing.
		 */
		mevent_handle(eventlist, ret);
	}
}
493