xref: /freebsd/contrib/libevent/epoll.c (revision b50261e21f39a6c7249a49e7b60aa878c98512a8)
1c43e99fdSEd Maste /*
2c43e99fdSEd Maste  * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
3c43e99fdSEd Maste  * Copyright 2007-2012 Niels Provos, Nick Mathewson
4c43e99fdSEd Maste  *
5c43e99fdSEd Maste  * Redistribution and use in source and binary forms, with or without
6c43e99fdSEd Maste  * modification, are permitted provided that the following conditions
7c43e99fdSEd Maste  * are met:
8c43e99fdSEd Maste  * 1. Redistributions of source code must retain the above copyright
9c43e99fdSEd Maste  *    notice, this list of conditions and the following disclaimer.
10c43e99fdSEd Maste  * 2. Redistributions in binary form must reproduce the above copyright
11c43e99fdSEd Maste  *    notice, this list of conditions and the following disclaimer in the
12c43e99fdSEd Maste  *    documentation and/or other materials provided with the distribution.
13c43e99fdSEd Maste  * 3. The name of the author may not be used to endorse or promote products
14c43e99fdSEd Maste  *    derived from this software without specific prior written permission.
15c43e99fdSEd Maste  *
16c43e99fdSEd Maste  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17c43e99fdSEd Maste  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18c43e99fdSEd Maste  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19c43e99fdSEd Maste  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20c43e99fdSEd Maste  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21c43e99fdSEd Maste  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22c43e99fdSEd Maste  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23c43e99fdSEd Maste  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24c43e99fdSEd Maste  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25c43e99fdSEd Maste  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26c43e99fdSEd Maste  */
27c43e99fdSEd Maste #include "event2/event-config.h"
28c43e99fdSEd Maste #include "evconfig-private.h"
29c43e99fdSEd Maste 
30c43e99fdSEd Maste #ifdef EVENT__HAVE_EPOLL
31c43e99fdSEd Maste 
32c43e99fdSEd Maste #include <stdint.h>
33c43e99fdSEd Maste #include <sys/types.h>
34c43e99fdSEd Maste #include <sys/resource.h>
35c43e99fdSEd Maste #ifdef EVENT__HAVE_SYS_TIME_H
36c43e99fdSEd Maste #include <sys/time.h>
37c43e99fdSEd Maste #endif
38c43e99fdSEd Maste #include <sys/queue.h>
39c43e99fdSEd Maste #include <sys/epoll.h>
40c43e99fdSEd Maste #include <signal.h>
41c43e99fdSEd Maste #include <limits.h>
42c43e99fdSEd Maste #include <stdio.h>
43c43e99fdSEd Maste #include <stdlib.h>
44c43e99fdSEd Maste #include <string.h>
45c43e99fdSEd Maste #include <unistd.h>
46c43e99fdSEd Maste #include <errno.h>
47c43e99fdSEd Maste #ifdef EVENT__HAVE_FCNTL_H
48c43e99fdSEd Maste #include <fcntl.h>
49c43e99fdSEd Maste #endif
50c43e99fdSEd Maste #ifdef EVENT__HAVE_SYS_TIMERFD_H
51c43e99fdSEd Maste #include <sys/timerfd.h>
52c43e99fdSEd Maste #endif
53c43e99fdSEd Maste 
54c43e99fdSEd Maste #include "event-internal.h"
55c43e99fdSEd Maste #include "evsignal-internal.h"
56c43e99fdSEd Maste #include "event2/thread.h"
57c43e99fdSEd Maste #include "evthread-internal.h"
58c43e99fdSEd Maste #include "log-internal.h"
59c43e99fdSEd Maste #include "evmap-internal.h"
60c43e99fdSEd Maste #include "changelist-internal.h"
61c43e99fdSEd Maste #include "time-internal.h"
62c43e99fdSEd Maste 
/* Since Linux 2.6.17, epoll can report that the peer has half-closed the
   connection by setting the special EPOLLRDHUP flag on a read event.  When
   the system headers do not provide EPOLLRDHUP, it is defined to 0 below.
*/
66c43e99fdSEd Maste #if !defined(EPOLLRDHUP)
67c43e99fdSEd Maste #define EPOLLRDHUP 0
68c43e99fdSEd Maste #define EARLY_CLOSE_IF_HAVE_RDHUP 0
69c43e99fdSEd Maste #else
70c43e99fdSEd Maste #define EARLY_CLOSE_IF_HAVE_RDHUP EV_FEATURE_EARLY_CLOSE
71c43e99fdSEd Maste #endif
72c43e99fdSEd Maste 
73c43e99fdSEd Maste #include "epolltable-internal.h"
74c43e99fdSEd Maste 
75c43e99fdSEd Maste #if defined(EVENT__HAVE_SYS_TIMERFD_H) &&			  \
76c43e99fdSEd Maste 	defined(EVENT__HAVE_TIMERFD_CREATE) &&			  \
77c43e99fdSEd Maste 	defined(HAVE_POSIX_MONOTONIC) && defined(TFD_NONBLOCK) && \
78c43e99fdSEd Maste 	defined(TFD_CLOEXEC)
79c43e99fdSEd Maste /* Note that we only use timerfd if TFD_NONBLOCK and TFD_CLOEXEC are available
80c43e99fdSEd Maste    and working.  This means that we can't support it on 2.6.25 (where timerfd
81c43e99fdSEd Maste    was introduced) or 2.6.26, since 2.6.27 introduced those flags.
82c43e99fdSEd Maste  */
83c43e99fdSEd Maste #define USING_TIMERFD
84c43e99fdSEd Maste #endif
85c43e99fdSEd Maste 
/* Per-event_base state of the epoll backend; allocated by epoll_init() and
 * released by epoll_dealloc(). */
struct epollop {
	/* Buffer that epoll_wait() fills with ready events. */
	struct epoll_event *events;
	/* Number of entries that `events` can hold. */
	int nevents;
	/* The epoll instance itself. */
	int epfd;
#ifdef USING_TIMERFD
	/* timerfd registered with `epfd` for precise timeouts, or -1 when
	 * timerfd is not in use. */
	int timerfd;
#endif
};
94c43e99fdSEd Maste 
95c43e99fdSEd Maste static void *epoll_init(struct event_base *);
96c43e99fdSEd Maste static int epoll_dispatch(struct event_base *, struct timeval *);
97c43e99fdSEd Maste static void epoll_dealloc(struct event_base *);
98c43e99fdSEd Maste 
99c43e99fdSEd Maste static const struct eventop epollops_changelist = {
100c43e99fdSEd Maste 	"epoll (with changelist)",
101c43e99fdSEd Maste 	epoll_init,
102c43e99fdSEd Maste 	event_changelist_add_,
103c43e99fdSEd Maste 	event_changelist_del_,
104c43e99fdSEd Maste 	epoll_dispatch,
105c43e99fdSEd Maste 	epoll_dealloc,
106c43e99fdSEd Maste 	1, /* need reinit */
107c43e99fdSEd Maste 	EV_FEATURE_ET|EV_FEATURE_O1| EARLY_CLOSE_IF_HAVE_RDHUP,
108c43e99fdSEd Maste 	EVENT_CHANGELIST_FDINFO_SIZE
109c43e99fdSEd Maste };
110c43e99fdSEd Maste 
111c43e99fdSEd Maste 
112c43e99fdSEd Maste static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
113c43e99fdSEd Maste     short old, short events, void *p);
114c43e99fdSEd Maste static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
115c43e99fdSEd Maste     short old, short events, void *p);
116c43e99fdSEd Maste 
117c43e99fdSEd Maste const struct eventop epollops = {
118c43e99fdSEd Maste 	"epoll",
119c43e99fdSEd Maste 	epoll_init,
120c43e99fdSEd Maste 	epoll_nochangelist_add,
121c43e99fdSEd Maste 	epoll_nochangelist_del,
122c43e99fdSEd Maste 	epoll_dispatch,
123c43e99fdSEd Maste 	epoll_dealloc,
124c43e99fdSEd Maste 	1, /* need reinit */
125c43e99fdSEd Maste 	EV_FEATURE_ET|EV_FEATURE_O1|EV_FEATURE_EARLY_CLOSE,
126c43e99fdSEd Maste 	0
127c43e99fdSEd Maste };
128c43e99fdSEd Maste 
/* Number of entries initially allocated for the epoll_wait() result buffer. */
#define INITIAL_NEVENT 32
/* The result buffer is no longer doubled once it holds this many entries. */
#define MAX_NEVENT 4096

/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482.  Let's
 * round that down by 47 seconds (to exactly 35 minutes).
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
139c43e99fdSEd Maste 
140c43e99fdSEd Maste static void *
epoll_init(struct event_base * base)141c43e99fdSEd Maste epoll_init(struct event_base *base)
142c43e99fdSEd Maste {
143c43e99fdSEd Maste 	int epfd = -1;
144c43e99fdSEd Maste 	struct epollop *epollop;
145c43e99fdSEd Maste 
146c43e99fdSEd Maste #ifdef EVENT__HAVE_EPOLL_CREATE1
147c43e99fdSEd Maste 	/* First, try the shiny new epoll_create1 interface, if we have it. */
148c43e99fdSEd Maste 	epfd = epoll_create1(EPOLL_CLOEXEC);
149c43e99fdSEd Maste #endif
150c43e99fdSEd Maste 	if (epfd == -1) {
151c43e99fdSEd Maste 		/* Initialize the kernel queue using the old interface.  (The
152c43e99fdSEd Maste 		size field is ignored   since 2.6.8.) */
153c43e99fdSEd Maste 		if ((epfd = epoll_create(32000)) == -1) {
154c43e99fdSEd Maste 			if (errno != ENOSYS)
155c43e99fdSEd Maste 				event_warn("epoll_create");
156c43e99fdSEd Maste 			return (NULL);
157c43e99fdSEd Maste 		}
158c43e99fdSEd Maste 		evutil_make_socket_closeonexec(epfd);
159c43e99fdSEd Maste 	}
160c43e99fdSEd Maste 
161c43e99fdSEd Maste 	if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
162c43e99fdSEd Maste 		close(epfd);
163c43e99fdSEd Maste 		return (NULL);
164c43e99fdSEd Maste 	}
165c43e99fdSEd Maste 
166c43e99fdSEd Maste 	epollop->epfd = epfd;
167c43e99fdSEd Maste 
168c43e99fdSEd Maste 	/* Initialize fields */
169c43e99fdSEd Maste 	epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
170c43e99fdSEd Maste 	if (epollop->events == NULL) {
171c43e99fdSEd Maste 		mm_free(epollop);
172c43e99fdSEd Maste 		close(epfd);
173c43e99fdSEd Maste 		return (NULL);
174c43e99fdSEd Maste 	}
175c43e99fdSEd Maste 	epollop->nevents = INITIAL_NEVENT;
176c43e99fdSEd Maste 
177c43e99fdSEd Maste 	if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
178c43e99fdSEd Maste 	    ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
179c43e99fdSEd Maste 		evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL)) {
180c43e99fdSEd Maste 
181c43e99fdSEd Maste 		base->evsel = &epollops_changelist;
182c43e99fdSEd Maste 	}
183c43e99fdSEd Maste 
184c43e99fdSEd Maste #ifdef USING_TIMERFD
185c43e99fdSEd Maste 	/*
186c43e99fdSEd Maste 	  The epoll interface ordinarily gives us one-millisecond precision,
187c43e99fdSEd Maste 	  so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE
188c43e99fdSEd Maste 	  timer.  But when the user has set the new PRECISE_TIMER flag for an
189c43e99fdSEd Maste 	  event_base, we can try to use timerfd to give them finer granularity.
190c43e99fdSEd Maste 	*/
191c43e99fdSEd Maste 	if ((base->flags & EVENT_BASE_FLAG_PRECISE_TIMER) &&
192c43e99fdSEd Maste 	    base->monotonic_timer.monotonic_clock == CLOCK_MONOTONIC) {
193c43e99fdSEd Maste 		int fd;
194c43e99fdSEd Maste 		fd = epollop->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
195c43e99fdSEd Maste 		if (epollop->timerfd >= 0) {
196c43e99fdSEd Maste 			struct epoll_event epev;
197c43e99fdSEd Maste 			memset(&epev, 0, sizeof(epev));
198c43e99fdSEd Maste 			epev.data.fd = epollop->timerfd;
199c43e99fdSEd Maste 			epev.events = EPOLLIN;
200c43e99fdSEd Maste 			if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, fd, &epev) < 0) {
201c43e99fdSEd Maste 				event_warn("epoll_ctl(timerfd)");
202c43e99fdSEd Maste 				close(fd);
203c43e99fdSEd Maste 				epollop->timerfd = -1;
204c43e99fdSEd Maste 			}
205c43e99fdSEd Maste 		} else {
206c43e99fdSEd Maste 			if (errno != EINVAL && errno != ENOSYS) {
207c43e99fdSEd Maste 				/* These errors probably mean that we were
208c43e99fdSEd Maste 				 * compiled with timerfd/TFD_* support, but
209c43e99fdSEd Maste 				 * we're running on a kernel that lacks those.
210c43e99fdSEd Maste 				 */
211c43e99fdSEd Maste 				event_warn("timerfd_create");
212c43e99fdSEd Maste 			}
213c43e99fdSEd Maste 			epollop->timerfd = -1;
214c43e99fdSEd Maste 		}
215c43e99fdSEd Maste 	} else {
216c43e99fdSEd Maste 		epollop->timerfd = -1;
217c43e99fdSEd Maste 	}
218c43e99fdSEd Maste #endif
219c43e99fdSEd Maste 
220c43e99fdSEd Maste 	evsig_init_(base);
221c43e99fdSEd Maste 
222c43e99fdSEd Maste 	return (epollop);
223c43e99fdSEd Maste }
224c43e99fdSEd Maste 
225c43e99fdSEd Maste static const char *
change_to_string(int change)226c43e99fdSEd Maste change_to_string(int change)
227c43e99fdSEd Maste {
228c43e99fdSEd Maste 	change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
229c43e99fdSEd Maste 	if (change == EV_CHANGE_ADD) {
230c43e99fdSEd Maste 		return "add";
231c43e99fdSEd Maste 	} else if (change == EV_CHANGE_DEL) {
232c43e99fdSEd Maste 		return "del";
233c43e99fdSEd Maste 	} else if (change == 0) {
234c43e99fdSEd Maste 		return "none";
235c43e99fdSEd Maste 	} else {
236c43e99fdSEd Maste 		return "???";
237c43e99fdSEd Maste 	}
238c43e99fdSEd Maste }
239c43e99fdSEd Maste 
/* Name an epoll_ctl() operation code for log messages.  Returns a static
 * string: "ADD", "DEL", "MOD", or "???" for any other value. */
static const char *
epoll_op_to_string(int op)
{
	switch (op) {
	case EPOLL_CTL_ADD:
		return "ADD";
	case EPOLL_CTL_DEL:
		return "DEL";
	case EPOLL_CTL_MOD:
		return "MOD";
	default:
		return "???";
	}
}
248c43e99fdSEd Maste 
/* Expands to a printf-style format string followed by its arguments, for
 * use inside event_debug(( ... )) or event_warn( ... ).
 *
 *   op      epoll_ctl() operation code that was attempted
 *   events  epoll event mask that was passed to the kernel
 *   ch      pointer to the struct event_change being applied
 *   status  string LITERAL spliced into the message (e.g. "okay")
 *
 * Arguments are parenthesized so that non-trivial expressions expand
 * safely, and the event mask is cast to int to match its %d conversion,
 * as the other log messages in this file do. */
#define PRINT_CHANGES(op, events, ch, status)  \
	"Epoll %s(%d) on fd %d " status ". "       \
	"Old events were %d; "                     \
	"read change was %d (%s); "                \
	"write change was %d (%s); "               \
	"close change was %d (%s)",                \
	epoll_op_to_string(op),                    \
	(int)(events),                             \
	(ch)->fd,                                  \
	(ch)->old_events,                          \
	(ch)->read_change,                         \
	change_to_string((ch)->read_change),       \
	(ch)->write_change,                        \
	change_to_string((ch)->write_change),      \
	(ch)->close_change,                        \
	change_to_string((ch)->close_change)
265c43e99fdSEd Maste 
266c43e99fdSEd Maste static int
epoll_apply_one_change(struct event_base * base,struct epollop * epollop,const struct event_change * ch)267c43e99fdSEd Maste epoll_apply_one_change(struct event_base *base,
268c43e99fdSEd Maste     struct epollop *epollop,
269c43e99fdSEd Maste     const struct event_change *ch)
270c43e99fdSEd Maste {
271c43e99fdSEd Maste 	struct epoll_event epev;
272c43e99fdSEd Maste 	int op, events = 0;
273c43e99fdSEd Maste 	int idx;
274c43e99fdSEd Maste 
275c43e99fdSEd Maste 	idx = EPOLL_OP_TABLE_INDEX(ch);
276c43e99fdSEd Maste 	op = epoll_op_table[idx].op;
277c43e99fdSEd Maste 	events = epoll_op_table[idx].events;
278c43e99fdSEd Maste 
279c43e99fdSEd Maste 	if (!events) {
280c43e99fdSEd Maste 		EVUTIL_ASSERT(op == 0);
281c43e99fdSEd Maste 		return 0;
282c43e99fdSEd Maste 	}
283c43e99fdSEd Maste 
284*b50261e2SCy Schubert 	if ((ch->read_change|ch->write_change|ch->close_change) & EV_CHANGE_ET)
285c43e99fdSEd Maste 		events |= EPOLLET;
286c43e99fdSEd Maste 
287c43e99fdSEd Maste 	memset(&epev, 0, sizeof(epev));
288c43e99fdSEd Maste 	epev.data.fd = ch->fd;
289c43e99fdSEd Maste 	epev.events = events;
290c43e99fdSEd Maste 	if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == 0) {
291c43e99fdSEd Maste 		event_debug((PRINT_CHANGES(op, epev.events, ch, "okay")));
292c43e99fdSEd Maste 		return 0;
293c43e99fdSEd Maste 	}
294c43e99fdSEd Maste 
295c43e99fdSEd Maste 	switch (op) {
296c43e99fdSEd Maste 	case EPOLL_CTL_MOD:
297c43e99fdSEd Maste 		if (errno == ENOENT) {
298c43e99fdSEd Maste 			/* If a MOD operation fails with ENOENT, the
299c43e99fdSEd Maste 			 * fd was probably closed and re-opened.  We
300c43e99fdSEd Maste 			 * should retry the operation as an ADD.
301c43e99fdSEd Maste 			 */
302c43e99fdSEd Maste 			if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
303c43e99fdSEd Maste 				event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
304c43e99fdSEd Maste 				    (int)epev.events, ch->fd);
305c43e99fdSEd Maste 				return -1;
306c43e99fdSEd Maste 			} else {
307c43e99fdSEd Maste 				event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
308c43e99fdSEd Maste 					(int)epev.events,
309c43e99fdSEd Maste 					ch->fd));
310c43e99fdSEd Maste 				return 0;
311c43e99fdSEd Maste 			}
312c43e99fdSEd Maste 		}
313c43e99fdSEd Maste 		break;
314c43e99fdSEd Maste 	case EPOLL_CTL_ADD:
315c43e99fdSEd Maste 		if (errno == EEXIST) {
316c43e99fdSEd Maste 			/* If an ADD operation fails with EEXIST,
317c43e99fdSEd Maste 			 * either the operation was redundant (as with a
318c43e99fdSEd Maste 			 * precautionary add), or we ran into a fun
319c43e99fdSEd Maste 			 * kernel bug where using dup*() to duplicate the
320c43e99fdSEd Maste 			 * same file into the same fd gives you the same epitem
321c43e99fdSEd Maste 			 * rather than a fresh one.  For the second case,
322c43e99fdSEd Maste 			 * we must retry with MOD. */
323c43e99fdSEd Maste 			if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
324c43e99fdSEd Maste 				event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
325c43e99fdSEd Maste 				    (int)epev.events, ch->fd);
326c43e99fdSEd Maste 				return -1;
327c43e99fdSEd Maste 			} else {
328c43e99fdSEd Maste 				event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
329c43e99fdSEd Maste 					(int)epev.events,
330c43e99fdSEd Maste 					ch->fd));
331c43e99fdSEd Maste 				return 0;
332c43e99fdSEd Maste 			}
333c43e99fdSEd Maste 		}
334c43e99fdSEd Maste 		break;
335c43e99fdSEd Maste 	case EPOLL_CTL_DEL:
336c43e99fdSEd Maste 		if (errno == ENOENT || errno == EBADF || errno == EPERM) {
337c43e99fdSEd Maste 			/* If a delete fails with one of these errors,
338c43e99fdSEd Maste 			 * that's fine too: we closed the fd before we
339c43e99fdSEd Maste 			 * got around to calling epoll_dispatch. */
340c43e99fdSEd Maste 			event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
341c43e99fdSEd Maste 				(int)epev.events,
342c43e99fdSEd Maste 				ch->fd,
343c43e99fdSEd Maste 				strerror(errno)));
344c43e99fdSEd Maste 			return 0;
345c43e99fdSEd Maste 		}
346c43e99fdSEd Maste 		break;
347c43e99fdSEd Maste 	default:
348c43e99fdSEd Maste 		break;
349c43e99fdSEd Maste 	}
350c43e99fdSEd Maste 
351c43e99fdSEd Maste 	event_warn(PRINT_CHANGES(op, epev.events, ch, "failed"));
352c43e99fdSEd Maste 	return -1;
353c43e99fdSEd Maste }
354c43e99fdSEd Maste 
355c43e99fdSEd Maste static int
epoll_apply_changes(struct event_base * base)356c43e99fdSEd Maste epoll_apply_changes(struct event_base *base)
357c43e99fdSEd Maste {
358c43e99fdSEd Maste 	struct event_changelist *changelist = &base->changelist;
359c43e99fdSEd Maste 	struct epollop *epollop = base->evbase;
360c43e99fdSEd Maste 	struct event_change *ch;
361c43e99fdSEd Maste 
362c43e99fdSEd Maste 	int r = 0;
363c43e99fdSEd Maste 	int i;
364c43e99fdSEd Maste 
365c43e99fdSEd Maste 	for (i = 0; i < changelist->n_changes; ++i) {
366c43e99fdSEd Maste 		ch = &changelist->changes[i];
367c43e99fdSEd Maste 		if (epoll_apply_one_change(base, epollop, ch) < 0)
368c43e99fdSEd Maste 			r = -1;
369c43e99fdSEd Maste 	}
370c43e99fdSEd Maste 
371c43e99fdSEd Maste 	return (r);
372c43e99fdSEd Maste }
373c43e99fdSEd Maste 
374c43e99fdSEd Maste static int
epoll_nochangelist_add(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)375c43e99fdSEd Maste epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
376c43e99fdSEd Maste     short old, short events, void *p)
377c43e99fdSEd Maste {
378c43e99fdSEd Maste 	struct event_change ch;
379c43e99fdSEd Maste 	ch.fd = fd;
380c43e99fdSEd Maste 	ch.old_events = old;
381c43e99fdSEd Maste 	ch.read_change = ch.write_change = ch.close_change = 0;
382c43e99fdSEd Maste 	if (events & EV_WRITE)
383c43e99fdSEd Maste 		ch.write_change = EV_CHANGE_ADD |
384c43e99fdSEd Maste 		    (events & EV_ET);
385c43e99fdSEd Maste 	if (events & EV_READ)
386c43e99fdSEd Maste 		ch.read_change = EV_CHANGE_ADD |
387c43e99fdSEd Maste 		    (events & EV_ET);
388c43e99fdSEd Maste 	if (events & EV_CLOSED)
389c43e99fdSEd Maste 		ch.close_change = EV_CHANGE_ADD |
390c43e99fdSEd Maste 		    (events & EV_ET);
391c43e99fdSEd Maste 
392c43e99fdSEd Maste 	return epoll_apply_one_change(base, base->evbase, &ch);
393c43e99fdSEd Maste }
394c43e99fdSEd Maste 
395c43e99fdSEd Maste static int
epoll_nochangelist_del(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)396c43e99fdSEd Maste epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
397c43e99fdSEd Maste     short old, short events, void *p)
398c43e99fdSEd Maste {
399c43e99fdSEd Maste 	struct event_change ch;
400c43e99fdSEd Maste 	ch.fd = fd;
401c43e99fdSEd Maste 	ch.old_events = old;
402c43e99fdSEd Maste 	ch.read_change = ch.write_change = ch.close_change = 0;
403c43e99fdSEd Maste 	if (events & EV_WRITE)
404*b50261e2SCy Schubert 		ch.write_change = EV_CHANGE_DEL |
405*b50261e2SCy Schubert 		    (events & EV_ET);
406c43e99fdSEd Maste 	if (events & EV_READ)
407*b50261e2SCy Schubert 		ch.read_change = EV_CHANGE_DEL |
408*b50261e2SCy Schubert 		    (events & EV_ET);
409c43e99fdSEd Maste 	if (events & EV_CLOSED)
410*b50261e2SCy Schubert 		ch.close_change = EV_CHANGE_DEL |
411*b50261e2SCy Schubert 		    (events & EV_ET);
412c43e99fdSEd Maste 
413c43e99fdSEd Maste 	return epoll_apply_one_change(base, base->evbase, &ch);
414c43e99fdSEd Maste }
415c43e99fdSEd Maste 
416c43e99fdSEd Maste static int
epoll_dispatch(struct event_base * base,struct timeval * tv)417c43e99fdSEd Maste epoll_dispatch(struct event_base *base, struct timeval *tv)
418c43e99fdSEd Maste {
419c43e99fdSEd Maste 	struct epollop *epollop = base->evbase;
420c43e99fdSEd Maste 	struct epoll_event *events = epollop->events;
421c43e99fdSEd Maste 	int i, res;
422c43e99fdSEd Maste 	long timeout = -1;
423c43e99fdSEd Maste 
424c43e99fdSEd Maste #ifdef USING_TIMERFD
425c43e99fdSEd Maste 	if (epollop->timerfd >= 0) {
426c43e99fdSEd Maste 		struct itimerspec is;
427c43e99fdSEd Maste 		is.it_interval.tv_sec = 0;
428c43e99fdSEd Maste 		is.it_interval.tv_nsec = 0;
429c43e99fdSEd Maste 		if (tv == NULL) {
430c43e99fdSEd Maste 			/* No timeout; disarm the timer. */
431c43e99fdSEd Maste 			is.it_value.tv_sec = 0;
432c43e99fdSEd Maste 			is.it_value.tv_nsec = 0;
433c43e99fdSEd Maste 		} else {
434c43e99fdSEd Maste 			if (tv->tv_sec == 0 && tv->tv_usec == 0) {
435c43e99fdSEd Maste 				/* we need to exit immediately; timerfd can't
436c43e99fdSEd Maste 				 * do that. */
437c43e99fdSEd Maste 				timeout = 0;
438c43e99fdSEd Maste 			}
439c43e99fdSEd Maste 			is.it_value.tv_sec = tv->tv_sec;
440c43e99fdSEd Maste 			is.it_value.tv_nsec = tv->tv_usec * 1000;
441c43e99fdSEd Maste 		}
442c43e99fdSEd Maste 		/* TODO: we could avoid unnecessary syscalls here by only
443c43e99fdSEd Maste 		   calling timerfd_settime when the top timeout changes, or
444c43e99fdSEd Maste 		   when we're called with a different timeval.
445c43e99fdSEd Maste 		*/
446c43e99fdSEd Maste 		if (timerfd_settime(epollop->timerfd, 0, &is, NULL) < 0) {
447c43e99fdSEd Maste 			event_warn("timerfd_settime");
448c43e99fdSEd Maste 		}
449c43e99fdSEd Maste 	} else
450c43e99fdSEd Maste #endif
451c43e99fdSEd Maste 	if (tv != NULL) {
452c43e99fdSEd Maste 		timeout = evutil_tv_to_msec_(tv);
453c43e99fdSEd Maste 		if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
454c43e99fdSEd Maste 			/* Linux kernels can wait forever if the timeout is
455c43e99fdSEd Maste 			 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
456c43e99fdSEd Maste 			timeout = MAX_EPOLL_TIMEOUT_MSEC;
457c43e99fdSEd Maste 		}
458c43e99fdSEd Maste 	}
459c43e99fdSEd Maste 
460c43e99fdSEd Maste 	epoll_apply_changes(base);
461c43e99fdSEd Maste 	event_changelist_remove_all_(&base->changelist, base);
462c43e99fdSEd Maste 
463c43e99fdSEd Maste 	EVBASE_RELEASE_LOCK(base, th_base_lock);
464c43e99fdSEd Maste 
465c43e99fdSEd Maste 	res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
466c43e99fdSEd Maste 
467c43e99fdSEd Maste 	EVBASE_ACQUIRE_LOCK(base, th_base_lock);
468c43e99fdSEd Maste 
469c43e99fdSEd Maste 	if (res == -1) {
470c43e99fdSEd Maste 		if (errno != EINTR) {
471c43e99fdSEd Maste 			event_warn("epoll_wait");
472c43e99fdSEd Maste 			return (-1);
473c43e99fdSEd Maste 		}
474c43e99fdSEd Maste 
475c43e99fdSEd Maste 		return (0);
476c43e99fdSEd Maste 	}
477c43e99fdSEd Maste 
478c43e99fdSEd Maste 	event_debug(("%s: epoll_wait reports %d", __func__, res));
479c43e99fdSEd Maste 	EVUTIL_ASSERT(res <= epollop->nevents);
480c43e99fdSEd Maste 
481c43e99fdSEd Maste 	for (i = 0; i < res; i++) {
482c43e99fdSEd Maste 		int what = events[i].events;
483c43e99fdSEd Maste 		short ev = 0;
484c43e99fdSEd Maste #ifdef USING_TIMERFD
485c43e99fdSEd Maste 		if (events[i].data.fd == epollop->timerfd)
486c43e99fdSEd Maste 			continue;
487c43e99fdSEd Maste #endif
488c43e99fdSEd Maste 
489*b50261e2SCy Schubert 		if (what & EPOLLERR) {
490*b50261e2SCy Schubert 			ev = EV_READ | EV_WRITE;
491*b50261e2SCy Schubert 		} else if ((what & EPOLLHUP) && !(what & EPOLLRDHUP)) {
492c43e99fdSEd Maste 			ev = EV_READ | EV_WRITE;
493c43e99fdSEd Maste 		} else {
494c43e99fdSEd Maste 			if (what & EPOLLIN)
495c43e99fdSEd Maste 				ev |= EV_READ;
496c43e99fdSEd Maste 			if (what & EPOLLOUT)
497c43e99fdSEd Maste 				ev |= EV_WRITE;
498c43e99fdSEd Maste 			if (what & EPOLLRDHUP)
499c43e99fdSEd Maste 				ev |= EV_CLOSED;
500c43e99fdSEd Maste 		}
501c43e99fdSEd Maste 
502c43e99fdSEd Maste 		if (!ev)
503c43e99fdSEd Maste 			continue;
504c43e99fdSEd Maste 
505c43e99fdSEd Maste 		evmap_io_active_(base, events[i].data.fd, ev | EV_ET);
506c43e99fdSEd Maste 	}
507c43e99fdSEd Maste 
508c43e99fdSEd Maste 	if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
509c43e99fdSEd Maste 		/* We used all of the event space this time.  We should
510c43e99fdSEd Maste 		   be ready for more events next time. */
511c43e99fdSEd Maste 		int new_nevents = epollop->nevents * 2;
512c43e99fdSEd Maste 		struct epoll_event *new_events;
513c43e99fdSEd Maste 
514c43e99fdSEd Maste 		new_events = mm_realloc(epollop->events,
515c43e99fdSEd Maste 		    new_nevents * sizeof(struct epoll_event));
516c43e99fdSEd Maste 		if (new_events) {
517c43e99fdSEd Maste 			epollop->events = new_events;
518c43e99fdSEd Maste 			epollop->nevents = new_nevents;
519c43e99fdSEd Maste 		}
520c43e99fdSEd Maste 	}
521c43e99fdSEd Maste 
522c43e99fdSEd Maste 	return (0);
523c43e99fdSEd Maste }
524c43e99fdSEd Maste 
525c43e99fdSEd Maste 
526c43e99fdSEd Maste static void
epoll_dealloc(struct event_base * base)527c43e99fdSEd Maste epoll_dealloc(struct event_base *base)
528c43e99fdSEd Maste {
529c43e99fdSEd Maste 	struct epollop *epollop = base->evbase;
530c43e99fdSEd Maste 
531c43e99fdSEd Maste 	evsig_dealloc_(base);
532c43e99fdSEd Maste 	if (epollop->events)
533c43e99fdSEd Maste 		mm_free(epollop->events);
534c43e99fdSEd Maste 	if (epollop->epfd >= 0)
535c43e99fdSEd Maste 		close(epollop->epfd);
536c43e99fdSEd Maste #ifdef USING_TIMERFD
537c43e99fdSEd Maste 	if (epollop->timerfd >= 0)
538c43e99fdSEd Maste 		close(epollop->timerfd);
539c43e99fdSEd Maste #endif
540c43e99fdSEd Maste 
541c43e99fdSEd Maste 	memset(epollop, 0, sizeof(struct epollop));
542c43e99fdSEd Maste 	mm_free(epollop);
543c43e99fdSEd Maste }
544c43e99fdSEd Maste 
545c43e99fdSEd Maste #endif /* EVENT__HAVE_EPOLL */
546