1c43e99fdSEd Maste /*
2c43e99fdSEd Maste * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
3c43e99fdSEd Maste * Copyright 2007-2012 Niels Provos, Nick Mathewson
4c43e99fdSEd Maste *
5c43e99fdSEd Maste * Redistribution and use in source and binary forms, with or without
6c43e99fdSEd Maste * modification, are permitted provided that the following conditions
7c43e99fdSEd Maste * are met:
8c43e99fdSEd Maste * 1. Redistributions of source code must retain the above copyright
9c43e99fdSEd Maste * notice, this list of conditions and the following disclaimer.
10c43e99fdSEd Maste * 2. Redistributions in binary form must reproduce the above copyright
11c43e99fdSEd Maste * notice, this list of conditions and the following disclaimer in the
12c43e99fdSEd Maste * documentation and/or other materials provided with the distribution.
13c43e99fdSEd Maste * 3. The name of the author may not be used to endorse or promote products
14c43e99fdSEd Maste * derived from this software without specific prior written permission.
15c43e99fdSEd Maste *
16c43e99fdSEd Maste * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17c43e99fdSEd Maste * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18c43e99fdSEd Maste * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19c43e99fdSEd Maste * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20c43e99fdSEd Maste * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21c43e99fdSEd Maste * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22c43e99fdSEd Maste * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23c43e99fdSEd Maste * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24c43e99fdSEd Maste * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25c43e99fdSEd Maste * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26c43e99fdSEd Maste */
27c43e99fdSEd Maste #include "event2/event-config.h"
28c43e99fdSEd Maste #include "evconfig-private.h"
29c43e99fdSEd Maste
30c43e99fdSEd Maste #ifdef EVENT__HAVE_EPOLL
31c43e99fdSEd Maste
32c43e99fdSEd Maste #include <stdint.h>
33c43e99fdSEd Maste #include <sys/types.h>
34c43e99fdSEd Maste #include <sys/resource.h>
35c43e99fdSEd Maste #ifdef EVENT__HAVE_SYS_TIME_H
36c43e99fdSEd Maste #include <sys/time.h>
37c43e99fdSEd Maste #endif
38c43e99fdSEd Maste #include <sys/queue.h>
39c43e99fdSEd Maste #include <sys/epoll.h>
40c43e99fdSEd Maste #include <signal.h>
41c43e99fdSEd Maste #include <limits.h>
42c43e99fdSEd Maste #include <stdio.h>
43c43e99fdSEd Maste #include <stdlib.h>
44c43e99fdSEd Maste #include <string.h>
45c43e99fdSEd Maste #include <unistd.h>
46c43e99fdSEd Maste #include <errno.h>
47c43e99fdSEd Maste #ifdef EVENT__HAVE_FCNTL_H
48c43e99fdSEd Maste #include <fcntl.h>
49c43e99fdSEd Maste #endif
50c43e99fdSEd Maste #ifdef EVENT__HAVE_SYS_TIMERFD_H
51c43e99fdSEd Maste #include <sys/timerfd.h>
52c43e99fdSEd Maste #endif
53c43e99fdSEd Maste
54c43e99fdSEd Maste #include "event-internal.h"
55c43e99fdSEd Maste #include "evsignal-internal.h"
56c43e99fdSEd Maste #include "event2/thread.h"
57c43e99fdSEd Maste #include "evthread-internal.h"
58c43e99fdSEd Maste #include "log-internal.h"
59c43e99fdSEd Maste #include "evmap-internal.h"
60c43e99fdSEd Maste #include "changelist-internal.h"
61c43e99fdSEd Maste #include "time-internal.h"
62c43e99fdSEd Maste
/* Since Linux 2.6.17, epoll can report a half-closed peer connection by
   setting the EPOLLRDHUP flag on a read event.

   When the system headers do not provide EPOLLRDHUP, it is defined to 0 so
   that it can still be OR-ed into and tested against event masks.
   EARLY_CLOSE_IF_HAVE_RDHUP expands to EV_FEATURE_EARLY_CLOSE only when
   EPOLLRDHUP is available, and to 0 otherwise.
 */
#if defined(EPOLLRDHUP)
#define EARLY_CLOSE_IF_HAVE_RDHUP EV_FEATURE_EARLY_CLOSE
#else
#define EPOLLRDHUP 0
#define EARLY_CLOSE_IF_HAVE_RDHUP 0
#endif
72c43e99fdSEd Maste
73c43e99fdSEd Maste #include "epolltable-internal.h"
74c43e99fdSEd Maste
/* timerfd is only used when TFD_NONBLOCK and TFD_CLOEXEC are available and
   working.  timerfd itself was introduced in Linux 2.6.25, but those flags
   only arrived in 2.6.27, so 2.6.25 and 2.6.26 cannot be supported.
 */
#if defined(EVENT__HAVE_SYS_TIMERFD_H) && defined(EVENT__HAVE_TIMERFD_CREATE)
#if defined(HAVE_POSIX_MONOTONIC) && defined(TFD_NONBLOCK) && \
    defined(TFD_CLOEXEC)
#define USING_TIMERFD
#endif
#endif
85c43e99fdSEd Maste
/* Per-event_base state of the epoll backend. */
struct epollop {
	/* Buffer handed to epoll_wait() to receive ready events. */
	struct epoll_event *events;
	/* Number of entries that 'events' can hold. */
	int nevents;
	/* The epoll instance itself. */
	int epfd;
#ifdef USING_TIMERFD
	/* timerfd registered with epfd, or -1 when timerfd is not in use. */
	int timerfd;
#endif
};
94c43e99fdSEd Maste
95c43e99fdSEd Maste static void *epoll_init(struct event_base *);
96c43e99fdSEd Maste static int epoll_dispatch(struct event_base *, struct timeval *);
97c43e99fdSEd Maste static void epoll_dealloc(struct event_base *);
98c43e99fdSEd Maste
99c43e99fdSEd Maste static const struct eventop epollops_changelist = {
100c43e99fdSEd Maste "epoll (with changelist)",
101c43e99fdSEd Maste epoll_init,
102c43e99fdSEd Maste event_changelist_add_,
103c43e99fdSEd Maste event_changelist_del_,
104c43e99fdSEd Maste epoll_dispatch,
105c43e99fdSEd Maste epoll_dealloc,
106c43e99fdSEd Maste 1, /* need reinit */
107c43e99fdSEd Maste EV_FEATURE_ET|EV_FEATURE_O1| EARLY_CLOSE_IF_HAVE_RDHUP,
108c43e99fdSEd Maste EVENT_CHANGELIST_FDINFO_SIZE
109c43e99fdSEd Maste };
110c43e99fdSEd Maste
111c43e99fdSEd Maste
112c43e99fdSEd Maste static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
113c43e99fdSEd Maste short old, short events, void *p);
114c43e99fdSEd Maste static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
115c43e99fdSEd Maste short old, short events, void *p);
116c43e99fdSEd Maste
117c43e99fdSEd Maste const struct eventop epollops = {
118c43e99fdSEd Maste "epoll",
119c43e99fdSEd Maste epoll_init,
120c43e99fdSEd Maste epoll_nochangelist_add,
121c43e99fdSEd Maste epoll_nochangelist_del,
122c43e99fdSEd Maste epoll_dispatch,
123c43e99fdSEd Maste epoll_dealloc,
124c43e99fdSEd Maste 1, /* need reinit */
125c43e99fdSEd Maste EV_FEATURE_ET|EV_FEATURE_O1|EV_FEATURE_EARLY_CLOSE,
126c43e99fdSEd Maste 0
127c43e99fdSEd Maste };
128c43e99fdSEd Maste
/* Initial capacity of the epoll_wait() result buffer, and the capacity at
 * or above which epoll_dispatch() stops doubling it. */
#define INITIAL_NEVENT 32
#define MAX_NEVENT 4096

/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482.  Rounding
 * that down by 47 seconds gives the 35-minute cap used below.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
139c43e99fdSEd Maste
140c43e99fdSEd Maste static void *
epoll_init(struct event_base * base)141c43e99fdSEd Maste epoll_init(struct event_base *base)
142c43e99fdSEd Maste {
143c43e99fdSEd Maste int epfd = -1;
144c43e99fdSEd Maste struct epollop *epollop;
145c43e99fdSEd Maste
146c43e99fdSEd Maste #ifdef EVENT__HAVE_EPOLL_CREATE1
147c43e99fdSEd Maste /* First, try the shiny new epoll_create1 interface, if we have it. */
148c43e99fdSEd Maste epfd = epoll_create1(EPOLL_CLOEXEC);
149c43e99fdSEd Maste #endif
150c43e99fdSEd Maste if (epfd == -1) {
151c43e99fdSEd Maste /* Initialize the kernel queue using the old interface. (The
152c43e99fdSEd Maste size field is ignored since 2.6.8.) */
153c43e99fdSEd Maste if ((epfd = epoll_create(32000)) == -1) {
154c43e99fdSEd Maste if (errno != ENOSYS)
155c43e99fdSEd Maste event_warn("epoll_create");
156c43e99fdSEd Maste return (NULL);
157c43e99fdSEd Maste }
158c43e99fdSEd Maste evutil_make_socket_closeonexec(epfd);
159c43e99fdSEd Maste }
160c43e99fdSEd Maste
161c43e99fdSEd Maste if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
162c43e99fdSEd Maste close(epfd);
163c43e99fdSEd Maste return (NULL);
164c43e99fdSEd Maste }
165c43e99fdSEd Maste
166c43e99fdSEd Maste epollop->epfd = epfd;
167c43e99fdSEd Maste
168c43e99fdSEd Maste /* Initialize fields */
169c43e99fdSEd Maste epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
170c43e99fdSEd Maste if (epollop->events == NULL) {
171c43e99fdSEd Maste mm_free(epollop);
172c43e99fdSEd Maste close(epfd);
173c43e99fdSEd Maste return (NULL);
174c43e99fdSEd Maste }
175c43e99fdSEd Maste epollop->nevents = INITIAL_NEVENT;
176c43e99fdSEd Maste
177c43e99fdSEd Maste if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
178c43e99fdSEd Maste ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
179c43e99fdSEd Maste evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL)) {
180c43e99fdSEd Maste
181c43e99fdSEd Maste base->evsel = &epollops_changelist;
182c43e99fdSEd Maste }
183c43e99fdSEd Maste
184c43e99fdSEd Maste #ifdef USING_TIMERFD
185c43e99fdSEd Maste /*
186c43e99fdSEd Maste The epoll interface ordinarily gives us one-millisecond precision,
187c43e99fdSEd Maste so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE
188c43e99fdSEd Maste timer. But when the user has set the new PRECISE_TIMER flag for an
189c43e99fdSEd Maste event_base, we can try to use timerfd to give them finer granularity.
190c43e99fdSEd Maste */
191c43e99fdSEd Maste if ((base->flags & EVENT_BASE_FLAG_PRECISE_TIMER) &&
192c43e99fdSEd Maste base->monotonic_timer.monotonic_clock == CLOCK_MONOTONIC) {
193c43e99fdSEd Maste int fd;
194c43e99fdSEd Maste fd = epollop->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
195c43e99fdSEd Maste if (epollop->timerfd >= 0) {
196c43e99fdSEd Maste struct epoll_event epev;
197c43e99fdSEd Maste memset(&epev, 0, sizeof(epev));
198c43e99fdSEd Maste epev.data.fd = epollop->timerfd;
199c43e99fdSEd Maste epev.events = EPOLLIN;
200c43e99fdSEd Maste if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, fd, &epev) < 0) {
201c43e99fdSEd Maste event_warn("epoll_ctl(timerfd)");
202c43e99fdSEd Maste close(fd);
203c43e99fdSEd Maste epollop->timerfd = -1;
204c43e99fdSEd Maste }
205c43e99fdSEd Maste } else {
206c43e99fdSEd Maste if (errno != EINVAL && errno != ENOSYS) {
207c43e99fdSEd Maste /* These errors probably mean that we were
208c43e99fdSEd Maste * compiled with timerfd/TFD_* support, but
209c43e99fdSEd Maste * we're running on a kernel that lacks those.
210c43e99fdSEd Maste */
211c43e99fdSEd Maste event_warn("timerfd_create");
212c43e99fdSEd Maste }
213c43e99fdSEd Maste epollop->timerfd = -1;
214c43e99fdSEd Maste }
215c43e99fdSEd Maste } else {
216c43e99fdSEd Maste epollop->timerfd = -1;
217c43e99fdSEd Maste }
218c43e99fdSEd Maste #endif
219c43e99fdSEd Maste
220c43e99fdSEd Maste evsig_init_(base);
221c43e99fdSEd Maste
222c43e99fdSEd Maste return (epollop);
223c43e99fdSEd Maste }
224c43e99fdSEd Maste
225c43e99fdSEd Maste static const char *
change_to_string(int change)226c43e99fdSEd Maste change_to_string(int change)
227c43e99fdSEd Maste {
228c43e99fdSEd Maste change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
229c43e99fdSEd Maste if (change == EV_CHANGE_ADD) {
230c43e99fdSEd Maste return "add";
231c43e99fdSEd Maste } else if (change == EV_CHANGE_DEL) {
232c43e99fdSEd Maste return "del";
233c43e99fdSEd Maste } else if (change == 0) {
234c43e99fdSEd Maste return "none";
235c43e99fdSEd Maste } else {
236c43e99fdSEd Maste return "???";
237c43e99fdSEd Maste }
238c43e99fdSEd Maste }
239c43e99fdSEd Maste
/* Return the name of an EPOLL_CTL_* operation for log messages, or "???"
 * for any other value. */
static const char *
epoll_op_to_string(int op)
{
	switch (op) {
	case EPOLL_CTL_ADD:
		return "ADD";
	case EPOLL_CTL_DEL:
		return "DEL";
	case EPOLL_CTL_MOD:
		return "MOD";
	default:
		return "???";
	}
}
248c43e99fdSEd Maste
/* Expands to a printf-style format string followed by its arguments,
 * describing one epoll_ctl() attempt.  'ctl_op' is the EPOLL_CTL_* value,
 * 'evmask' the epoll event mask, 'chg' a pointer to the struct event_change
 * being applied, and 'status' a string literal spliced into the format. */
#define PRINT_CHANGES(ctl_op, evmask, chg, status)	\
	"Epoll %s(%d) on fd %d " status ". "		\
	"Old events were %d; "				\
	"read change was %d (%s); "			\
	"write change was %d (%s); "			\
	"close change was %d (%s)",			\
	epoll_op_to_string(ctl_op),			\
	(evmask),					\
	(chg)->fd,					\
	(chg)->old_events,				\
	(chg)->read_change,				\
	change_to_string((chg)->read_change),		\
	(chg)->write_change,				\
	change_to_string((chg)->write_change),		\
	(chg)->close_change,				\
	change_to_string((chg)->close_change)
265c43e99fdSEd Maste
266c43e99fdSEd Maste static int
epoll_apply_one_change(struct event_base * base,struct epollop * epollop,const struct event_change * ch)267c43e99fdSEd Maste epoll_apply_one_change(struct event_base *base,
268c43e99fdSEd Maste struct epollop *epollop,
269c43e99fdSEd Maste const struct event_change *ch)
270c43e99fdSEd Maste {
271c43e99fdSEd Maste struct epoll_event epev;
272c43e99fdSEd Maste int op, events = 0;
273c43e99fdSEd Maste int idx;
274c43e99fdSEd Maste
275c43e99fdSEd Maste idx = EPOLL_OP_TABLE_INDEX(ch);
276c43e99fdSEd Maste op = epoll_op_table[idx].op;
277c43e99fdSEd Maste events = epoll_op_table[idx].events;
278c43e99fdSEd Maste
279c43e99fdSEd Maste if (!events) {
280c43e99fdSEd Maste EVUTIL_ASSERT(op == 0);
281c43e99fdSEd Maste return 0;
282c43e99fdSEd Maste }
283c43e99fdSEd Maste
284*b50261e2SCy Schubert if ((ch->read_change|ch->write_change|ch->close_change) & EV_CHANGE_ET)
285c43e99fdSEd Maste events |= EPOLLET;
286c43e99fdSEd Maste
287c43e99fdSEd Maste memset(&epev, 0, sizeof(epev));
288c43e99fdSEd Maste epev.data.fd = ch->fd;
289c43e99fdSEd Maste epev.events = events;
290c43e99fdSEd Maste if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == 0) {
291c43e99fdSEd Maste event_debug((PRINT_CHANGES(op, epev.events, ch, "okay")));
292c43e99fdSEd Maste return 0;
293c43e99fdSEd Maste }
294c43e99fdSEd Maste
295c43e99fdSEd Maste switch (op) {
296c43e99fdSEd Maste case EPOLL_CTL_MOD:
297c43e99fdSEd Maste if (errno == ENOENT) {
298c43e99fdSEd Maste /* If a MOD operation fails with ENOENT, the
299c43e99fdSEd Maste * fd was probably closed and re-opened. We
300c43e99fdSEd Maste * should retry the operation as an ADD.
301c43e99fdSEd Maste */
302c43e99fdSEd Maste if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
303c43e99fdSEd Maste event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
304c43e99fdSEd Maste (int)epev.events, ch->fd);
305c43e99fdSEd Maste return -1;
306c43e99fdSEd Maste } else {
307c43e99fdSEd Maste event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
308c43e99fdSEd Maste (int)epev.events,
309c43e99fdSEd Maste ch->fd));
310c43e99fdSEd Maste return 0;
311c43e99fdSEd Maste }
312c43e99fdSEd Maste }
313c43e99fdSEd Maste break;
314c43e99fdSEd Maste case EPOLL_CTL_ADD:
315c43e99fdSEd Maste if (errno == EEXIST) {
316c43e99fdSEd Maste /* If an ADD operation fails with EEXIST,
317c43e99fdSEd Maste * either the operation was redundant (as with a
318c43e99fdSEd Maste * precautionary add), or we ran into a fun
319c43e99fdSEd Maste * kernel bug where using dup*() to duplicate the
320c43e99fdSEd Maste * same file into the same fd gives you the same epitem
321c43e99fdSEd Maste * rather than a fresh one. For the second case,
322c43e99fdSEd Maste * we must retry with MOD. */
323c43e99fdSEd Maste if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
324c43e99fdSEd Maste event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
325c43e99fdSEd Maste (int)epev.events, ch->fd);
326c43e99fdSEd Maste return -1;
327c43e99fdSEd Maste } else {
328c43e99fdSEd Maste event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
329c43e99fdSEd Maste (int)epev.events,
330c43e99fdSEd Maste ch->fd));
331c43e99fdSEd Maste return 0;
332c43e99fdSEd Maste }
333c43e99fdSEd Maste }
334c43e99fdSEd Maste break;
335c43e99fdSEd Maste case EPOLL_CTL_DEL:
336c43e99fdSEd Maste if (errno == ENOENT || errno == EBADF || errno == EPERM) {
337c43e99fdSEd Maste /* If a delete fails with one of these errors,
338c43e99fdSEd Maste * that's fine too: we closed the fd before we
339c43e99fdSEd Maste * got around to calling epoll_dispatch. */
340c43e99fdSEd Maste event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
341c43e99fdSEd Maste (int)epev.events,
342c43e99fdSEd Maste ch->fd,
343c43e99fdSEd Maste strerror(errno)));
344c43e99fdSEd Maste return 0;
345c43e99fdSEd Maste }
346c43e99fdSEd Maste break;
347c43e99fdSEd Maste default:
348c43e99fdSEd Maste break;
349c43e99fdSEd Maste }
350c43e99fdSEd Maste
351c43e99fdSEd Maste event_warn(PRINT_CHANGES(op, epev.events, ch, "failed"));
352c43e99fdSEd Maste return -1;
353c43e99fdSEd Maste }
354c43e99fdSEd Maste
355c43e99fdSEd Maste static int
epoll_apply_changes(struct event_base * base)356c43e99fdSEd Maste epoll_apply_changes(struct event_base *base)
357c43e99fdSEd Maste {
358c43e99fdSEd Maste struct event_changelist *changelist = &base->changelist;
359c43e99fdSEd Maste struct epollop *epollop = base->evbase;
360c43e99fdSEd Maste struct event_change *ch;
361c43e99fdSEd Maste
362c43e99fdSEd Maste int r = 0;
363c43e99fdSEd Maste int i;
364c43e99fdSEd Maste
365c43e99fdSEd Maste for (i = 0; i < changelist->n_changes; ++i) {
366c43e99fdSEd Maste ch = &changelist->changes[i];
367c43e99fdSEd Maste if (epoll_apply_one_change(base, epollop, ch) < 0)
368c43e99fdSEd Maste r = -1;
369c43e99fdSEd Maste }
370c43e99fdSEd Maste
371c43e99fdSEd Maste return (r);
372c43e99fdSEd Maste }
373c43e99fdSEd Maste
374c43e99fdSEd Maste static int
epoll_nochangelist_add(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)375c43e99fdSEd Maste epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
376c43e99fdSEd Maste short old, short events, void *p)
377c43e99fdSEd Maste {
378c43e99fdSEd Maste struct event_change ch;
379c43e99fdSEd Maste ch.fd = fd;
380c43e99fdSEd Maste ch.old_events = old;
381c43e99fdSEd Maste ch.read_change = ch.write_change = ch.close_change = 0;
382c43e99fdSEd Maste if (events & EV_WRITE)
383c43e99fdSEd Maste ch.write_change = EV_CHANGE_ADD |
384c43e99fdSEd Maste (events & EV_ET);
385c43e99fdSEd Maste if (events & EV_READ)
386c43e99fdSEd Maste ch.read_change = EV_CHANGE_ADD |
387c43e99fdSEd Maste (events & EV_ET);
388c43e99fdSEd Maste if (events & EV_CLOSED)
389c43e99fdSEd Maste ch.close_change = EV_CHANGE_ADD |
390c43e99fdSEd Maste (events & EV_ET);
391c43e99fdSEd Maste
392c43e99fdSEd Maste return epoll_apply_one_change(base, base->evbase, &ch);
393c43e99fdSEd Maste }
394c43e99fdSEd Maste
395c43e99fdSEd Maste static int
epoll_nochangelist_del(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)396c43e99fdSEd Maste epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
397c43e99fdSEd Maste short old, short events, void *p)
398c43e99fdSEd Maste {
399c43e99fdSEd Maste struct event_change ch;
400c43e99fdSEd Maste ch.fd = fd;
401c43e99fdSEd Maste ch.old_events = old;
402c43e99fdSEd Maste ch.read_change = ch.write_change = ch.close_change = 0;
403c43e99fdSEd Maste if (events & EV_WRITE)
404*b50261e2SCy Schubert ch.write_change = EV_CHANGE_DEL |
405*b50261e2SCy Schubert (events & EV_ET);
406c43e99fdSEd Maste if (events & EV_READ)
407*b50261e2SCy Schubert ch.read_change = EV_CHANGE_DEL |
408*b50261e2SCy Schubert (events & EV_ET);
409c43e99fdSEd Maste if (events & EV_CLOSED)
410*b50261e2SCy Schubert ch.close_change = EV_CHANGE_DEL |
411*b50261e2SCy Schubert (events & EV_ET);
412c43e99fdSEd Maste
413c43e99fdSEd Maste return epoll_apply_one_change(base, base->evbase, &ch);
414c43e99fdSEd Maste }
415c43e99fdSEd Maste
416c43e99fdSEd Maste static int
epoll_dispatch(struct event_base * base,struct timeval * tv)417c43e99fdSEd Maste epoll_dispatch(struct event_base *base, struct timeval *tv)
418c43e99fdSEd Maste {
419c43e99fdSEd Maste struct epollop *epollop = base->evbase;
420c43e99fdSEd Maste struct epoll_event *events = epollop->events;
421c43e99fdSEd Maste int i, res;
422c43e99fdSEd Maste long timeout = -1;
423c43e99fdSEd Maste
424c43e99fdSEd Maste #ifdef USING_TIMERFD
425c43e99fdSEd Maste if (epollop->timerfd >= 0) {
426c43e99fdSEd Maste struct itimerspec is;
427c43e99fdSEd Maste is.it_interval.tv_sec = 0;
428c43e99fdSEd Maste is.it_interval.tv_nsec = 0;
429c43e99fdSEd Maste if (tv == NULL) {
430c43e99fdSEd Maste /* No timeout; disarm the timer. */
431c43e99fdSEd Maste is.it_value.tv_sec = 0;
432c43e99fdSEd Maste is.it_value.tv_nsec = 0;
433c43e99fdSEd Maste } else {
434c43e99fdSEd Maste if (tv->tv_sec == 0 && tv->tv_usec == 0) {
435c43e99fdSEd Maste /* we need to exit immediately; timerfd can't
436c43e99fdSEd Maste * do that. */
437c43e99fdSEd Maste timeout = 0;
438c43e99fdSEd Maste }
439c43e99fdSEd Maste is.it_value.tv_sec = tv->tv_sec;
440c43e99fdSEd Maste is.it_value.tv_nsec = tv->tv_usec * 1000;
441c43e99fdSEd Maste }
442c43e99fdSEd Maste /* TODO: we could avoid unnecessary syscalls here by only
443c43e99fdSEd Maste calling timerfd_settime when the top timeout changes, or
444c43e99fdSEd Maste when we're called with a different timeval.
445c43e99fdSEd Maste */
446c43e99fdSEd Maste if (timerfd_settime(epollop->timerfd, 0, &is, NULL) < 0) {
447c43e99fdSEd Maste event_warn("timerfd_settime");
448c43e99fdSEd Maste }
449c43e99fdSEd Maste } else
450c43e99fdSEd Maste #endif
451c43e99fdSEd Maste if (tv != NULL) {
452c43e99fdSEd Maste timeout = evutil_tv_to_msec_(tv);
453c43e99fdSEd Maste if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
454c43e99fdSEd Maste /* Linux kernels can wait forever if the timeout is
455c43e99fdSEd Maste * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
456c43e99fdSEd Maste timeout = MAX_EPOLL_TIMEOUT_MSEC;
457c43e99fdSEd Maste }
458c43e99fdSEd Maste }
459c43e99fdSEd Maste
460c43e99fdSEd Maste epoll_apply_changes(base);
461c43e99fdSEd Maste event_changelist_remove_all_(&base->changelist, base);
462c43e99fdSEd Maste
463c43e99fdSEd Maste EVBASE_RELEASE_LOCK(base, th_base_lock);
464c43e99fdSEd Maste
465c43e99fdSEd Maste res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
466c43e99fdSEd Maste
467c43e99fdSEd Maste EVBASE_ACQUIRE_LOCK(base, th_base_lock);
468c43e99fdSEd Maste
469c43e99fdSEd Maste if (res == -1) {
470c43e99fdSEd Maste if (errno != EINTR) {
471c43e99fdSEd Maste event_warn("epoll_wait");
472c43e99fdSEd Maste return (-1);
473c43e99fdSEd Maste }
474c43e99fdSEd Maste
475c43e99fdSEd Maste return (0);
476c43e99fdSEd Maste }
477c43e99fdSEd Maste
478c43e99fdSEd Maste event_debug(("%s: epoll_wait reports %d", __func__, res));
479c43e99fdSEd Maste EVUTIL_ASSERT(res <= epollop->nevents);
480c43e99fdSEd Maste
481c43e99fdSEd Maste for (i = 0; i < res; i++) {
482c43e99fdSEd Maste int what = events[i].events;
483c43e99fdSEd Maste short ev = 0;
484c43e99fdSEd Maste #ifdef USING_TIMERFD
485c43e99fdSEd Maste if (events[i].data.fd == epollop->timerfd)
486c43e99fdSEd Maste continue;
487c43e99fdSEd Maste #endif
488c43e99fdSEd Maste
489*b50261e2SCy Schubert if (what & EPOLLERR) {
490*b50261e2SCy Schubert ev = EV_READ | EV_WRITE;
491*b50261e2SCy Schubert } else if ((what & EPOLLHUP) && !(what & EPOLLRDHUP)) {
492c43e99fdSEd Maste ev = EV_READ | EV_WRITE;
493c43e99fdSEd Maste } else {
494c43e99fdSEd Maste if (what & EPOLLIN)
495c43e99fdSEd Maste ev |= EV_READ;
496c43e99fdSEd Maste if (what & EPOLLOUT)
497c43e99fdSEd Maste ev |= EV_WRITE;
498c43e99fdSEd Maste if (what & EPOLLRDHUP)
499c43e99fdSEd Maste ev |= EV_CLOSED;
500c43e99fdSEd Maste }
501c43e99fdSEd Maste
502c43e99fdSEd Maste if (!ev)
503c43e99fdSEd Maste continue;
504c43e99fdSEd Maste
505c43e99fdSEd Maste evmap_io_active_(base, events[i].data.fd, ev | EV_ET);
506c43e99fdSEd Maste }
507c43e99fdSEd Maste
508c43e99fdSEd Maste if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
509c43e99fdSEd Maste /* We used all of the event space this time. We should
510c43e99fdSEd Maste be ready for more events next time. */
511c43e99fdSEd Maste int new_nevents = epollop->nevents * 2;
512c43e99fdSEd Maste struct epoll_event *new_events;
513c43e99fdSEd Maste
514c43e99fdSEd Maste new_events = mm_realloc(epollop->events,
515c43e99fdSEd Maste new_nevents * sizeof(struct epoll_event));
516c43e99fdSEd Maste if (new_events) {
517c43e99fdSEd Maste epollop->events = new_events;
518c43e99fdSEd Maste epollop->nevents = new_nevents;
519c43e99fdSEd Maste }
520c43e99fdSEd Maste }
521c43e99fdSEd Maste
522c43e99fdSEd Maste return (0);
523c43e99fdSEd Maste }
524c43e99fdSEd Maste
525c43e99fdSEd Maste
526c43e99fdSEd Maste static void
epoll_dealloc(struct event_base * base)527c43e99fdSEd Maste epoll_dealloc(struct event_base *base)
528c43e99fdSEd Maste {
529c43e99fdSEd Maste struct epollop *epollop = base->evbase;
530c43e99fdSEd Maste
531c43e99fdSEd Maste evsig_dealloc_(base);
532c43e99fdSEd Maste if (epollop->events)
533c43e99fdSEd Maste mm_free(epollop->events);
534c43e99fdSEd Maste if (epollop->epfd >= 0)
535c43e99fdSEd Maste close(epollop->epfd);
536c43e99fdSEd Maste #ifdef USING_TIMERFD
537c43e99fdSEd Maste if (epollop->timerfd >= 0)
538c43e99fdSEd Maste close(epollop->timerfd);
539c43e99fdSEd Maste #endif
540c43e99fdSEd Maste
541c43e99fdSEd Maste memset(epollop, 0, sizeof(struct epollop));
542c43e99fdSEd Maste mm_free(epollop);
543c43e99fdSEd Maste }
544c43e99fdSEd Maste
545c43e99fdSEd Maste #endif /* EVENT__HAVE_EPOLL */
546