1 /* 2 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu> 3 * Copyright 2007-2012 Niels Provos, Nick Mathewson 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. The name of the author may not be used to endorse or promote products 14 * derived from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 #include "event2/event-config.h" 28 #include "evconfig-private.h" 29 30 #ifdef EVENT__HAVE_EPOLL 31 32 #include <stdint.h> 33 #include <sys/types.h> 34 #include <sys/resource.h> 35 #ifdef EVENT__HAVE_SYS_TIME_H 36 #include <sys/time.h> 37 #endif 38 #include <sys/queue.h> 39 #include <sys/epoll.h> 40 #include <signal.h> 41 #include <limits.h> 42 #include <stdio.h> 43 #include <stdlib.h> 44 #include <string.h> 45 #include <unistd.h> 46 #include <errno.h> 47 #ifdef EVENT__HAVE_FCNTL_H 48 #include <fcntl.h> 49 #endif 50 #ifdef EVENT__HAVE_SYS_TIMERFD_H 51 #include <sys/timerfd.h> 52 #endif 53 54 #include "event-internal.h" 55 #include "evsignal-internal.h" 56 #include "event2/thread.h" 57 #include "evthread-internal.h" 58 #include "log-internal.h" 59 #include "evmap-internal.h" 60 #include "changelist-internal.h" 61 #include "time-internal.h" 62 63 /* Since Linux 2.6.17, epoll is able to report about peer half-closed connection 64 using special EPOLLRDHUP flag on a read event. 65 */ 66 #if !defined(EPOLLRDHUP) 67 #define EPOLLRDHUP 0 68 #define EARLY_CLOSE_IF_HAVE_RDHUP 0 69 #else 70 #define EARLY_CLOSE_IF_HAVE_RDHUP EV_FEATURE_EARLY_CLOSE 71 #endif 72 73 #include "epolltable-internal.h" 74 75 #if defined(EVENT__HAVE_SYS_TIMERFD_H) && \ 76 defined(EVENT__HAVE_TIMERFD_CREATE) && \ 77 defined(HAVE_POSIX_MONOTONIC) && defined(TFD_NONBLOCK) && \ 78 defined(TFD_CLOEXEC) 79 /* Note that we only use timerfd if TFD_NONBLOCK and TFD_CLOEXEC are available 80 and working. This means that we can't support it on 2.6.25 (where timerfd 81 was introduced) or 2.6.26, since 2.6.27 introduced those flags. 82 */ 83 #define USING_TIMERFD 84 #endif 85 86 struct epollop { 87 struct epoll_event *events; 88 int nevents; 89 int epfd; 90 #ifdef USING_TIMERFD 91 int timerfd; 92 #endif 93 }; 94 95 static void *epoll_init(struct event_base *); 96 static int epoll_dispatch(struct event_base *, struct timeval *); 97 static void epoll_dealloc(struct event_base *); 98 99 static const struct eventop epollops_changelist = { 100 "epoll (with changelist)", 101 epoll_init, 102 event_changelist_add_, 103 event_changelist_del_, 104 epoll_dispatch, 105 epoll_dealloc, 106 1, /* need reinit */ 107 EV_FEATURE_ET|EV_FEATURE_O1| EARLY_CLOSE_IF_HAVE_RDHUP, 108 EVENT_CHANGELIST_FDINFO_SIZE 109 }; 110 111 112 static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd, 113 short old, short events, void *p); 114 static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd, 115 short old, short events, void *p); 116 117 const struct eventop epollops = { 118 "epoll", 119 epoll_init, 120 epoll_nochangelist_add, 121 epoll_nochangelist_del, 122 epoll_dispatch, 123 epoll_dealloc, 124 1, /* need reinit */ 125 EV_FEATURE_ET|EV_FEATURE_O1|EV_FEATURE_EARLY_CLOSE, 126 0 127 }; 128 129 #define INITIAL_NEVENT 32 130 #define MAX_NEVENT 4096 131 132 /* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout 133 * values bigger than (LONG_MAX - 999ULL)/HZ. HZ in the wild can be 134 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the 135 * largest number of msec we can support here is 2147482. Let's 136 * round that down by 47 seconds. 137 */ 138 #define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000) 139 140 static void * 141 epoll_init(struct event_base *base) 142 { 143 int epfd = -1; 144 struct epollop *epollop; 145 146 #ifdef EVENT__HAVE_EPOLL_CREATE1 147 /* First, try the shiny new epoll_create1 interface, if we have it. */ 148 epfd = epoll_create1(EPOLL_CLOEXEC); 149 #endif 150 if (epfd == -1) { 151 /* Initialize the kernel queue using the old interface. (The 152 size field is ignored since 2.6.8.) */ 153 if ((epfd = epoll_create(32000)) == -1) { 154 if (errno != ENOSYS) 155 event_warn("epoll_create"); 156 return (NULL); 157 } 158 evutil_make_socket_closeonexec(epfd); 159 } 160 161 if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) { 162 close(epfd); 163 return (NULL); 164 } 165 166 epollop->epfd = epfd; 167 168 /* Initialize fields */ 169 epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event)); 170 if (epollop->events == NULL) { 171 mm_free(epollop); 172 close(epfd); 173 return (NULL); 174 } 175 epollop->nevents = INITIAL_NEVENT; 176 177 if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 || 178 ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 && 179 evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL)) { 180 181 base->evsel = &epollops_changelist; 182 } 183 184 #ifdef USING_TIMERFD 185 /* 186 The epoll interface ordinarily gives us one-millisecond precision, 187 so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE 188 timer. But when the user has set the new PRECISE_TIMER flag for an 189 event_base, we can try to use timerfd to give them finer granularity. 190 */ 191 if ((base->flags & EVENT_BASE_FLAG_PRECISE_TIMER) && 192 base->monotonic_timer.monotonic_clock == CLOCK_MONOTONIC) { 193 int fd; 194 fd = epollop->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC); 195 if (epollop->timerfd >= 0) { 196 struct epoll_event epev; 197 memset(&epev, 0, sizeof(epev)); 198 epev.data.fd = epollop->timerfd; 199 epev.events = EPOLLIN; 200 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, fd, &epev) < 0) { 201 event_warn("epoll_ctl(timerfd)"); 202 close(fd); 203 epollop->timerfd = -1; 204 } 205 } else { 206 if (errno != EINVAL && errno != ENOSYS) { 207 /* These errors probably mean that we were 208 * compiled with timerfd/TFD_* support, but 209 * we're running on a kernel that lacks those. 210 */ 211 event_warn("timerfd_create"); 212 } 213 epollop->timerfd = -1; 214 } 215 } else { 216 epollop->timerfd = -1; 217 } 218 #endif 219 220 evsig_init_(base); 221 222 return (epollop); 223 } 224 225 static const char * 226 change_to_string(int change) 227 { 228 change &= (EV_CHANGE_ADD|EV_CHANGE_DEL); 229 if (change == EV_CHANGE_ADD) { 230 return "add"; 231 } else if (change == EV_CHANGE_DEL) { 232 return "del"; 233 } else if (change == 0) { 234 return "none"; 235 } else { 236 return "???"; 237 } 238 } 239 240 static const char * 241 epoll_op_to_string(int op) 242 { 243 return op == EPOLL_CTL_ADD?"ADD": 244 op == EPOLL_CTL_DEL?"DEL": 245 op == EPOLL_CTL_MOD?"MOD": 246 "???"; 247 } 248 249 static int 250 epoll_apply_one_change(struct event_base *base, 251 struct epollop *epollop, 252 const struct event_change *ch) 253 { 254 struct epoll_event epev; 255 int op, events = 0; 256 int idx; 257 258 idx = EPOLL_OP_TABLE_INDEX(ch); 259 op = epoll_op_table[idx].op; 260 events = epoll_op_table[idx].events; 261 262 if (!events) { 263 EVUTIL_ASSERT(op == 0); 264 return 0; 265 } 266 267 if ((ch->read_change|ch->write_change) & EV_CHANGE_ET) 268 events |= EPOLLET; 269 270 memset(&epev, 0, sizeof(epev)); 271 epev.data.fd = ch->fd; 272 epev.events = events; 273 if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == 0) { 274 event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d; close change was %d]", 275 epoll_op_to_string(op), 276 (int)epev.events, 277 (int)ch->fd, 278 ch->old_events, 279 ch->read_change, 280 ch->write_change, 281 ch->close_change)); 282 return 0; 283 } 284 285 switch (op) { 286 case EPOLL_CTL_MOD: 287 if (errno == ENOENT) { 288 /* If a MOD operation fails with ENOENT, the 289 * fd was probably closed and re-opened. We 290 * should retry the operation as an ADD. 291 */ 292 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) { 293 event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too", 294 (int)epev.events, ch->fd); 295 return -1; 296 } else { 297 event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.", 298 (int)epev.events, 299 ch->fd)); 300 return 0; 301 } 302 } 303 break; 304 case EPOLL_CTL_ADD: 305 if (errno == EEXIST) { 306 /* If an ADD operation fails with EEXIST, 307 * either the operation was redundant (as with a 308 * precautionary add), or we ran into a fun 309 * kernel bug where using dup*() to duplicate the 310 * same file into the same fd gives you the same epitem 311 * rather than a fresh one. For the second case, 312 * we must retry with MOD. */ 313 if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) { 314 event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too", 315 (int)epev.events, ch->fd); 316 return -1; 317 } else { 318 event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.", 319 (int)epev.events, 320 ch->fd)); 321 return 0; 322 } 323 } 324 break; 325 case EPOLL_CTL_DEL: 326 if (errno == ENOENT || errno == EBADF || errno == EPERM) { 327 /* If a delete fails with one of these errors, 328 * that's fine too: we closed the fd before we 329 * got around to calling epoll_dispatch. */ 330 event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.", 331 (int)epev.events, 332 ch->fd, 333 strerror(errno))); 334 return 0; 335 } 336 break; 337 default: 338 break; 339 } 340 341 event_warn("Epoll %s(%d) on fd %d failed. Old events were %d; read change was %d (%s); write change was %d (%s); close change was %d (%s)", 342 epoll_op_to_string(op), 343 (int)epev.events, 344 ch->fd, 345 ch->old_events, 346 ch->read_change, 347 change_to_string(ch->read_change), 348 ch->write_change, 349 change_to_string(ch->write_change), 350 ch->close_change, 351 change_to_string(ch->close_change)); 352 353 return -1; 354 } 355 356 static int 357 epoll_apply_changes(struct event_base *base) 358 { 359 struct event_changelist *changelist = &base->changelist; 360 struct epollop *epollop = base->evbase; 361 struct event_change *ch; 362 363 int r = 0; 364 int i; 365 366 for (i = 0; i < changelist->n_changes; ++i) { 367 ch = &changelist->changes[i]; 368 if (epoll_apply_one_change(base, epollop, ch) < 0) 369 r = -1; 370 } 371 372 return (r); 373 } 374 375 static int 376 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd, 377 short old, short events, void *p) 378 { 379 struct event_change ch; 380 ch.fd = fd; 381 ch.old_events = old; 382 ch.read_change = ch.write_change = ch.close_change = 0; 383 if (events & EV_WRITE) 384 ch.write_change = EV_CHANGE_ADD | 385 (events & EV_ET); 386 if (events & EV_READ) 387 ch.read_change = EV_CHANGE_ADD | 388 (events & EV_ET); 389 if (events & EV_CLOSED) 390 ch.close_change = EV_CHANGE_ADD | 391 (events & EV_ET); 392 393 return epoll_apply_one_change(base, base->evbase, &ch); 394 } 395 396 static int 397 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd, 398 short old, short events, void *p) 399 { 400 struct event_change ch; 401 ch.fd = fd; 402 ch.old_events = old; 403 ch.read_change = ch.write_change = ch.close_change = 0; 404 if (events & EV_WRITE) 405 ch.write_change = EV_CHANGE_DEL; 406 if (events & EV_READ) 407 ch.read_change = EV_CHANGE_DEL; 408 if (events & EV_CLOSED) 409 ch.close_change = EV_CHANGE_DEL; 410 411 return epoll_apply_one_change(base, base->evbase, &ch); 412 } 413 414 static int 415 epoll_dispatch(struct event_base *base, struct timeval *tv) 416 { 417 struct epollop *epollop = base->evbase; 418 struct epoll_event *events = epollop->events; 419 int i, res; 420 long timeout = -1; 421 422 #ifdef USING_TIMERFD 423 if (epollop->timerfd >= 0) { 424 struct itimerspec is; 425 is.it_interval.tv_sec = 0; 426 is.it_interval.tv_nsec = 0; 427 if (tv == NULL) { 428 /* No timeout; disarm the timer. */ 429 is.it_value.tv_sec = 0; 430 is.it_value.tv_nsec = 0; 431 } else { 432 if (tv->tv_sec == 0 && tv->tv_usec == 0) { 433 /* we need to exit immediately; timerfd can't 434 * do that. */ 435 timeout = 0; 436 } 437 is.it_value.tv_sec = tv->tv_sec; 438 is.it_value.tv_nsec = tv->tv_usec * 1000; 439 } 440 /* TODO: we could avoid unnecessary syscalls here by only 441 calling timerfd_settime when the top timeout changes, or 442 when we're called with a different timeval. 443 */ 444 if (timerfd_settime(epollop->timerfd, 0, &is, NULL) < 0) { 445 event_warn("timerfd_settime"); 446 } 447 } else 448 #endif 449 if (tv != NULL) { 450 timeout = evutil_tv_to_msec_(tv); 451 if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) { 452 /* Linux kernels can wait forever if the timeout is 453 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */ 454 timeout = MAX_EPOLL_TIMEOUT_MSEC; 455 } 456 } 457 458 epoll_apply_changes(base); 459 event_changelist_remove_all_(&base->changelist, base); 460 461 EVBASE_RELEASE_LOCK(base, th_base_lock); 462 463 res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout); 464 465 EVBASE_ACQUIRE_LOCK(base, th_base_lock); 466 467 if (res == -1) { 468 if (errno != EINTR) { 469 event_warn("epoll_wait"); 470 return (-1); 471 } 472 473 return (0); 474 } 475 476 event_debug(("%s: epoll_wait reports %d", __func__, res)); 477 EVUTIL_ASSERT(res <= epollop->nevents); 478 479 for (i = 0; i < res; i++) { 480 int what = events[i].events; 481 short ev = 0; 482 #ifdef USING_TIMERFD 483 if (events[i].data.fd == epollop->timerfd) 484 continue; 485 #endif 486 487 if (what & (EPOLLHUP|EPOLLERR)) { 488 ev = EV_READ | EV_WRITE; 489 } else { 490 if (what & EPOLLIN) 491 ev |= EV_READ; 492 if (what & EPOLLOUT) 493 ev |= EV_WRITE; 494 if (what & EPOLLRDHUP) 495 ev |= EV_CLOSED; 496 } 497 498 if (!ev) 499 continue; 500 501 evmap_io_active_(base, events[i].data.fd, ev | EV_ET); 502 } 503 504 if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) { 505 /* We used all of the event space this time. We should 506 be ready for more events next time. */ 507 int new_nevents = epollop->nevents * 2; 508 struct epoll_event *new_events; 509 510 new_events = mm_realloc(epollop->events, 511 new_nevents * sizeof(struct epoll_event)); 512 if (new_events) { 513 epollop->events = new_events; 514 epollop->nevents = new_nevents; 515 } 516 } 517 518 return (0); 519 } 520 521 522 static void 523 epoll_dealloc(struct event_base *base) 524 { 525 struct epollop *epollop = base->evbase; 526 527 evsig_dealloc_(base); 528 if (epollop->events) 529 mm_free(epollop->events); 530 if (epollop->epfd >= 0) 531 close(epollop->epfd); 532 #ifdef USING_TIMERFD 533 if (epollop->timerfd >= 0) 534 close(epollop->timerfd); 535 #endif 536 537 memset(epollop, 0, sizeof(struct epollop)); 538 mm_free(epollop); 539 } 540 541 #endif /* EVENT__HAVE_EPOLL */ 542