/*
 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
 * Copyright 2007-2012 Niels Provos, Nick Mathewson
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "event2/event-config.h"
#include "evconfig-private.h"

#ifdef EVENT__HAVE_EPOLL

#include <stdint.h>
#include <sys/types.h>
#include <sys/resource.h>
#ifdef EVENT__HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include <sys/queue.h>
#include <sys/epoll.h>
#include <signal.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#ifdef EVENT__HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef EVENT__HAVE_SYS_TIMERFD_H
#include <sys/timerfd.h>
#endif

#include "event-internal.h"
#include "evsignal-internal.h"
#include "event2/thread.h"
#include "evthread-internal.h"
#include "log-internal.h"
#include "evmap-internal.h"
#include "changelist-internal.h"
#include "time-internal.h"

/* Since Linux 2.6.17, epoll has been able to report a peer's half-closed
   connection using the special EPOLLRDHUP flag on a read event.
*/
#if !defined(EPOLLRDHUP)
#define EPOLLRDHUP 0
#define EARLY_CLOSE_IF_HAVE_RDHUP 0
#else
#define EARLY_CLOSE_IF_HAVE_RDHUP EV_FEATURE_EARLY_CLOSE
#endif

#include "epolltable-internal.h"

#if defined(EVENT__HAVE_SYS_TIMERFD_H) &&			  \
	defined(EVENT__HAVE_TIMERFD_CREATE) &&			  \
	defined(HAVE_POSIX_MONOTONIC) && defined(TFD_NONBLOCK) && \
	defined(TFD_CLOEXEC)
/* Note that we only use timerfd if TFD_NONBLOCK and TFD_CLOEXEC are available
   and working.  This means that we can't support it on 2.6.25 (where timerfd
   was introduced) or 2.6.26, since 2.6.27 introduced those flags.
 */
#define USING_TIMERFD
#endif
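
/* Per-event_base state for the epoll backend: `events` is the array that
 * epoll_wait() fills in, `nevents` is its capacity, and `epfd` is the epoll
 * instance itself.  When USING_TIMERFD is defined, `timerfd` holds the fd of
 * the high-resolution timer, or -1 if none is in use. */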
struct epollop {
	struct epoll_event *events;
	int nevents;
	int epfd;
#ifdef USING_TIMERFD
	int timerfd;
#endif
};

static void *epoll_init(struct event_base *);
static int epoll_dispatch(struct event_base *, struct timeval *);
static void epoll_dealloc(struct event_base *);

static const struct eventop epollops_changelist = {
	"epoll (with changelist)",
	epoll_init,
	event_changelist_add_,
	event_changelist_del_,
	epoll_dispatch,
	epoll_dealloc,
	1, /* need reinit */
	EV_FEATURE_ET|EV_FEATURE_O1|EARLY_CLOSE_IF_HAVE_RDHUP,
	EVENT_CHANGELIST_FDINFO_SIZE
};


static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);
static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);

const struct eventop epollops = {
	"epoll",
	epoll_init,
	epoll_nochangelist_add,
	epoll_nochangelist_del,
	epoll_dispatch,
	epoll_dealloc,
	1, /* need reinit */
	EV_FEATURE_ET|EV_FEATURE_O1|EARLY_CLOSE_IF_HAVE_RDHUP,
	0
};

#define INITIAL_NEVENT 32
#define MAX_NEVENT 4096

/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482.  Let's
 * round that down by 47 seconds.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
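/* That is, 35*60*1000 == 2,100,000 msec: roughly 47 seconds (47,482 msec)
 * below the 2,147,482-msec ceiling computed above. */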

static void *
epoll_init(struct event_base *base)
{
	int epfd = -1;
	struct epollop *epollop;

#ifdef EVENT__HAVE_EPOLL_CREATE1
	/* First, try the shiny new epoll_create1 interface, if we have it. */
	epfd = epoll_create1(EPOLL_CLOEXEC);
#endif
	if (epfd == -1) {
		/* Initialize the kernel queue using the old interface.  (The
		   size field is ignored since 2.6.8.) */
		if ((epfd = epoll_create(32000)) == -1) {
			if (errno != ENOSYS)
				event_warn("epoll_create");
			return (NULL);
		}
		evutil_make_socket_closeonexec(epfd);
	}

	if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
		close(epfd);
		return (NULL);
	}

	epollop->epfd = epfd;

	/* Initialize fields */
	epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
	if (epollop->events == NULL) {
		mm_free(epollop);
		close(epfd);
		return (NULL);
	}
	epollop->nevents = INITIAL_NEVENT;

	if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
	    ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
		evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL)) {

		base->evsel = &epollops_changelist;
	}

#ifdef USING_TIMERFD
	/*
	  The epoll interface ordinarily gives us one-millisecond precision,
	  so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE
	  timer.  But when the user has set the new PRECISE_TIMER flag for an
	  event_base, we can try to use timerfd to give them finer granularity.
	*/
	if ((base->flags & EVENT_BASE_FLAG_PRECISE_TIMER) &&
	    base->monotonic_timer.monotonic_clock == CLOCK_MONOTONIC) {
		int fd;
		fd = epollop->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
		if (epollop->timerfd >= 0) {
			struct epoll_event epev;
			memset(&epev, 0, sizeof(epev));
			epev.data.fd = epollop->timerfd;
			epev.events = EPOLLIN;
			if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, fd, &epev) < 0) {
				event_warn("epoll_ctl(timerfd)");
				close(fd);
				epollop->timerfd = -1;
			}
		} else {
			if (errno != EINVAL && errno != ENOSYS) {
				/* These errors probably mean that we were
				 * compiled with timerfd/TFD_* support, but
				 * we're running on a kernel that lacks those.
				 */
				event_warn("timerfd_create");
			}
			epollop->timerfd = -1;
		}
	} else {
		epollop->timerfd = -1;
	}
#endif

	evsig_init_(base);

	return (epollop);
}

static const char *
change_to_string(int change)
{
	change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
	if (change == EV_CHANGE_ADD) {
		return "add";
	} else if (change == EV_CHANGE_DEL) {
		return "del";
	} else if (change == 0) {
		return "none";
	} else {
		return "???";
	}
}

static const char *
epoll_op_to_string(int op)
{
	return op == EPOLL_CTL_ADD?"ADD":
	    op == EPOLL_CTL_DEL?"DEL":
	    op == EPOLL_CTL_MOD?"MOD":
	    "???";
}

#define PRINT_CHANGES(op, events, ch, status)  \
	"Epoll %s(%d) on fd %d " status ". "   \
	"Old events were %d; "                 \
	"read change was %d (%s); "            \
	"write change was %d (%s); "           \
	"close change was %d (%s)",            \
	epoll_op_to_string(op),                \
	events,                                \
	ch->fd,                                \
	ch->old_events,                        \
	ch->read_change,                       \
	change_to_string(ch->read_change),     \
	ch->write_change,                      \
	change_to_string(ch->write_change),    \
	ch->close_change,                      \
	change_to_string(ch->close_change)

static int
epoll_apply_one_change(struct event_base *base,
    struct epollop *epollop,
    const struct event_change *ch)
{
	struct epoll_event epev;
	int op, events = 0;
	int idx;

	idx = EPOLL_OP_TABLE_INDEX(ch);
	op = epoll_op_table[idx].op;
	events = epoll_op_table[idx].events;

	if (!events) {
		EVUTIL_ASSERT(op == 0);
		return 0;
	}

	if ((ch->read_change|ch->write_change) & EV_CHANGE_ET)
		events |= EPOLLET;

	memset(&epev, 0, sizeof(epev));
	epev.data.fd = ch->fd;
	epev.events = events;
	if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == 0) {
		event_debug((PRINT_CHANGES(op, epev.events, ch, "okay")));
		return 0;
	}
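
	/* The epoll_ctl() call failed.  Some failures are expected and
	 * recoverable; sort them out by the operation we attempted. */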
	switch (op) {
	case EPOLL_CTL_MOD:
		if (errno == ENOENT) {
			/* If a MOD operation fails with ENOENT, the
			 * fd was probably closed and re-opened.  We
			 * should retry the operation as an ADD.
			 */
			if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
				event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
				    (int)epev.events, ch->fd);
				return -1;
			} else {
				event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
					(int)epev.events,
					ch->fd));
				return 0;
			}
		}
		break;
	case EPOLL_CTL_ADD:
		if (errno == EEXIST) {
			/* If an ADD operation fails with EEXIST,
			 * either the operation was redundant (as with a
			 * precautionary add), or we ran into a fun
			 * kernel bug where using dup*() to duplicate the
			 * same file into the same fd gives you the same epitem
			 * rather than a fresh one.  For the second case,
			 * we must retry with MOD. */
			if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
				event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
				    (int)epev.events, ch->fd);
				return -1;
			} else {
				event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
					(int)epev.events,
					ch->fd));
				return 0;
			}
		}
		break;
	case EPOLL_CTL_DEL:
		if (errno == ENOENT || errno == EBADF || errno == EPERM) {
			/* If a delete fails with one of these errors,
			 * that's fine too: we closed the fd before we
			 * got around to calling epoll_dispatch. */
			event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
				(int)epev.events,
				ch->fd,
				strerror(errno)));
			return 0;
		}
		break;
	default:
		break;
	}

	event_warn(PRINT_CHANGES(op, epev.events, ch, "failed"));
	return -1;
}

static int
epoll_apply_changes(struct event_base *base)
{
	struct event_changelist *changelist = &base->changelist;
	struct epollop *epollop = base->evbase;
	struct event_change *ch;

	int r = 0;
	int i;

	for (i = 0; i < changelist->n_changes; ++i) {
		ch = &changelist->changes[i];
		if (epoll_apply_one_change(base, epollop, ch) < 0)
			r = -1;
	}

	return (r);
}

static int
epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p)
{
	struct event_change ch;
	ch.fd = fd;
	ch.old_events = old;
	ch.read_change = ch.write_change = ch.close_change = 0;
	if (events & EV_WRITE)
		ch.write_change = EV_CHANGE_ADD |
		    (events & EV_ET);
	if (events & EV_READ)
		ch.read_change = EV_CHANGE_ADD |
		    (events & EV_ET);
	if (events & EV_CLOSED)
		ch.close_change = EV_CHANGE_ADD |
		    (events & EV_ET);

	return epoll_apply_one_change(base, base->evbase, &ch);
}

static int
epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p)
{
	struct event_change ch;
	ch.fd = fd;
	ch.old_events = old;
	ch.read_change = ch.write_change = ch.close_change = 0;
	if (events & EV_WRITE)
		ch.write_change = EV_CHANGE_DEL;
	if (events & EV_READ)
		ch.read_change = EV_CHANGE_DEL;
	if (events & EV_CLOSED)
		ch.close_change = EV_CHANGE_DEL;

	return epoll_apply_one_change(base, base->evbase, &ch);
}

static int
epoll_dispatch(struct event_base *base, struct timeval *tv)
{
	struct epollop *epollop = base->evbase;
	struct epoll_event *events = epollop->events;
	int i, res;
	long timeout = -1;
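
	/* Turn the requested wait into either an armed timerfd (for finer
	 * than millisecond precision) or a plain millisecond timeout for
	 * the epoll_wait() call below. */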
#ifdef USING_TIMERFD
	if (epollop->timerfd >= 0) {
		struct itimerspec is;
		is.it_interval.tv_sec = 0;
		is.it_interval.tv_nsec = 0;
		if (tv == NULL) {
			/* No timeout; disarm the timer. */
			is.it_value.tv_sec = 0;
			is.it_value.tv_nsec = 0;
		} else {
			if (tv->tv_sec == 0 && tv->tv_usec == 0) {
				/* we need to exit immediately; timerfd can't
				 * do that. */
				timeout = 0;
			}
			is.it_value.tv_sec = tv->tv_sec;
			is.it_value.tv_nsec = tv->tv_usec * 1000;
		}
		/* TODO: we could avoid unnecessary syscalls here by only
		   calling timerfd_settime when the top timeout changes, or
		   when we're called with a different timeval.
		*/
		if (timerfd_settime(epollop->timerfd, 0, &is, NULL) < 0) {
			event_warn("timerfd_settime");
		}
	} else
#endif
	if (tv != NULL) {
		timeout = evutil_tv_to_msec_(tv);
		if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
			/* Linux kernels can wait forever if the timeout is
			 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
			timeout = MAX_EPOLL_TIMEOUT_MSEC;
		}
	}

	epoll_apply_changes(base);
	event_changelist_remove_all_(&base->changelist, base);

	EVBASE_RELEASE_LOCK(base, th_base_lock);

	res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);

	EVBASE_ACQUIRE_LOCK(base, th_base_lock);

	if (res == -1) {
		if (errno != EINTR) {
			event_warn("epoll_wait");
			return (-1);
		}

		return (0);
	}

	event_debug(("%s: epoll_wait reports %d", __func__, res));
	EVUTIL_ASSERT(res <= epollop->nevents);

	for (i = 0; i < res; i++) {
		int what = events[i].events;
		short ev = 0;
#ifdef USING_TIMERFD
		if (events[i].data.fd == epollop->timerfd)
			continue;
#endif

		if (what & (EPOLLHUP|EPOLLERR)) {
			ev = EV_READ | EV_WRITE;
		} else {
			if (what & EPOLLIN)
				ev |= EV_READ;
			if (what & EPOLLOUT)
				ev |= EV_WRITE;
			if (what & EPOLLRDHUP)
				ev |= EV_CLOSED;
		}

		if (!ev)
			continue;

		evmap_io_active_(base, events[i].data.fd, ev | EV_ET);
	}

	if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
		/* We used all of the event space this time.  We should
		   be ready for more events next time. */
		int new_nevents = epollop->nevents * 2;
		struct epoll_event *new_events;

		new_events = mm_realloc(epollop->events,
		    new_nevents * sizeof(struct epoll_event));
		if (new_events) {
			epollop->events = new_events;
			epollop->nevents = new_nevents;
		}
	}

	return (0);
}


static void
epoll_dealloc(struct event_base *base)
{
	struct epollop *epollop = base->evbase;

	evsig_dealloc_(base);
	if (epollop->events)
		mm_free(epollop->events);
	if (epollop->epfd >= 0)
		close(epollop->epfd);
#ifdef USING_TIMERFD
	if (epollop->timerfd >= 0)
		close(epollop->timerfd);
#endif

	memset(epollop, 0, sizeof(struct epollop));
	mm_free(epollop);
}

#endif /* EVENT__HAVE_EPOLL */
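
/* Illustrative sketch, not part of the original file: one way an application
 * might opt into the precise-timer path that the timerfd code above
 * implements.  Uses only public libevent 2.1 API.
 *
 *	#include <event2/event.h>
 *
 *	struct event_config *cfg = event_config_new();
 *	event_config_set_flag(cfg, EVENT_BASE_FLAG_PRECISE_TIMER);
 *	struct event_base *base = event_base_new_with_config(cfg);
 *	event_config_free(cfg);
 *	// ... add events, run event_base_dispatch(base) ...
 *	event_base_free(base);
 */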