1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 31 /* 32 * Copyright 2018 Joyent, Inc. 33 */ 34 35 /* 36 * Micro event library for FreeBSD, designed for a single i/o thread 37 * using kqueue, and having events be persistent by default. 38 */ 39 40 #include <sys/cdefs.h> 41 __FBSDID("$FreeBSD$"); 42 43 #include <assert.h> 44 #ifndef WITHOUT_CAPSICUM 45 #include <capsicum_helpers.h> 46 #endif 47 #include <err.h> 48 #include <errno.h> 49 #include <stdbool.h> 50 #include <stdlib.h> 51 #include <stdio.h> 52 #include <string.h> 53 #include <sysexits.h> 54 #include <unistd.h> 55 56 #include <sys/types.h> 57 #ifndef WITHOUT_CAPSICUM 58 #include <sys/capsicum.h> 59 #endif 60 #ifdef __FreeBSD__ 61 #include <sys/event.h> 62 #else 63 #include <port.h> 64 #include <sys/poll.h> 65 #include <sys/siginfo.h> 66 #include <sys/queue.h> 67 #include <sys/debug.h> 68 #endif 69 #include <sys/time.h> 70 71 #include <pthread.h> 72 #include <pthread_np.h> 73 74 #include "mevent.h" 75 76 #define MEVENT_MAX 64 77 78 #ifndef __FreeBSD__ 79 #define EV_ENABLE 0x01 80 #define EV_ADD EV_ENABLE 81 #define EV_DISABLE 0x02 82 #define EV_DELETE 0x04 83 #endif 84 85 extern char *vmname; 86 87 static pthread_t mevent_tid; 88 static int mevent_timid = 43; 89 static int mevent_pipefd[2]; 90 static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; 91 92 struct mevent { 93 void (*me_func)(int, enum ev_type, void *); 94 #define me_msecs me_fd 95 int me_fd; 96 #ifdef __FreeBSD__ 97 int me_timid; 98 #else 99 timer_t me_timid; 100 #endif 101 enum ev_type me_type; 102 void *me_param; 103 int me_cq; 104 int me_state; /* Desired kevent flags. */ 105 int me_closefd; 106 #ifndef __FreeBSD__ 107 port_notify_t me_notify; 108 struct sigevent me_sigev; 109 boolean_t me_auto_requeue; 110 #endif 111 LIST_ENTRY(mevent) me_list; 112 }; 113 114 static LIST_HEAD(listhead, mevent) global_head, change_head; 115 116 static void 117 mevent_qlock(void) 118 { 119 pthread_mutex_lock(&mevent_lmutex); 120 } 121 122 static void 123 mevent_qunlock(void) 124 { 125 pthread_mutex_unlock(&mevent_lmutex); 126 } 127 128 static void 129 mevent_pipe_read(int fd, enum ev_type type, void *param) 130 { 131 char buf[MEVENT_MAX]; 132 int status; 133 134 /* 135 * Drain the pipe read side. The fd is non-blocking so this is 136 * safe to do. 137 */ 138 do { 139 status = read(fd, buf, sizeof(buf)); 140 } while (status == MEVENT_MAX); 141 } 142 143 static void 144 mevent_notify(void) 145 { 146 char c = '\0'; 147 148 /* 149 * If calling from outside the i/o thread, write a byte on the 150 * pipe to force the i/o thread to exit the blocking kevent call. 151 */ 152 if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) { 153 write(mevent_pipefd[1], &c, 1); 154 } 155 } 156 #ifdef __FreeBSD__ 157 static int 158 mevent_kq_filter(struct mevent *mevp) 159 { 160 int retval; 161 162 retval = 0; 163 164 if (mevp->me_type == EVF_READ) 165 retval = EVFILT_READ; 166 167 if (mevp->me_type == EVF_WRITE) 168 retval = EVFILT_WRITE; 169 170 if (mevp->me_type == EVF_TIMER) 171 retval = EVFILT_TIMER; 172 173 if (mevp->me_type == EVF_SIGNAL) 174 retval = EVFILT_SIGNAL; 175 176 return (retval); 177 } 178 179 static int 180 mevent_kq_flags(struct mevent *mevp) 181 { 182 return (mevp->me_state); 183 } 184 185 static int 186 mevent_kq_fflags(struct mevent *mevp) 187 { 188 /* XXX nothing yet, perhaps EV_EOF for reads ? */ 189 return (0); 190 } 191 192 static int 193 mevent_build(int mfd, struct kevent *kev) 194 { 195 struct mevent *mevp, *tmpp; 196 int i; 197 198 i = 0; 199 200 mevent_qlock(); 201 202 LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { 203 if (mevp->me_closefd) { 204 /* 205 * A close of the file descriptor will remove the 206 * event 207 */ 208 close(mevp->me_fd); 209 } else { 210 if (mevp->me_type == EVF_TIMER) { 211 kev[i].ident = mevp->me_timid; 212 kev[i].data = mevp->me_msecs; 213 } else { 214 kev[i].ident = mevp->me_fd; 215 kev[i].data = 0; 216 } 217 kev[i].filter = mevent_kq_filter(mevp); 218 kev[i].flags = mevent_kq_flags(mevp); 219 kev[i].fflags = mevent_kq_fflags(mevp); 220 kev[i].udata = mevp; 221 i++; 222 } 223 224 mevp->me_cq = 0; 225 LIST_REMOVE(mevp, me_list); 226 227 if (mevp->me_state & EV_DELETE) { 228 free(mevp); 229 } else { 230 /* 231 * We need to add the event only once, so we can 232 * reset the EV_ADD bit after it has been propagated 233 * to the kevent() arguments the first time. 234 */ 235 mevp->me_state &= ~EV_ADD; 236 LIST_INSERT_HEAD(&global_head, mevp, me_list); 237 } 238 239 assert(i < MEVENT_MAX); 240 } 241 242 mevent_qunlock(); 243 244 return (i); 245 } 246 247 static void 248 mevent_handle(struct kevent *kev, int numev) 249 { 250 struct mevent *mevp; 251 int i; 252 253 for (i = 0; i < numev; i++) { 254 mevp = kev[i].udata; 255 256 /* XXX check for EV_ERROR ? */ 257 258 (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); 259 } 260 } 261 262 #else /* __FreeBSD__ */ 263 264 static boolean_t 265 mevent_clarify_state(struct mevent *mevp) 266 { 267 const int state = mevp->me_state; 268 269 if ((state & EV_DELETE) != 0) { 270 /* All other intents are overriden by delete. */ 271 mevp->me_state = EV_DELETE; 272 return (B_TRUE); 273 } 274 275 /* 276 * Without a distinction between EV_ADD and EV_ENABLE in our emulation, 277 * handling the add-disabled case means eliding the portfs operation 278 * when both flags are present. 279 * 280 * This is not a concern for subsequent enable/disable operations, as 281 * mevent_update() toggles the flags properly so they are not left in 282 * conflict. 283 */ 284 if (state == (EV_ENABLE|EV_DISABLE)) { 285 mevp->me_state = EV_DISABLE; 286 return (B_FALSE); 287 } 288 289 return (B_TRUE); 290 } 291 292 static void 293 mevent_update_one(struct mevent *mevp) 294 { 295 int portfd = mevp->me_notify.portnfy_port; 296 297 switch (mevp->me_type) { 298 case EVF_READ: 299 case EVF_WRITE: 300 mevp->me_auto_requeue = B_FALSE; 301 302 switch (mevp->me_state) { 303 case EV_ENABLE: 304 { 305 int events; 306 307 events = (mevp->me_type == EVF_READ) ? POLLIN : POLLOUT; 308 309 if (port_associate(portfd, PORT_SOURCE_FD, mevp->me_fd, 310 events, mevp) != 0) { 311 (void) fprintf(stderr, 312 "port_associate fd %d %p failed: %s\n", 313 mevp->me_fd, mevp, strerror(errno)); 314 } 315 return; 316 } 317 case EV_DISABLE: 318 case EV_DELETE: 319 /* 320 * A disable that comes in while an event is being 321 * handled will result in an ENOENT. 322 */ 323 if (port_dissociate(portfd, PORT_SOURCE_FD, 324 mevp->me_fd) != 0 && errno != ENOENT) { 325 (void) fprintf(stderr, "port_dissociate " 326 "portfd %d fd %d mevp %p failed: %s\n", 327 portfd, mevp->me_fd, mevp, strerror(errno)); 328 } 329 return; 330 default: 331 goto abort; 332 } 333 334 case EVF_TIMER: 335 mevp->me_auto_requeue = B_TRUE; 336 337 switch (mevp->me_state) { 338 case EV_ENABLE: 339 { 340 struct itimerspec it = { 0 }; 341 342 mevp->me_sigev.sigev_notify = SIGEV_PORT; 343 mevp->me_sigev.sigev_value.sival_ptr = &mevp->me_notify; 344 345 if (timer_create(CLOCK_REALTIME, &mevp->me_sigev, 346 &mevp->me_timid) != 0) { 347 (void) fprintf(stderr, 348 "timer_create failed: %s", strerror(errno)); 349 return; 350 } 351 352 /* The first timeout */ 353 it.it_value.tv_sec = mevp->me_msecs / MILLISEC; 354 it.it_value.tv_nsec = 355 MSEC2NSEC(mevp->me_msecs % MILLISEC); 356 /* Repeat at the same interval */ 357 it.it_interval = it.it_value; 358 359 if (timer_settime(mevp->me_timid, 0, &it, NULL) != 0) { 360 (void) fprintf(stderr, "timer_settime failed: " 361 "%s", strerror(errno)); 362 } 363 return; 364 } 365 case EV_DISABLE: 366 case EV_DELETE: 367 if (timer_delete(mevp->me_timid) != 0) { 368 (void) fprintf(stderr, "timer_delete failed: " 369 "%s", strerror(errno)); 370 } 371 return; 372 default: 373 goto abort; 374 } 375 default: 376 /* EVF_SIGNAL not yet implemented. */ 377 goto abort; 378 } 379 380 abort: 381 (void) fprintf(stderr, "%s: unhandled type %d state %d\n", __func__, 382 mevp->me_type, mevp->me_state); 383 abort(); 384 } 385 386 static void 387 mevent_update_pending(int portfd) 388 { 389 struct mevent *mevp, *tmpp; 390 391 mevent_qlock(); 392 393 LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { 394 mevp->me_notify.portnfy_port = portfd; 395 mevp->me_notify.portnfy_user = mevp; 396 if (mevp->me_closefd) { 397 /* 398 * A close of the file descriptor will remove the 399 * event 400 */ 401 (void) close(mevp->me_fd); 402 mevp->me_fd = -1; 403 } else { 404 if (mevent_clarify_state(mevp)) { 405 mevent_update_one(mevp); 406 } 407 } 408 409 mevp->me_cq = 0; 410 LIST_REMOVE(mevp, me_list); 411 412 if (mevp->me_state & EV_DELETE) { 413 free(mevp); 414 } else { 415 LIST_INSERT_HEAD(&global_head, mevp, me_list); 416 } 417 } 418 419 mevent_qunlock(); 420 } 421 422 static void 423 mevent_handle_pe(port_event_t *pe) 424 { 425 struct mevent *mevp = pe->portev_user; 426 427 mevent_qunlock(); 428 429 (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); 430 431 mevent_qlock(); 432 if (!mevp->me_cq && !mevp->me_auto_requeue) { 433 mevent_update_one(mevp); 434 } 435 mevent_qunlock(); 436 } 437 #endif 438 439 static struct mevent * 440 mevent_add_state(int tfd, enum ev_type type, 441 void (*func)(int, enum ev_type, void *), void *param, 442 int state) 443 { 444 struct mevent *lp, *mevp; 445 446 if (tfd < 0 || func == NULL) { 447 return (NULL); 448 } 449 450 mevp = NULL; 451 452 mevent_qlock(); 453 454 /* 455 * Verify that the fd/type tuple is not present in any list 456 */ 457 LIST_FOREACH(lp, &global_head, me_list) { 458 if (type != EVF_TIMER && lp->me_fd == tfd && 459 lp->me_type == type) { 460 goto exit; 461 } 462 } 463 464 LIST_FOREACH(lp, &change_head, me_list) { 465 if (type != EVF_TIMER && lp->me_fd == tfd && 466 lp->me_type == type) { 467 goto exit; 468 } 469 } 470 471 /* 472 * Allocate an entry, populate it, and add it to the change list. 473 */ 474 mevp = calloc(1, sizeof(struct mevent)); 475 if (mevp == NULL) { 476 goto exit; 477 } 478 479 if (type == EVF_TIMER) { 480 mevp->me_msecs = tfd; 481 mevp->me_timid = mevent_timid++; 482 } else 483 mevp->me_fd = tfd; 484 mevp->me_type = type; 485 mevp->me_func = func; 486 mevp->me_param = param; 487 488 LIST_INSERT_HEAD(&change_head, mevp, me_list); 489 mevp->me_cq = 1; 490 mevp->me_state = state; 491 mevent_notify(); 492 493 exit: 494 mevent_qunlock(); 495 496 return (mevp); 497 } 498 499 struct mevent * 500 mevent_add(int tfd, enum ev_type type, 501 void (*func)(int, enum ev_type, void *), void *param) 502 { 503 504 return (mevent_add_state(tfd, type, func, param, EV_ADD)); 505 } 506 507 struct mevent * 508 mevent_add_disabled(int tfd, enum ev_type type, 509 void (*func)(int, enum ev_type, void *), void *param) 510 { 511 512 return (mevent_add_state(tfd, type, func, param, EV_ADD | EV_DISABLE)); 513 } 514 515 static int 516 mevent_update(struct mevent *evp, bool enable) 517 { 518 int newstate; 519 520 mevent_qlock(); 521 522 /* 523 * It's not possible to enable/disable a deleted event 524 */ 525 assert((evp->me_state & EV_DELETE) == 0); 526 527 newstate = evp->me_state; 528 if (enable) { 529 newstate |= EV_ENABLE; 530 newstate &= ~EV_DISABLE; 531 } else { 532 newstate |= EV_DISABLE; 533 newstate &= ~EV_ENABLE; 534 } 535 536 /* 537 * No update needed if state isn't changing 538 */ 539 if (evp->me_state != newstate) { 540 evp->me_state = newstate; 541 542 /* 543 * Place the entry onto the changed list if not 544 * already there. 545 */ 546 if (evp->me_cq == 0) { 547 evp->me_cq = 1; 548 LIST_REMOVE(evp, me_list); 549 LIST_INSERT_HEAD(&change_head, evp, me_list); 550 mevent_notify(); 551 } 552 } 553 554 mevent_qunlock(); 555 556 return (0); 557 } 558 559 int 560 mevent_enable(struct mevent *evp) 561 { 562 563 return (mevent_update(evp, true)); 564 } 565 566 int 567 mevent_disable(struct mevent *evp) 568 { 569 570 return (mevent_update(evp, false)); 571 } 572 573 static int 574 mevent_delete_event(struct mevent *evp, int closefd) 575 { 576 mevent_qlock(); 577 578 /* 579 * Place the entry onto the changed list if not already there, and 580 * mark as to be deleted. 581 */ 582 if (evp->me_cq == 0) { 583 evp->me_cq = 1; 584 LIST_REMOVE(evp, me_list); 585 LIST_INSERT_HEAD(&change_head, evp, me_list); 586 mevent_notify(); 587 } 588 evp->me_state = EV_DELETE; 589 590 if (closefd) 591 evp->me_closefd = 1; 592 593 mevent_qunlock(); 594 595 return (0); 596 } 597 598 int 599 mevent_delete(struct mevent *evp) 600 { 601 602 return (mevent_delete_event(evp, 0)); 603 } 604 605 int 606 mevent_delete_close(struct mevent *evp) 607 { 608 609 return (mevent_delete_event(evp, 1)); 610 } 611 612 static void 613 mevent_set_name(void) 614 { 615 616 pthread_set_name_np(mevent_tid, "mevent"); 617 } 618 619 void 620 mevent_dispatch(void) 621 { 622 #ifdef __FreeBSD__ 623 struct kevent changelist[MEVENT_MAX]; 624 struct kevent eventlist[MEVENT_MAX]; 625 struct mevent *pipev; 626 int mfd; 627 int numev; 628 #else 629 struct mevent *pipev; 630 int portfd; 631 #endif 632 int ret; 633 #ifndef WITHOUT_CAPSICUM 634 cap_rights_t rights; 635 #endif 636 637 mevent_tid = pthread_self(); 638 mevent_set_name(); 639 640 #ifdef __FreeBSD__ 641 mfd = kqueue(); 642 assert(mfd > 0); 643 #else 644 portfd = port_create(); 645 assert(portfd >= 0); 646 #endif 647 648 #ifndef WITHOUT_CAPSICUM 649 cap_rights_init(&rights, CAP_KQUEUE); 650 if (caph_rights_limit(mfd, &rights) == -1) 651 errx(EX_OSERR, "Unable to apply rights for sandbox"); 652 #endif 653 654 /* 655 * Open the pipe that will be used for other threads to force 656 * the blocking kqueue call to exit by writing to it. Set the 657 * descriptor to non-blocking. 658 */ 659 ret = pipe(mevent_pipefd); 660 if (ret < 0) { 661 perror("pipe"); 662 exit(0); 663 } 664 665 #ifndef WITHOUT_CAPSICUM 666 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); 667 if (caph_rights_limit(mevent_pipefd[0], &rights) == -1) 668 errx(EX_OSERR, "Unable to apply rights for sandbox"); 669 if (caph_rights_limit(mevent_pipefd[1], &rights) == -1) 670 errx(EX_OSERR, "Unable to apply rights for sandbox"); 671 #endif 672 673 /* 674 * Add internal event handler for the pipe write fd 675 */ 676 pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL); 677 assert(pipev != NULL); 678 679 for (;;) { 680 #ifdef __FreeBSD__ 681 /* 682 * Build changelist if required. 683 * XXX the changelist can be put into the blocking call 684 * to eliminate the extra syscall. Currently better for 685 * debug. 686 */ 687 numev = mevent_build(mfd, changelist); 688 if (numev) { 689 ret = kevent(mfd, changelist, numev, NULL, 0, NULL); 690 if (ret == -1) { 691 perror("Error return from kevent change"); 692 } 693 } 694 695 /* 696 * Block awaiting events 697 */ 698 ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); 699 if (ret == -1 && errno != EINTR) { 700 perror("Error return from kevent monitor"); 701 } 702 703 /* 704 * Handle reported events 705 */ 706 mevent_handle(eventlist, ret); 707 708 #else /* __FreeBSD__ */ 709 port_event_t pev; 710 711 /* Handle any pending updates */ 712 mevent_update_pending(portfd); 713 714 /* Block awaiting events */ 715 ret = port_get(portfd, &pev, NULL); 716 if (ret != 0) { 717 if (errno != EINTR) 718 perror("Error return from port_get"); 719 continue; 720 } 721 722 /* Handle reported event */ 723 mevent_handle_pe(&pev); 724 #endif /* __FreeBSD__ */ 725 } 726 } 727